//===- X86ISelDAGToDAG.cpp - A DAG pattern matching inst selector for X86 -===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
//
// This file defines a DAG pattern matching instruction selector for X86,
// converting from a legalized DAG to an X86 DAG.
//
//===----------------------------------------------------------------------===//

#include "X86ISelDAGToDAG.h"
#include "X86.h"
#include "X86MachineFunctionInfo.h"
#include "X86Subtarget.h"
#include "X86TargetMachine.h"
#include "llvm/ADT/Statistic.h"
#include "llvm/CodeGen/MachineModuleInfo.h"
#include "llvm/CodeGen/SelectionDAGISel.h"
#include "llvm/Config/llvm-config.h"
#include "llvm/IR/ConstantRange.h"
#include "llvm/IR/Function.h"
#include "llvm/IR/Instructions.h"
#include "llvm/IR/Intrinsics.h"
#include "llvm/IR/IntrinsicsX86.h"
#include "llvm/IR/Module.h"
#include "llvm/IR/Type.h"
#include "llvm/Support/Debug.h"
#include "llvm/Support/ErrorHandling.h"
#include "llvm/Support/KnownBits.h"
#include "llvm/Support/MathExtras.h"
#include <cstdint>

using namespace llvm;

#define DEBUG_TYPE "x86-isel"
#define PASS_NAME "X86 DAG->DAG Instruction Selection"

STATISTIC(NumLoadMoved, "Number of loads moved below TokenFactor");

static cl::opt<bool> AndImmShrink("x86-and-imm-shrink", cl::init(true),
    cl::desc("Enable setting constant bits to reduce size of mask immediates"),
    cl::Hidden);

static cl::opt<bool> EnablePromoteAnyextLoad(
    "x86-promote-anyext-load", cl::init(true),
    cl::desc("Enable promoting aligned anyext load to wider load"), cl::Hidden);

extern cl::opt<bool> IndirectBranchTracking;

//===----------------------------------------------------------------------===//
// Pattern Matcher Implementation
//===----------------------------------------------------------------------===//

namespace {
/// This corresponds to X86AddressMode, but uses SDValues instead of register
/// numbers for the leaves of the matched tree.
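///
/// For example (an illustrative mapping), the memory operand of
/// "movl 8(%ebx,%ecx,4), %eax" would be matched with Base_Reg = %ebx,
/// IndexReg = %ecx, Scale = 4 and Disp = 8, with the segment and symbolic
/// fields left empty.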
struct X86ISelAddressMode {
  enum {
    RegBase,
    FrameIndexBase
  } BaseType = RegBase;

  // This is really a union, discriminated by BaseType!
  SDValue Base_Reg;
  int Base_FrameIndex = 0;

  unsigned Scale = 1;
  SDValue IndexReg;
  int32_t Disp = 0;
  SDValue Segment;
  const GlobalValue *GV = nullptr;
  const Constant *CP = nullptr;
  const BlockAddress *BlockAddr = nullptr;
  const char *ES = nullptr;
  MCSymbol *MCSym = nullptr;
  int JT = -1;
  Align Alignment; // CP alignment.
  unsigned char SymbolFlags = X86II::MO_NO_FLAG; // X86II::MO_*
  bool NegateIndex = false;

  X86ISelAddressMode() = default;

  bool hasSymbolicDisplacement() const {
    return GV != nullptr || CP != nullptr || ES != nullptr ||
           MCSym != nullptr || JT != -1 || BlockAddr != nullptr;
  }

  bool hasBaseOrIndexReg() const {
    return BaseType == FrameIndexBase ||
           IndexReg.getNode() != nullptr || Base_Reg.getNode() != nullptr;
  }

  /// Return true if this addressing mode is already RIP-relative.
  bool isRIPRelative() const {
    if (BaseType != RegBase) return false;
    if (RegisterSDNode *RegNode =
            dyn_cast_or_null<RegisterSDNode>(Base_Reg.getNode()))
      return RegNode->getReg() == X86::RIP;
    return false;
  }

  void setBaseReg(SDValue Reg) {
    BaseType = RegBase;
    Base_Reg = Reg;
  }

#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
  void dump(SelectionDAG *DAG = nullptr) {
    dbgs() << "X86ISelAddressMode " << this << '\n';
    dbgs() << "Base_Reg ";
    if (Base_Reg.getNode())
      Base_Reg.getNode()->dump(DAG);
    else
      dbgs() << "nul\n";
    if (BaseType == FrameIndexBase)
      dbgs() << " Base.FrameIndex " << Base_FrameIndex << '\n';
    dbgs() << " Scale " << Scale << '\n'
           << "IndexReg ";
    if (NegateIndex)
      dbgs() << "negate ";
    if (IndexReg.getNode())
      IndexReg.getNode()->dump(DAG);
    else
      dbgs() << "nul\n";
    dbgs() << " Disp " << Disp << '\n'
           << "GV ";
    if (GV)
      GV->dump();
    else
      dbgs() << "nul";
    dbgs() << " CP ";
    if (CP)
      CP->dump();
    else
      dbgs() << "nul";
    dbgs() << '\n'
           << "ES ";
    if (ES)
      dbgs() << ES;
    else
      dbgs() << "nul";
    dbgs() << " MCSym ";
    if (MCSym)
      dbgs() << MCSym;
    else
      dbgs() << "nul";
    dbgs() << " JT" << JT << " Align" << Alignment.value() << '\n';
  }
#endif
};
}

namespace {
//===--------------------------------------------------------------------===//
/// ISel - X86-specific code to select X86 machine instructions for
/// SelectionDAG operations.
///
class X86DAGToDAGISel final : public SelectionDAGISel {
  /// Keep a pointer to the X86Subtarget around so that we can
  /// make the right decision when generating code for different targets.
  const X86Subtarget *Subtarget;

  /// If true, selector should try to optimize for minimum code size.
  bool OptForMinSize;

  /// Disable direct TLS access through segment registers.
  bool IndirectTlsSegRefs;

public:
  X86DAGToDAGISel() = delete;

  explicit X86DAGToDAGISel(X86TargetMachine &tm, CodeGenOptLevel OptLevel)
      : SelectionDAGISel(tm, OptLevel), Subtarget(nullptr),
        OptForMinSize(false), IndirectTlsSegRefs(false) {}

  bool runOnMachineFunction(MachineFunction &MF) override {
    // Reset the subtarget each time through.
    Subtarget = &MF.getSubtarget<X86Subtarget>();
    IndirectTlsSegRefs = MF.getFunction().hasFnAttribute(
                             "indirect-tls-seg-refs");

    // OptFor[Min]Size are used in pattern predicates that isel is matching.
    OptForMinSize = MF.getFunction().hasMinSize();
    return SelectionDAGISel::runOnMachineFunction(MF);
  }

  void emitFunctionEntryCode() override;

  bool IsProfitableToFold(SDValue N, SDNode *U, SDNode *Root) const override;

  void PreprocessISelDAG() override;
  void PostprocessISelDAG() override;

  // Include the pieces autogenerated from the target description.
#include "X86GenDAGISel.inc"

private:
  void Select(SDNode *N) override;

  bool foldOffsetIntoAddress(uint64_t Offset, X86ISelAddressMode &AM);
  bool matchLoadInAddress(LoadSDNode *N, X86ISelAddressMode &AM,
                          bool AllowSegmentRegForX32 = false);
  bool matchWrapper(SDValue N, X86ISelAddressMode &AM);
  bool matchAddress(SDValue N, X86ISelAddressMode &AM);
  bool matchVectorAddress(SDValue N, X86ISelAddressMode &AM);
  bool matchAdd(SDValue &N, X86ISelAddressMode &AM, unsigned Depth);
  SDValue matchIndexRecursively(SDValue N, X86ISelAddressMode &AM,
                                unsigned Depth);
  bool matchAddressRecursively(SDValue N, X86ISelAddressMode &AM,
                               unsigned Depth);
  bool matchVectorAddressRecursively(SDValue N, X86ISelAddressMode &AM,
                                     unsigned Depth);
  bool matchAddressBase(SDValue N, X86ISelAddressMode &AM);
  bool selectAddr(SDNode *Parent, SDValue N, SDValue &Base,
                  SDValue &Scale, SDValue &Index, SDValue &Disp,
                  SDValue &Segment);
  bool selectVectorAddr(MemSDNode *Parent, SDValue BasePtr, SDValue IndexOp,
                        SDValue ScaleOp, SDValue &Base, SDValue &Scale,
                        SDValue &Index, SDValue &Disp, SDValue &Segment);
  bool selectMOV64Imm32(SDValue N, SDValue &Imm);
  bool selectLEAAddr(SDValue N, SDValue &Base,
                     SDValue &Scale, SDValue &Index, SDValue &Disp,
                     SDValue &Segment);
  bool selectLEA64_Addr(SDValue N, SDValue &Base, SDValue &Scale,
                        SDValue &Index, SDValue &Disp, SDValue &Segment);
  bool selectTLSADDRAddr(SDValue N, SDValue &Base,
                         SDValue &Scale, SDValue &Index, SDValue &Disp,
                         SDValue &Segment);
  bool selectRelocImm(SDValue N, SDValue &Op);

  bool tryFoldLoad(SDNode *Root, SDNode *P, SDValue N,
                   SDValue &Base, SDValue &Scale,
                   SDValue &Index, SDValue &Disp,
                   SDValue &Segment);

  // Convenience method where P is also root.
  bool tryFoldLoad(SDNode *P, SDValue N,
                   SDValue &Base, SDValue &Scale,
                   SDValue &Index, SDValue &Disp,
                   SDValue &Segment) {
    return tryFoldLoad(P, P, N, Base, Scale, Index, Disp, Segment);
  }

  bool tryFoldBroadcast(SDNode *Root, SDNode *P, SDValue N,
                        SDValue &Base, SDValue &Scale,
                        SDValue &Index, SDValue &Disp,
                        SDValue &Segment);

  bool isProfitableToFormMaskedOp(SDNode *N) const;

  /// Implement addressing mode selection for inline asm expressions.
  bool SelectInlineAsmMemoryOperand(const SDValue &Op,
                                    InlineAsm::ConstraintCode ConstraintID,
                                    std::vector<SDValue> &OutOps) override;

  void emitSpecialCodeForMain();

  inline void getAddressOperands(X86ISelAddressMode &AM, const SDLoc &DL,
                                 MVT VT, SDValue &Base, SDValue &Scale,
                                 SDValue &Index, SDValue &Disp,
                                 SDValue &Segment) {
    if (AM.BaseType == X86ISelAddressMode::FrameIndexBase)
      Base = CurDAG->getTargetFrameIndex(
          AM.Base_FrameIndex, TLI->getPointerTy(CurDAG->getDataLayout()));
    else if (AM.Base_Reg.getNode())
      Base = AM.Base_Reg;
    else
      Base = CurDAG->getRegister(0, VT);

    Scale = getI8Imm(AM.Scale, DL);

#define GET_ND_IF_ENABLED(OPC) (Subtarget->hasNDD() ? OPC##_ND : OPC)
    // Negate the index if needed.
    if (AM.NegateIndex) {
      unsigned NegOpc;
      switch (VT.SimpleTy) {
      default:
        llvm_unreachable("Unsupported VT!");
      case MVT::i64:
        NegOpc = GET_ND_IF_ENABLED(X86::NEG64r);
        break;
      case MVT::i32:
        NegOpc = GET_ND_IF_ENABLED(X86::NEG32r);
        break;
      case MVT::i16:
        NegOpc = GET_ND_IF_ENABLED(X86::NEG16r);
        break;
      case MVT::i8:
        NegOpc = GET_ND_IF_ENABLED(X86::NEG8r);
        break;
      }
      SDValue Neg = SDValue(CurDAG->getMachineNode(NegOpc, DL, VT, MVT::i32,
                                                   AM.IndexReg), 0);
      AM.IndexReg = Neg;
    }

    if (AM.IndexReg.getNode())
      Index = AM.IndexReg;
    else
      Index = CurDAG->getRegister(0, VT);

    // These are 32-bit even in 64-bit mode since RIP-relative offset
    // is 32-bit.
    if (AM.GV)
      Disp = CurDAG->getTargetGlobalAddress(AM.GV, SDLoc(),
                                            MVT::i32, AM.Disp,
                                            AM.SymbolFlags);
    else if (AM.CP)
      Disp = CurDAG->getTargetConstantPool(AM.CP, MVT::i32, AM.Alignment,
                                           AM.Disp, AM.SymbolFlags);
    else if (AM.ES) {
      assert(!AM.Disp && "Non-zero displacement is ignored with ES.");
      Disp = CurDAG->getTargetExternalSymbol(AM.ES, MVT::i32, AM.SymbolFlags);
    } else if (AM.MCSym) {
      assert(!AM.Disp && "Non-zero displacement is ignored with MCSym.");
      assert(AM.SymbolFlags == 0 && "oo");
      Disp = CurDAG->getMCSymbol(AM.MCSym, MVT::i32);
    } else if (AM.JT != -1) {
      assert(!AM.Disp && "Non-zero displacement is ignored with JT.");
      Disp = CurDAG->getTargetJumpTable(AM.JT, MVT::i32, AM.SymbolFlags);
    } else if (AM.BlockAddr)
      Disp = CurDAG->getTargetBlockAddress(AM.BlockAddr, MVT::i32, AM.Disp,
                                           AM.SymbolFlags);
    else
      Disp = CurDAG->getSignedTargetConstant(AM.Disp, DL, MVT::i32);

    if (AM.Segment.getNode())
      Segment = AM.Segment;
    else
      Segment = CurDAG->getRegister(0, MVT::i16);
  }

  // Utility function to determine whether N is an AMX SDNode right after
  // lowering but before ISel.
  bool isAMXSDNode(SDNode *N) const {
    // Check whether N is an AMX SDNode:
    // 1. check specific opcode since these carry MVT::Untyped instead of
    //    x86amx_type;
    // 2. check result type;
    // 3. check operand type;
    switch (N->getOpcode()) {
    default:
      break;
    case X86::PT2RPNTLVWZ0V:
    case X86::PT2RPNTLVWZ0T1V:
    case X86::PT2RPNTLVWZ1V:
    case X86::PT2RPNTLVWZ1T1V:
    case X86::PT2RPNTLVWZ0RSV:
    case X86::PT2RPNTLVWZ0RST1V:
    case X86::PT2RPNTLVWZ1RSV:
    case X86::PT2RPNTLVWZ1RST1V:
      return true;
    }
    for (unsigned Idx = 0, E = N->getNumValues(); Idx != E; ++Idx) {
      if (N->getValueType(Idx) == MVT::x86amx)
        return true;
    }
    for (unsigned Idx = 0, E = N->getNumOperands(); Idx != E; ++Idx) {
      SDValue Op = N->getOperand(Idx);
      if (Op.getValueType() == MVT::x86amx)
        return true;
    }
    return false;
  }

  // Utility function to determine whether we should avoid selecting
  // immediate forms of instructions for better code size.
  // At a high level, we'd like to avoid such instructions when
  // we have similar constants used within the same basic block
  // that can be kept in a register.
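  //
  // For example (illustrative), when optimizing for size, if the same 32-bit
  // constant 0x12345678 feeds two ALU operations in one block, materializing
  // it once (movl $0x12345678, %ecx) and reusing the register is smaller
  // than encoding the 4-byte immediate in both instructions.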
  bool shouldAvoidImmediateInstFormsForSize(SDNode *N) const {
    uint32_t UseCount = 0;

    // Do not want to hoist if we're not optimizing for size.
    // TODO: We'd like to remove this restriction.
    // See the comment in X86InstrInfo.td for more info.
    if (!CurDAG->shouldOptForSize())
      return false;

    // Walk all the users of the immediate.
    for (const SDNode *User : N->users()) {
      if (UseCount >= 2)
        break;

      // This user is already selected. Count it as a legitimate use and
      // move on.
      if (User->isMachineOpcode()) {
        UseCount++;
        continue;
      }

      // We want to count stores of immediates as real uses.
      if (User->getOpcode() == ISD::STORE &&
          User->getOperand(1).getNode() == N) {
        UseCount++;
        continue;
      }

      // We don't currently match users that have > 2 operands (except
      // for stores, which are handled above). Those instructions won't
      // match in ISel, for now, and would be counted incorrectly.
      // This may change in the future as we add additional instruction
      // types.
      if (User->getNumOperands() != 2)
        continue;

      // If this is a sign-extended 8-bit integer immediate used in an ALU
      // instruction, there is probably an opcode encoding to save space.
      auto *C = dyn_cast<ConstantSDNode>(N);
      if (C && isInt<8>(C->getSExtValue()))
        continue;

      // Immediates that are used for offsets as part of stack
      // manipulation should be left alone. These are typically
      // used to indicate SP offsets for argument passing and
      // will get pulled into stores/pushes (implicitly).
      if (User->getOpcode() == X86ISD::ADD ||
          User->getOpcode() == ISD::ADD ||
          User->getOpcode() == X86ISD::SUB ||
          User->getOpcode() == ISD::SUB) {

        // Find the other operand of the add/sub.
        SDValue OtherOp = User->getOperand(0);
        if (OtherOp.getNode() == N)
          OtherOp = User->getOperand(1);

        // Don't count if the other operand is SP.
        RegisterSDNode *RegNode;
        if (OtherOp->getOpcode() == ISD::CopyFromReg &&
            (RegNode = dyn_cast_or_null<RegisterSDNode>(
                 OtherOp->getOperand(1).getNode())))
          if ((RegNode->getReg() == X86::ESP) ||
              (RegNode->getReg() == X86::RSP))
            continue;
      }

      // ... otherwise, count this and move on.
      UseCount++;
    }

    // If we have more than one use, then recommend for hoisting.
    return (UseCount > 1);
  }

  /// Return a target constant with the specified value of type i8.
  inline SDValue getI8Imm(unsigned Imm, const SDLoc &DL) {
    return CurDAG->getTargetConstant(Imm, DL, MVT::i8);
  }

  /// Return a target constant with the specified value, of type i32.
  inline SDValue getI32Imm(unsigned Imm, const SDLoc &DL) {
    return CurDAG->getTargetConstant(Imm, DL, MVT::i32);
  }

  /// Return a target constant with the specified value, of type i64.
  inline SDValue getI64Imm(uint64_t Imm, const SDLoc &DL) {
    return CurDAG->getTargetConstant(Imm, DL, MVT::i64);
  }

  SDValue getExtractVEXTRACTImmediate(SDNode *N, unsigned VecWidth,
                                      const SDLoc &DL) {
    assert((VecWidth == 128 || VecWidth == 256) && "Unexpected vector width");
    uint64_t Index = N->getConstantOperandVal(1);
    MVT VecVT = N->getOperand(0).getSimpleValueType();
    return getI8Imm((Index * VecVT.getScalarSizeInBits()) / VecWidth, DL);
  }
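  // Example for the helper above (illustrative): extracting elements [4,8)
  // of a v8i32 source with a 128-bit VEXTRACT yields (4 * 32) / 128 == 1,
  // i.e. the upper 128-bit lane.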

  SDValue getInsertVINSERTImmediate(SDNode *N, unsigned VecWidth,
                                    const SDLoc &DL) {
    assert((VecWidth == 128 || VecWidth == 256) && "Unexpected vector width");
    uint64_t Index = N->getConstantOperandVal(2);
    MVT VecVT = N->getSimpleValueType(0);
    return getI8Imm((Index * VecVT.getScalarSizeInBits()) / VecWidth, DL);
  }

  SDValue getPermuteVINSERTCommutedImmediate(SDNode *N, unsigned VecWidth,
                                             const SDLoc &DL) {
    assert(VecWidth == 128 && "Unexpected vector width");
    uint64_t Index = N->getConstantOperandVal(2);
    MVT VecVT = N->getSimpleValueType(0);
    uint64_t InsertIdx = (Index * VecVT.getScalarSizeInBits()) / VecWidth;
    assert((InsertIdx == 0 || InsertIdx == 1) && "Bad insertf128 index");
    // vinsert(0,sub,vec) -> [sub0][vec1] -> vperm2x128(0x30,vec,sub)
    // vinsert(1,sub,vec) -> [vec0][sub0] -> vperm2x128(0x02,vec,sub)
    return getI8Imm(InsertIdx ? 0x02 : 0x30, DL);
  }

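  // Materialize the borrow out of EFLAGS into a GPR: the "sbb r, r" emitted
  // below computes r = 0 - 0 - CF, i.e. all-ones if the carry flag is set
  // and zero otherwise (a common x86 idiom for widening a carry).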
  SDValue getSBBZero(SDNode *N) {
    SDLoc dl(N);
    MVT VT = N->getSimpleValueType(0);

    // Create zero.
    SDVTList VTs = CurDAG->getVTList(MVT::i32, MVT::i32);
    SDValue Zero =
        SDValue(CurDAG->getMachineNode(X86::MOV32r0, dl, VTs, {}), 0);
    if (VT == MVT::i64) {
      Zero = SDValue(
          CurDAG->getMachineNode(
              TargetOpcode::SUBREG_TO_REG, dl, MVT::i64,
              CurDAG->getTargetConstant(0, dl, MVT::i64), Zero,
              CurDAG->getTargetConstant(X86::sub_32bit, dl, MVT::i32)),
          0);
    }

    // Copy flags to the EFLAGS register and glue it to next node.
    unsigned Opcode = N->getOpcode();
    assert((Opcode == X86ISD::SBB || Opcode == X86ISD::SETCC_CARRY) &&
           "Unexpected opcode for SBB materialization");
    unsigned FlagOpIndex = Opcode == X86ISD::SBB ? 2 : 1;
    SDValue EFLAGS =
        CurDAG->getCopyToReg(CurDAG->getEntryNode(), dl, X86::EFLAGS,
                             N->getOperand(FlagOpIndex), SDValue());

    // Create a 64-bit instruction if the result is 64 bits, otherwise use
    // the 32-bit version.
    unsigned Opc = VT == MVT::i64 ? X86::SBB64rr : X86::SBB32rr;
    MVT SBBVT = VT == MVT::i64 ? MVT::i64 : MVT::i32;
    VTs = CurDAG->getVTList(SBBVT, MVT::i32);
    return SDValue(
        CurDAG->getMachineNode(Opc, dl, VTs,
                               {Zero, Zero, EFLAGS, EFLAGS.getValue(1)}),
        0);
  }

  // Helper to detect unneeded and instructions on shift amounts. Called
  // from PatFrags in tablegen.
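  //
  // For example (illustrative): x86 32-bit shifts only use the low 5 bits
  // of the amount, so in (shl X, (and Amt, 31)) the AND is unneeded and
  // (shl X, Amt) computes the same value.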
  bool isUnneededShiftMask(SDNode *N, unsigned Width) const {
    assert(N->getOpcode() == ISD::AND && "Unexpected opcode");
    const APInt &Val = N->getConstantOperandAPInt(1);

    if (Val.countr_one() >= Width)
      return true;

    APInt Mask = Val | CurDAG->computeKnownBits(N->getOperand(0)).Zero;
    return Mask.countr_one() >= Width;
  }

  /// Return an SDNode that returns the value of the global base register.
  /// Output instructions required to initialize the global base register,
  /// if necessary.
  SDNode *getGlobalBaseReg();

  /// Return a reference to the TargetMachine, casted to the target-specific
  /// type.
  const X86TargetMachine &getTargetMachine() const {
    return static_cast<const X86TargetMachine &>(TM);
  }

  /// Return a reference to the TargetInstrInfo, casted to the target-specific
  /// type.
  const X86InstrInfo *getInstrInfo() const {
    return Subtarget->getInstrInfo();
  }

  /// Return the condition code of the given SDNode.
  X86::CondCode getCondFromNode(SDNode *N) const;

  /// Address-mode matching performs shift-of-and to and-of-shift
  /// reassociation in order to expose more scaled addressing
  /// opportunities.
  bool ComplexPatternFuncMutatesDAG() const override {
    return true;
  }

  bool isSExtAbsoluteSymbolRef(unsigned Width, SDNode *N) const;

  // Indicates we should prefer to use a non-temporal load for this load.
  bool useNonTemporalLoad(LoadSDNode *N) const {
    if (!N->isNonTemporal())
      return false;

    unsigned StoreSize = N->getMemoryVT().getStoreSize();

    if (N->getAlign().value() < StoreSize)
      return false;

    switch (StoreSize) {
    default: llvm_unreachable("Unsupported store size");
    case 4:
    case 8:
      return false;
    case 16:
      return Subtarget->hasSSE41();
    case 32:
      return Subtarget->hasAVX2();
    case 64:
      return Subtarget->hasAVX512();
    }
  }

  bool foldLoadStoreIntoMemOperand(SDNode *Node);
  MachineSDNode *matchBEXTRFromAndImm(SDNode *Node);
  bool matchBitExtract(SDNode *Node);
  bool shrinkAndImmediate(SDNode *N);
  bool isMaskZeroExtended(SDNode *N) const;
  bool tryShiftAmountMod(SDNode *N);
  bool tryShrinkShlLogicImm(SDNode *N);
  bool tryVPTERNLOG(SDNode *N);
  bool matchVPTERNLOG(SDNode *Root, SDNode *ParentA, SDNode *ParentB,
                      SDNode *ParentC, SDValue A, SDValue B, SDValue C,
                      uint8_t Imm);
  bool tryVPTESTM(SDNode *Root, SDValue Setcc, SDValue Mask);
  bool tryMatchBitSelect(SDNode *N);

  MachineSDNode *emitPCMPISTR(unsigned ROpc, unsigned MOpc, bool MayFoldLoad,
                              const SDLoc &dl, MVT VT, SDNode *Node);
  MachineSDNode *emitPCMPESTR(unsigned ROpc, unsigned MOpc, bool MayFoldLoad,
                              const SDLoc &dl, MVT VT, SDNode *Node,
                              SDValue &InGlue);

  bool tryOptimizeRem8Extend(SDNode *N);

  bool onlyUsesZeroFlag(SDValue Flags) const;
  bool hasNoSignFlagUses(SDValue Flags) const;
  bool hasNoCarryFlagUses(SDValue Flags) const;
};

class X86DAGToDAGISelLegacy : public SelectionDAGISelLegacy {
public:
  static char ID;
  explicit X86DAGToDAGISelLegacy(X86TargetMachine &tm,
                                 CodeGenOptLevel OptLevel)
      : SelectionDAGISelLegacy(
            ID, std::make_unique<X86DAGToDAGISel>(tm, OptLevel)) {}
};
}

char X86DAGToDAGISelLegacy::ID = 0;

INITIALIZE_PASS(X86DAGToDAGISelLegacy, DEBUG_TYPE, PASS_NAME, false, false)

// Returns true if this masked compare can be implemented legally with this
// type.
static bool isLegalMaskCompare(SDNode *N, const X86Subtarget *Subtarget) {
  unsigned Opcode = N->getOpcode();
  if (Opcode == X86ISD::CMPM || Opcode == X86ISD::CMPMM ||
      Opcode == X86ISD::STRICT_CMPM || Opcode == ISD::SETCC ||
      Opcode == X86ISD::CMPMM_SAE || Opcode == X86ISD::VFPCLASS) {
    // We can get 256-bit 8 element types here without VLX being enabled. When
    // this happens we will use 512-bit operations and the mask will not be
    // zero extended.
    EVT OpVT = N->getOperand(0).getValueType();
    // The first operand of X86ISD::STRICT_CMPM is chain, so we need to get the
    // second operand.
    if (Opcode == X86ISD::STRICT_CMPM)
      OpVT = N->getOperand(1).getValueType();
    if (OpVT.is256BitVector() || OpVT.is128BitVector())
      return Subtarget->hasVLX();

    return true;
  }
  // Scalar opcodes use 128 bit registers, but aren't subject to the VLX check.
  if (Opcode == X86ISD::VFPCLASSS || Opcode == X86ISD::FSETCCM ||
      Opcode == X86ISD::FSETCCM_SAE)
    return true;

  return false;
}

// Returns true if we can assume the writer of the mask has zero extended it
// for us.
bool X86DAGToDAGISel::isMaskZeroExtended(SDNode *N) const {
  // If this is an AND, check if we have a compare on either side. As long as
  // one side guarantees the mask is zero extended, the AND will preserve those
  // zeros.
  if (N->getOpcode() == ISD::AND)
    return isLegalMaskCompare(N->getOperand(0).getNode(), Subtarget) ||
           isLegalMaskCompare(N->getOperand(1).getNode(), Subtarget);

  return isLegalMaskCompare(N, Subtarget);
}

bool
X86DAGToDAGISel::IsProfitableToFold(SDValue N, SDNode *U, SDNode *Root) const {
  if (OptLevel == CodeGenOptLevel::None)
    return false;

  if (!N.hasOneUse())
    return false;

  if (N.getOpcode() != ISD::LOAD)
    return true;

  // Don't fold non-temporal loads if we have an instruction for them.
  if (useNonTemporalLoad(cast<LoadSDNode>(N)))
    return false;

  // If N is a load, do additional profitability checks.
  if (U == Root) {
    switch (U->getOpcode()) {
    default: break;
    case X86ISD::ADD:
    case X86ISD::ADC:
    case X86ISD::SUB:
    case X86ISD::SBB:
    case X86ISD::AND:
    case X86ISD::XOR:
    case X86ISD::OR:
    case ISD::ADD:
    case ISD::UADDO_CARRY:
    case ISD::AND:
    case ISD::OR:
    case ISD::XOR: {
      SDValue Op1 = U->getOperand(1);

      // If the other operand is an 8-bit immediate we should fold the
      // immediate instead. This reduces code size.
      // e.g.
      // movl 4(%esp), %eax
      // addl $4, %eax
      // vs.
      // movl $4, %eax
      // addl 4(%esp), %eax
      // The former is 2 bytes shorter. In the case where the increment is 1,
      // the saving can be 4 bytes (by using incl %eax).
      if (auto *Imm = dyn_cast<ConstantSDNode>(Op1)) {
        if (Imm->getAPIntValue().isSignedIntN(8))
          return false;

        // If this is a 64-bit AND with an immediate that fits in 32-bits,
        // prefer using the smaller and over folding the load. This is needed
        // to make sure immediates created by shrinkAndImmediate are always
        // folded. Ideally we would narrow the load during DAG combine and get
        // the best of both worlds.
        if (U->getOpcode() == ISD::AND &&
            Imm->getAPIntValue().getBitWidth() == 64 &&
            Imm->getAPIntValue().isIntN(32))
          return false;

        // If this is really a zext_inreg that can be represented with a movzx
        // instruction, prefer that.
        // TODO: We could shrink the load and fold if it is non-volatile.
        if (U->getOpcode() == ISD::AND &&
            (Imm->getAPIntValue() == UINT8_MAX ||
             Imm->getAPIntValue() == UINT16_MAX ||
             Imm->getAPIntValue() == UINT32_MAX))
          return false;

        // ADD/SUB can negate the immediate and use the opposite operation
        // to fit 128 into a sign-extended 8-bit immediate.
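        // For example, "addl $128, %eax" needs a 4-byte immediate, while the
        // equivalent "subl $-128, %eax" fits -128 in a 1-byte immediate.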
        if ((U->getOpcode() == ISD::ADD || U->getOpcode() == ISD::SUB) &&
            (-Imm->getAPIntValue()).isSignedIntN(8))
          return false;

        if ((U->getOpcode() == X86ISD::ADD || U->getOpcode() == X86ISD::SUB) &&
            (-Imm->getAPIntValue()).isSignedIntN(8) &&
            hasNoCarryFlagUses(SDValue(U, 1)))
          return false;
      }

      // If the other operand is a TLS address, we should fold it instead.
      // This produces
      // movl %gs:0, %eax
      // leal i@NTPOFF(%eax), %eax
      // instead of
      // movl $i@NTPOFF, %eax
      // addl %gs:0, %eax
      // if the block also has an access to a second TLS address this will save
      // a load.
      // FIXME: This is probably also true for non-TLS addresses.
      if (Op1.getOpcode() == X86ISD::Wrapper) {
        SDValue Val = Op1.getOperand(0);
        if (Val.getOpcode() == ISD::TargetGlobalTLSAddress)
          return false;
      }

      // Don't fold load if this matches the BTS/BTR/BTC patterns.
      // BTS: (or X, (shl 1, n))
      // BTR: (and X, (rotl -2, n))
      // BTC: (xor X, (shl 1, n))
      if (U->getOpcode() == ISD::OR || U->getOpcode() == ISD::XOR) {
        if (U->getOperand(0).getOpcode() == ISD::SHL &&
            isOneConstant(U->getOperand(0).getOperand(0)))
          return false;

        if (U->getOperand(1).getOpcode() == ISD::SHL &&
            isOneConstant(U->getOperand(1).getOperand(0)))
          return false;
      }
      if (U->getOpcode() == ISD::AND) {
        SDValue U0 = U->getOperand(0);
        SDValue U1 = U->getOperand(1);
        if (U0.getOpcode() == ISD::ROTL) {
          auto *C = dyn_cast<ConstantSDNode>(U0.getOperand(0));
          if (C && C->getSExtValue() == -2)
            return false;
        }

        if (U1.getOpcode() == ISD::ROTL) {
          auto *C = dyn_cast<ConstantSDNode>(U1.getOperand(0));
          if (C && C->getSExtValue() == -2)
            return false;
        }
      }

      break;
    }
    case ISD::SHL:
    case ISD::SRA:
    case ISD::SRL:
      // Don't fold a load into a shift by immediate. The BMI2 instructions
      // support folding a load, but not an immediate. The legacy instructions
      // support folding an immediate, but can't fold a load. Folding an
      // immediate is preferable to folding a load.
      if (isa<ConstantSDNode>(U->getOperand(1)))
        return false;

      break;
    }
  }

  // Prevent folding a load if this can be implemented with an insert_subreg or
  // a move that implicitly zeroes.
  if (Root->getOpcode() == ISD::INSERT_SUBVECTOR &&
      isNullConstant(Root->getOperand(2)) &&
      (Root->getOperand(0).isUndef() ||
       ISD::isBuildVectorAllZeros(Root->getOperand(0).getNode())))
    return false;

  return true;
}

// Indicates it is profitable to form an AVX512 masked operation. Returning
// false will favor a register-register masked move or vblendm and the
// operation will be selected separately.
bool X86DAGToDAGISel::isProfitableToFormMaskedOp(SDNode *N) const {
  assert(
      (N->getOpcode() == ISD::VSELECT || N->getOpcode() == X86ISD::SELECTS) &&
      "Unexpected opcode!");

  // If the operation has additional users, the operation will be duplicated.
  // Check the use count to prevent that.
  // FIXME: Are there cheap opcodes we might want to duplicate?
  return N->getOperand(1).hasOneUse();
}
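// For example (illustrative), (vselect M, (add X, Y), X) can be selected as
// a single masked add; but if the add has another user, forming the masked
// operation would force the add to be emitted twice, so a separate blend is
// preferred instead.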

/// Replace the original chain operand of the call with the load's chain
/// operand and move the load below the call's chain operand.
static void moveBelowOrigChain(SelectionDAG *CurDAG, SDValue Load,
                               SDValue Call, SDValue OrigChain) {
  SmallVector<SDValue, 8> Ops;
  SDValue Chain = OrigChain.getOperand(0);
  if (Chain.getNode() == Load.getNode())
    Ops.push_back(Load.getOperand(0));
  else {
    assert(Chain.getOpcode() == ISD::TokenFactor &&
           "Unexpected chain operand");
    for (unsigned i = 0, e = Chain.getNumOperands(); i != e; ++i)
      if (Chain.getOperand(i).getNode() == Load.getNode())
        Ops.push_back(Load.getOperand(0));
      else
        Ops.push_back(Chain.getOperand(i));
    SDValue NewChain =
        CurDAG->getNode(ISD::TokenFactor, SDLoc(Load), MVT::Other, Ops);
    Ops.clear();
    Ops.push_back(NewChain);
  }
  Ops.append(OrigChain->op_begin() + 1, OrigChain->op_end());
  CurDAG->UpdateNodeOperands(OrigChain.getNode(), Ops);
  CurDAG->UpdateNodeOperands(Load.getNode(), Call.getOperand(0),
                             Load.getOperand(1), Load.getOperand(2));

  Ops.clear();
  Ops.push_back(SDValue(Load.getNode(), 1));
  Ops.append(Call->op_begin() + 1, Call->op_end());
  CurDAG->UpdateNodeOperands(Call.getNode(), Ops);
}

/// Return true if the call address is a load and it can be
/// moved below CALLSEQ_START and the chains leading up to the call.
/// Return the CALLSEQ_START by reference as a second output.
/// In the case of a tail call, there isn't a callseq node between the call
/// chain and the load.
static bool isCalleeLoad(SDValue Callee, SDValue &Chain, bool HasCallSeq) {
  // The transformation is somewhat dangerous if the call's chain was glued to
  // the call. After MoveBelowOrigChain the load is moved between the call and
  // the chain, this can create a cycle if the load is not folded. So it is
  // *really* important that we are sure the load will be folded.
  if (Callee.getNode() == Chain.getNode() || !Callee.hasOneUse())
    return false;
  auto *LD = dyn_cast<LoadSDNode>(Callee.getNode());
  if (!LD ||
      !LD->isSimple() ||
      LD->getAddressingMode() != ISD::UNINDEXED ||
      LD->getExtensionType() != ISD::NON_EXTLOAD)
    return false;

  // Now let's find the callseq_start.
  while (HasCallSeq && Chain.getOpcode() != ISD::CALLSEQ_START) {
    if (!Chain.hasOneUse())
      return false;
    Chain = Chain.getOperand(0);
  }

  if (!Chain.getNumOperands())
    return false;
  // Since we are not checking for AA here, conservatively abort if the chain
  // writes to memory. It's not safe to move the callee (a load) across a
  // store.
  if (isa<MemSDNode>(Chain.getNode()) &&
      cast<MemSDNode>(Chain.getNode())->writeMem())
    return false;
  if (Chain.getOperand(0).getNode() == Callee.getNode())
    return true;
  if (Chain.getOperand(0).getOpcode() == ISD::TokenFactor &&
      Callee.getValue(1).isOperandOf(Chain.getOperand(0).getNode()) &&
      Callee.getValue(1).hasOneUse())
    return true;
  return false;
}

static bool isEndbrImm64(uint64_t Imm) {
  // There may be some other prefix bytes between 0xF3 and 0x0F1EFA,
  // e.g. 0xF3660F1EFA, 0xF3670F1EFA.
  if ((Imm & 0x00FFFFFF) != 0x0F1EFA)
    return false;

  uint8_t OptionalPrefixBytes[] = {0x26, 0x2e, 0x36, 0x3e, 0x64,
                                   0x65, 0x66, 0x67, 0xf0, 0xf2};
  int i = 24; // The low 24 bits (0x0F1EFA) have already matched.
  while (i < 64) {
    uint8_t Byte = (Imm >> i) & 0xFF;
    if (Byte == 0xF3)
      return true;
    if (!llvm::is_contained(OptionalPrefixBytes, Byte))
      return false;
    i += 8;
  }

  return false;
}
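// For example, isEndbrImm64(0xF3660F1EFA) returns true: the low 24 bits
// match 0x0F1EFA, the next byte 0x66 is an allowed optional prefix, and the
// byte after that is the required 0xF3.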

static bool needBWI(MVT VT) {
  return (VT == MVT::v32i16 || VT == MVT::v32f16 || VT == MVT::v64i8);
}

void X86DAGToDAGISel::PreprocessISelDAG() {
  bool MadeChange = false;
  for (SelectionDAG::allnodes_iterator I = CurDAG->allnodes_begin(),
       E = CurDAG->allnodes_end(); I != E; ) {
    SDNode *N = &*I++; // Preincrement iterator to avoid invalidation issues.

    // This is for CET enhancement.
    //
    // ENDBR32 and ENDBR64 have specific opcodes:
    // ENDBR32: F3 0F 1E FB
    // ENDBR64: F3 0F 1E FA
    // And we don't want attackers to find unintended ENDBR32/64
    // opcode matches in the binary.
    // Here's an example:
    // If the compiler had to generate asm for the following code:
    // a = 0xF30F1EFA
    // it could, for example, generate:
    // mov 0xF30F1EFA, dword ptr[a]
    // In such a case, the binary would include a gadget that starts
    // with a fake ENDBR64 opcode. Therefore, we split such generation
    // into multiple operations so that the opcode does not appear in
    // the binary.
    if (N->getOpcode() == ISD::Constant) {
      MVT VT = N->getSimpleValueType(0);
      int64_t Imm = cast<ConstantSDNode>(N)->getSExtValue();
      int32_t EndbrImm = Subtarget->is64Bit() ? 0xF30F1EFA : 0xF30F1EFB;
      if (Imm == EndbrImm || isEndbrImm64(Imm)) {
        // Check that the cf-protection-branch is enabled.
        Metadata *CFProtectionBranch =
            MF->getFunction().getParent()->getModuleFlag(
                "cf-protection-branch");
        if (CFProtectionBranch || IndirectBranchTracking) {
          SDLoc dl(N);
          SDValue Complement = CurDAG->getConstant(~Imm, dl, VT, false, true);
          Complement = CurDAG->getNOT(dl, Complement, VT);
          --I;
          CurDAG->ReplaceAllUsesOfValueWith(SDValue(N, 0), Complement);
          ++I;
          MadeChange = true;
          continue;
        }
      }
    }

    // If this is a target specific AND node with no flag usages, turn it back
    // into ISD::AND to enable test instruction matching.
    if (N->getOpcode() == X86ISD::AND && !N->hasAnyUseOfValue(1)) {
      SDValue Res = CurDAG->getNode(ISD::AND, SDLoc(N), N->getValueType(0),
                                    N->getOperand(0), N->getOperand(1));
      --I;
      CurDAG->ReplaceAllUsesOfValueWith(SDValue(N, 0), Res);
      ++I;
      MadeChange = true;
      continue;
    }

    // Convert vector increment or decrement to sub/add with an all-ones
    // constant:
    // add X, <1, 1...> --> sub X, <-1, -1...>
    // sub X, <1, 1...> --> add X, <-1, -1...>
    // The all-ones vector constant can be materialized using a pcmpeq
    // instruction that is commonly recognized as an idiom (has no register
    // dependency), so that's better/smaller than loading a splat 1 constant.
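    // For example (AVX syntax, illustrative): X + <1,1,1,1> can be emitted
    // as "vpcmpeqd %xmm1, %xmm1, %xmm1" (all-ones) followed by
    // "vpsubd %xmm1, %xmm0, %xmm0".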
| 1003 | // |
| 1004 | // But don't do this if it would inhibit a potentially profitable load |
| 1005 | // folding opportunity for the other operand. That only occurs with the |
| 1006 | // intersection of: |
| 1007 | // (1) The other operand (op0) is load foldable. |
| 1008 | // (2) The op is an add (otherwise, we are *creating* an add and can still |
| 1009 | // load fold the other op). |
| 1010 | // (3) The target has AVX (otherwise, we have a destructive add and can't |
| 1011 | // load fold the other op without killing the constant op). |
| 1012 | // (4) The constant 1 vector has multiple uses (so it is profitable to load |
| 1013 | // into a register anyway). |
| 1014 | auto mayPreventLoadFold = [&]() { |
| 1015 | return X86::mayFoldLoad(Op: N->getOperand(Num: 0), Subtarget: *Subtarget) && |
| 1016 | N->getOpcode() == ISD::ADD && Subtarget->hasAVX() && |
| 1017 | !N->getOperand(Num: 1).hasOneUse(); |
| 1018 | }; |
| 1019 | if ((N->getOpcode() == ISD::ADD || N->getOpcode() == ISD::SUB) && |
| 1020 | N->getSimpleValueType(ResNo: 0).isVector() && !mayPreventLoadFold()) { |
| 1021 | APInt SplatVal; |
| 1022 | if (X86::isConstantSplat(Op: N->getOperand(Num: 1), SplatVal) && |
| 1023 | SplatVal.isOne()) { |
| 1024 | SDLoc DL(N); |
| 1025 | |
| 1026 | MVT VT = N->getSimpleValueType(ResNo: 0); |
| 1027 | unsigned NumElts = VT.getSizeInBits() / 32; |
| 1028 | SDValue AllOnes = |
| 1029 | CurDAG->getAllOnesConstant(DL, VT: MVT::getVectorVT(VT: MVT::i32, NumElements: NumElts)); |
| 1030 | AllOnes = CurDAG->getBitcast(VT, V: AllOnes); |
| 1031 | |
| 1032 | unsigned NewOpcode = N->getOpcode() == ISD::ADD ? ISD::SUB : ISD::ADD; |
| 1033 | SDValue Res = |
| 1034 | CurDAG->getNode(Opcode: NewOpcode, DL, VT, N1: N->getOperand(Num: 0), N2: AllOnes); |
| 1035 | --I; |
| 1036 | CurDAG->ReplaceAllUsesWith(From: N, To: Res.getNode()); |
| 1037 | ++I; |
| 1038 | MadeChange = true; |
| 1039 | continue; |
| 1040 | } |
| 1041 | } |
| 1042 | |
| 1043 | switch (N->getOpcode()) { |
| 1044 | case X86ISD::VBROADCAST: { |
| 1045 | MVT VT = N->getSimpleValueType(ResNo: 0); |
| 1046 | // Emulate v32i16/v64i8 broadcast without BWI. |
| 1047 | if (!Subtarget->hasBWI() && needBWI(VT)) { |
| 1048 | MVT NarrowVT = VT.getHalfNumVectorElementsVT(); |
| 1049 | SDLoc dl(N); |
| 1050 | SDValue NarrowBCast = |
| 1051 | CurDAG->getNode(Opcode: X86ISD::VBROADCAST, DL: dl, VT: NarrowVT, Operand: N->getOperand(Num: 0)); |
| 1052 | SDValue Res = |
| 1053 | CurDAG->getNode(Opcode: ISD::INSERT_SUBVECTOR, DL: dl, VT, N1: CurDAG->getUNDEF(VT), |
| 1054 | N2: NarrowBCast, N3: CurDAG->getIntPtrConstant(Val: 0, DL: dl)); |
| 1055 | unsigned Index = NarrowVT.getVectorMinNumElements(); |
| 1056 | Res = CurDAG->getNode(Opcode: ISD::INSERT_SUBVECTOR, DL: dl, VT, N1: Res, N2: NarrowBCast, |
| 1057 | N3: CurDAG->getIntPtrConstant(Val: Index, DL: dl)); |
| 1058 | |
| 1059 | --I; |
| 1060 | CurDAG->ReplaceAllUsesWith(From: N, To: Res.getNode()); |
| 1061 | ++I; |
| 1062 | MadeChange = true; |
| 1063 | continue; |
| 1064 | } |
| 1065 | |
| 1066 | break; |
| 1067 | } |
| 1068 | case X86ISD::VBROADCAST_LOAD: { |
| 1069 | MVT VT = N->getSimpleValueType(ResNo: 0); |
| 1070 | // Emulate v32i16/v64i8 broadcast without BWI. |
| 1071 | if (!Subtarget->hasBWI() && needBWI(VT)) { |
| 1072 | MVT NarrowVT = VT.getHalfNumVectorElementsVT(); |
| 1073 | auto *MemNode = cast<MemSDNode>(Val: N); |
| 1074 | SDLoc dl(N); |
| 1075 | SDVTList VTs = CurDAG->getVTList(VT1: NarrowVT, VT2: MVT::Other); |
| 1076 | SDValue Ops[] = {MemNode->getChain(), MemNode->getBasePtr()}; |
| 1077 | SDValue NarrowBCast = CurDAG->getMemIntrinsicNode( |
| 1078 | Opcode: X86ISD::VBROADCAST_LOAD, dl, VTList: VTs, Ops, MemVT: MemNode->getMemoryVT(), |
| 1079 | MMO: MemNode->getMemOperand()); |
| 1080 | SDValue Res = |
| 1081 | CurDAG->getNode(Opcode: ISD::INSERT_SUBVECTOR, DL: dl, VT, N1: CurDAG->getUNDEF(VT), |
| 1082 | N2: NarrowBCast, N3: CurDAG->getIntPtrConstant(Val: 0, DL: dl)); |
| 1083 | unsigned Index = NarrowVT.getVectorMinNumElements(); |
| 1084 | Res = CurDAG->getNode(Opcode: ISD::INSERT_SUBVECTOR, DL: dl, VT, N1: Res, N2: NarrowBCast, |
| 1085 | N3: CurDAG->getIntPtrConstant(Val: Index, DL: dl)); |
| 1086 | |
| 1087 | --I; |
| 1088 | SDValue To[] = {Res, NarrowBCast.getValue(R: 1)}; |
| 1089 | CurDAG->ReplaceAllUsesWith(From: N, To); |
| 1090 | ++I; |
| 1091 | MadeChange = true; |
| 1092 | continue; |
| 1093 | } |
| 1094 | |
| 1095 | break; |
| 1096 | } |
| 1097 | case ISD::LOAD: { |
| 1098 | // If this is a XMM/YMM load of the same lower bits as another YMM/ZMM |
| 1099 | // load, then just extract the lower subvector and avoid the second load. |
| 1100 | auto *Ld = cast<LoadSDNode>(Val: N); |
| 1101 | MVT VT = N->getSimpleValueType(ResNo: 0); |
| 1102 | if (!ISD::isNormalLoad(N: Ld) || !Ld->isSimple() || |
| 1103 | !(VT.is128BitVector() || VT.is256BitVector())) |
| 1104 | break; |
| 1105 | |
| 1106 | MVT MaxVT = VT; |
| 1107 | SDNode *MaxLd = nullptr; |
| 1108 | SDValue Ptr = Ld->getBasePtr(); |
| 1109 | SDValue Chain = Ld->getChain(); |
| 1110 | for (SDNode *User : Ptr->users()) { |
| 1111 | auto *UserLd = dyn_cast<LoadSDNode>(Val: User); |
| 1112 | MVT UserVT = User->getSimpleValueType(ResNo: 0); |
| 1113 | if (User != N && UserLd && ISD::isNormalLoad(N: User) && |
| 1114 | UserLd->getBasePtr() == Ptr && UserLd->getChain() == Chain && |
| 1115 | !User->hasAnyUseOfValue(Value: 1) && |
| 1116 | (UserVT.is256BitVector() || UserVT.is512BitVector()) && |
| 1117 | UserVT.getSizeInBits() > VT.getSizeInBits() && |
| 1118 | (!MaxLd || UserVT.getSizeInBits() > MaxVT.getSizeInBits())) { |
| 1119 | MaxLd = User; |
| 1120 | MaxVT = UserVT; |
| 1121 | } |
| 1122 | } |
| 1123 | if (MaxLd) { |
| 1124 | SDLoc dl(N); |
| 1125 | unsigned NumSubElts = VT.getSizeInBits() / MaxVT.getScalarSizeInBits(); |
| 1126 | MVT SubVT = MVT::getVectorVT(VT: MaxVT.getScalarType(), NumElements: NumSubElts); |
| 1127 | SDValue = CurDAG->getNode(Opcode: ISD::EXTRACT_SUBVECTOR, DL: dl, VT: SubVT, |
| 1128 | N1: SDValue(MaxLd, 0), |
| 1129 | N2: CurDAG->getIntPtrConstant(Val: 0, DL: dl)); |
| 1130 | SDValue Res = CurDAG->getBitcast(VT, V: Extract); |
| 1131 | |
| 1132 | --I; |
| 1133 | SDValue To[] = {Res, SDValue(MaxLd, 1)}; |
| 1134 | CurDAG->ReplaceAllUsesWith(From: N, To); |
| 1135 | ++I; |
| 1136 | MadeChange = true; |
| 1137 | continue; |
| 1138 | } |
| 1139 | break; |
| 1140 | } |
| 1141 | case ISD::VSELECT: { |
| 1142 | // Replace VSELECT with non-mask conditions with with BLENDV/VPTERNLOG. |
| 1143 | EVT EleVT = N->getOperand(Num: 0).getValueType().getVectorElementType(); |
| 1144 | if (EleVT == MVT::i1) |
| 1145 | break; |
| 1146 | |
| 1147 | assert(Subtarget->hasSSE41() && "Expected SSE4.1 support!" ); |
| 1148 | assert(N->getValueType(0).getVectorElementType() != MVT::i16 && |
| 1149 | "We can't replace VSELECT with BLENDV in vXi16!" ); |
| 1150 | SDValue R; |
| 1151 | if (Subtarget->hasVLX() && CurDAG->ComputeNumSignBits(Op: N->getOperand(Num: 0)) == |
| 1152 | EleVT.getSizeInBits()) { |
| 1153 | R = CurDAG->getNode(Opcode: X86ISD::VPTERNLOG, DL: SDLoc(N), VT: N->getValueType(ResNo: 0), |
| 1154 | N1: N->getOperand(Num: 0), N2: N->getOperand(Num: 1), N3: N->getOperand(Num: 2), |
| 1155 | N4: CurDAG->getTargetConstant(Val: 0xCA, DL: SDLoc(N), VT: MVT::i8)); |
| 1156 | } else { |
| 1157 | R = CurDAG->getNode(Opcode: X86ISD::BLENDV, DL: SDLoc(N), VT: N->getValueType(ResNo: 0), |
| 1158 | N1: N->getOperand(Num: 0), N2: N->getOperand(Num: 1), |
| 1159 | N3: N->getOperand(Num: 2)); |
| 1160 | } |
| 1161 | --I; |
| 1162 | CurDAG->ReplaceAllUsesWith(From: N, To: R.getNode()); |
| 1163 | ++I; |
| 1164 | MadeChange = true; |
| 1165 | continue; |
| 1166 | } |
| 1167 | case ISD::FP_ROUND: |
| 1168 | case ISD::STRICT_FP_ROUND: |
| 1169 | case ISD::FP_TO_SINT: |
| 1170 | case ISD::FP_TO_UINT: |
| 1171 | case ISD::STRICT_FP_TO_SINT: |
| 1172 | case ISD::STRICT_FP_TO_UINT: { |
| 1173 | // Replace vector fp_to_s/uint with their X86 specific equivalent so we |
| 1174 | // don't need 2 sets of patterns. |
| 1175 | if (!N->getSimpleValueType(ResNo: 0).isVector()) |
| 1176 | break; |
| 1177 | |
| 1178 | unsigned NewOpc; |
| 1179 | switch (N->getOpcode()) { |
| 1180 | default: llvm_unreachable("Unexpected opcode!" ); |
| 1181 | case ISD::FP_ROUND: NewOpc = X86ISD::VFPROUND; break; |
| 1182 | case ISD::STRICT_FP_ROUND: NewOpc = X86ISD::STRICT_VFPROUND; break; |
| 1183 | case ISD::STRICT_FP_TO_SINT: NewOpc = X86ISD::STRICT_CVTTP2SI; break; |
| 1184 | case ISD::FP_TO_SINT: NewOpc = X86ISD::CVTTP2SI; break; |
| 1185 | case ISD::STRICT_FP_TO_UINT: NewOpc = X86ISD::STRICT_CVTTP2UI; break; |
| 1186 | case ISD::FP_TO_UINT: NewOpc = X86ISD::CVTTP2UI; break; |
| 1187 | } |
| 1188 | SDValue Res; |
| 1189 | if (N->isStrictFPOpcode()) |
| 1190 | Res = |
| 1191 | CurDAG->getNode(Opcode: NewOpc, DL: SDLoc(N), ResultTys: {N->getValueType(ResNo: 0), MVT::Other}, |
| 1192 | Ops: {N->getOperand(Num: 0), N->getOperand(Num: 1)}); |
| 1193 | else |
| 1194 | Res = |
| 1195 | CurDAG->getNode(Opcode: NewOpc, DL: SDLoc(N), VT: N->getValueType(ResNo: 0), |
| 1196 | Operand: N->getOperand(Num: 0)); |
| 1197 | --I; |
| 1198 | CurDAG->ReplaceAllUsesWith(From: N, To: Res.getNode()); |
| 1199 | ++I; |
| 1200 | MadeChange = true; |
| 1201 | continue; |
| 1202 | } |
| 1203 | case ISD::SHL: |
| 1204 | case ISD::SRA: |
| 1205 | case ISD::SRL: { |
| 1206 | // Replace vector shifts with their X86 specific equivalent so we don't |
| 1207 | // need 2 sets of patterns. |
| 1208 | if (!N->getValueType(ResNo: 0).isVector()) |
| 1209 | break; |
| 1210 | |
| 1211 | unsigned NewOpc; |
| 1212 | switch (N->getOpcode()) { |
| 1213 | default: llvm_unreachable("Unexpected opcode!" ); |
| 1214 | case ISD::SHL: NewOpc = X86ISD::VSHLV; break; |
| 1215 | case ISD::SRA: NewOpc = X86ISD::VSRAV; break; |
| 1216 | case ISD::SRL: NewOpc = X86ISD::VSRLV; break; |
| 1217 | } |
| 1218 | SDValue Res = CurDAG->getNode(Opcode: NewOpc, DL: SDLoc(N), VT: N->getValueType(ResNo: 0), |
| 1219 | N1: N->getOperand(Num: 0), N2: N->getOperand(Num: 1)); |
| 1220 | --I; |
| 1221 | CurDAG->ReplaceAllUsesOfValueWith(From: SDValue(N, 0), To: Res); |
| 1222 | ++I; |
| 1223 | MadeChange = true; |
| 1224 | continue; |
| 1225 | } |
| 1226 | case ISD::ANY_EXTEND: |
| 1227 | case ISD::ANY_EXTEND_VECTOR_INREG: { |
| 1228 | // Replace vector any extend with the zero extend equivalents so we don't |
| 1229 | // need 2 sets of patterns. Ignore vXi1 extensions. |
| 1230 | if (!N->getValueType(ResNo: 0).isVector()) |
| 1231 | break; |
| 1232 | |
| 1233 | unsigned NewOpc; |
| 1234 | if (N->getOperand(Num: 0).getScalarValueSizeInBits() == 1) { |
| 1235 | assert(N->getOpcode() == ISD::ANY_EXTEND && |
| 1236 | "Unexpected opcode for mask vector!" ); |
| 1237 | NewOpc = ISD::SIGN_EXTEND; |
| 1238 | } else { |
| 1239 | NewOpc = N->getOpcode() == ISD::ANY_EXTEND |
| 1240 | ? ISD::ZERO_EXTEND |
| 1241 | : ISD::ZERO_EXTEND_VECTOR_INREG; |
| 1242 | } |
| 1243 | |
| 1244 | SDValue Res = CurDAG->getNode(Opcode: NewOpc, DL: SDLoc(N), VT: N->getValueType(ResNo: 0), |
| 1245 | Operand: N->getOperand(Num: 0)); |
| 1246 | --I; |
| 1247 | CurDAG->ReplaceAllUsesOfValueWith(From: SDValue(N, 0), To: Res); |
| 1248 | ++I; |
| 1249 | MadeChange = true; |
| 1250 | continue; |
| 1251 | } |
| 1252 | case ISD::FCEIL: |
| 1253 | case ISD::STRICT_FCEIL: |
| 1254 | case ISD::FFLOOR: |
| 1255 | case ISD::STRICT_FFLOOR: |
| 1256 | case ISD::FTRUNC: |
| 1257 | case ISD::STRICT_FTRUNC: |
| 1258 | case ISD::FROUNDEVEN: |
| 1259 | case ISD::STRICT_FROUNDEVEN: |
| 1260 | case ISD::FNEARBYINT: |
| 1261 | case ISD::STRICT_FNEARBYINT: |
| 1262 | case ISD::FRINT: |
| 1263 | case ISD::STRICT_FRINT: { |
| 1264 | // Replace fp rounding with their X86 specific equivalent so we don't |
| 1265 | // need 2 sets of patterns. |
| 1266 | unsigned Imm; |
| 1267 | switch (N->getOpcode()) { |
| 1268 | default: llvm_unreachable("Unexpected opcode!" ); |
| 1269 | case ISD::STRICT_FCEIL: |
| 1270 | case ISD::FCEIL: Imm = 0xA; break; |
| 1271 | case ISD::STRICT_FFLOOR: |
| 1272 | case ISD::FFLOOR: Imm = 0x9; break; |
| 1273 | case ISD::STRICT_FTRUNC: |
| 1274 | case ISD::FTRUNC: Imm = 0xB; break; |
| 1275 | case ISD::STRICT_FROUNDEVEN: |
| 1276 | case ISD::FROUNDEVEN: Imm = 0x8; break; |
| 1277 | case ISD::STRICT_FNEARBYINT: |
| 1278 | case ISD::FNEARBYINT: Imm = 0xC; break; |
| 1279 | case ISD::STRICT_FRINT: |
| 1280 | case ISD::FRINT: Imm = 0x4; break; |
| 1281 | } |
| 1282 | SDLoc dl(N); |
| 1283 | bool IsStrict = N->isStrictFPOpcode(); |
| 1284 | SDValue Res; |
| 1285 | if (IsStrict) |
| 1286 | Res = CurDAG->getNode(Opcode: X86ISD::STRICT_VRNDSCALE, DL: dl, |
| 1287 | ResultTys: {N->getValueType(ResNo: 0), MVT::Other}, |
| 1288 | Ops: {N->getOperand(Num: 0), N->getOperand(Num: 1), |
| 1289 | CurDAG->getTargetConstant(Val: Imm, DL: dl, VT: MVT::i32)}); |
| 1290 | else |
| 1291 | Res = CurDAG->getNode(Opcode: X86ISD::VRNDSCALE, DL: dl, VT: N->getValueType(ResNo: 0), |
| 1292 | N1: N->getOperand(Num: 0), |
| 1293 | N2: CurDAG->getTargetConstant(Val: Imm, DL: dl, VT: MVT::i32)); |
| 1294 | --I; |
| 1295 | CurDAG->ReplaceAllUsesWith(From: N, To: Res.getNode()); |
| 1296 | ++I; |
| 1297 | MadeChange = true; |
| 1298 | continue; |
| 1299 | } |
| 1300 | case X86ISD::FANDN: |
| 1301 | case X86ISD::FAND: |
| 1302 | case X86ISD::FOR: |
| 1303 | case X86ISD::FXOR: { |
| 1304 | // Widen scalar fp logic ops to vector to reduce isel patterns. |
| 1305 | // FIXME: Can we do this during lowering/combine. |
| 1306 | MVT VT = N->getSimpleValueType(ResNo: 0); |
| 1307 | if (VT.isVector() || VT == MVT::f128) |
| 1308 | break; |
| 1309 | |
| 1310 | MVT VecVT = VT == MVT::f64 ? MVT::v2f64 |
| 1311 | : VT == MVT::f32 ? MVT::v4f32 |
| 1312 | : MVT::v8f16; |
| 1313 | |
| 1314 | SDLoc dl(N); |
| 1315 | SDValue Op0 = CurDAG->getNode(Opcode: ISD::SCALAR_TO_VECTOR, DL: dl, VT: VecVT, |
| 1316 | Operand: N->getOperand(Num: 0)); |
| 1317 | SDValue Op1 = CurDAG->getNode(Opcode: ISD::SCALAR_TO_VECTOR, DL: dl, VT: VecVT, |
| 1318 | Operand: N->getOperand(Num: 1)); |
| 1319 | |
| 1320 | SDValue Res; |
| 1321 | if (Subtarget->hasSSE2()) { |
| 1322 | EVT IntVT = EVT(VecVT).changeVectorElementTypeToInteger(); |
| 1323 | Op0 = CurDAG->getNode(Opcode: ISD::BITCAST, DL: dl, VT: IntVT, Operand: Op0); |
| 1324 | Op1 = CurDAG->getNode(Opcode: ISD::BITCAST, DL: dl, VT: IntVT, Operand: Op1); |
| 1325 | unsigned Opc; |
| 1326 | switch (N->getOpcode()) { |
| 1327 |         default: llvm_unreachable("Unexpected opcode!");
| 1328 | case X86ISD::FANDN: Opc = X86ISD::ANDNP; break; |
| 1329 | case X86ISD::FAND: Opc = ISD::AND; break; |
| 1330 | case X86ISD::FOR: Opc = ISD::OR; break; |
| 1331 | case X86ISD::FXOR: Opc = ISD::XOR; break; |
| 1332 | } |
| 1333 | Res = CurDAG->getNode(Opcode: Opc, DL: dl, VT: IntVT, N1: Op0, N2: Op1); |
| 1334 | Res = CurDAG->getNode(Opcode: ISD::BITCAST, DL: dl, VT: VecVT, Operand: Res); |
| 1335 | } else { |
| 1336 | Res = CurDAG->getNode(Opcode: N->getOpcode(), DL: dl, VT: VecVT, N1: Op0, N2: Op1); |
| 1337 | } |
| 1338 | Res = CurDAG->getNode(Opcode: ISD::EXTRACT_VECTOR_ELT, DL: dl, VT, N1: Res, |
| 1339 | N2: CurDAG->getIntPtrConstant(Val: 0, DL: dl)); |
| 1340 | --I; |
| 1341 | CurDAG->ReplaceAllUsesOfValueWith(From: SDValue(N, 0), To: Res); |
| 1342 | ++I; |
| 1343 | MadeChange = true; |
| 1344 | continue; |
| 1345 | } |
| 1346 | } |
| 1347 | |
| 1348 | if (OptLevel != CodeGenOptLevel::None && |
| 1349 | // Only do this when the target can fold the load into the call or |
| 1350 | // jmp. |
| 1351 | !Subtarget->useIndirectThunkCalls() && |
| 1352 | ((N->getOpcode() == X86ISD::CALL && !Subtarget->slowTwoMemOps()) || |
| 1353 | (N->getOpcode() == X86ISD::TC_RETURN && |
| 1354 | (Subtarget->is64Bit() || |
| 1355 | !getTargetMachine().isPositionIndependent())))) { |
| 1356 | /// Also try moving call address load from outside callseq_start to just |
| 1357 | /// before the call to allow it to be folded. |
| 1358 | /// |
| 1359 | /// [Load chain] |
| 1360 | /// ^ |
| 1361 | /// | |
| 1362 | /// [Load] |
| 1363 | /// ^ ^ |
| 1364 | /// | | |
| 1365 | /// / \-- |
| 1366 | /// / | |
| 1367 | ///[CALLSEQ_START] | |
| 1368 | /// ^ | |
| 1369 | /// | | |
| 1370 | /// [LOAD/C2Reg] | |
| 1371 | /// | | |
| 1372 | /// \ / |
| 1373 | /// \ / |
| 1374 | /// [CALL] |
| 1375 | bool HasCallSeq = N->getOpcode() == X86ISD::CALL; |
| 1376 | SDValue Chain = N->getOperand(Num: 0); |
| 1377 | SDValue Load = N->getOperand(Num: 1); |
| 1378 | if (!isCalleeLoad(Callee: Load, Chain, HasCallSeq)) |
| 1379 | continue; |
| 1380 | moveBelowOrigChain(CurDAG, Load, Call: SDValue(N, 0), OrigChain: Chain); |
| 1381 | ++NumLoadMoved; |
| 1382 | MadeChange = true; |
| 1383 | continue; |
| 1384 | } |
| 1385 | |
| 1386 |     // Lower fpround and fpextend nodes that target the FP stack to a store and
| 1387 |     // load through the stack. This is a gross hack. We would like to simply mark
| 1388 | // these as being illegal, but when we do that, legalize produces these when |
| 1389 | // it expands calls, then expands these in the same legalize pass. We would |
| 1390 | // like dag combine to be able to hack on these between the call expansion |
| 1391 | // and the node legalization. As such this pass basically does "really |
| 1392 | // late" legalization of these inline with the X86 isel pass. |
| 1393 | // FIXME: This should only happen when not compiled with -O0. |
| 1394 | switch (N->getOpcode()) { |
| 1395 | default: continue; |
| 1396 | case ISD::FP_ROUND: |
| 1397 | case ISD::FP_EXTEND: |
| 1398 | { |
| 1399 | MVT SrcVT = N->getOperand(Num: 0).getSimpleValueType(); |
| 1400 | MVT DstVT = N->getSimpleValueType(ResNo: 0); |
| 1401 | |
| 1402 |       // If either type is a vector, the FP stack is not involved.
| 1403 | if (SrcVT.isVector() || DstVT.isVector()) |
| 1404 | continue; |
| 1405 | |
| 1406 | // If the source and destination are SSE registers, then this is a legal |
| 1407 | // conversion that should not be lowered. |
| 1408 | const X86TargetLowering *X86Lowering = |
| 1409 | static_cast<const X86TargetLowering *>(TLI); |
| 1410 | bool SrcIsSSE = X86Lowering->isScalarFPTypeInSSEReg(VT: SrcVT); |
| 1411 | bool DstIsSSE = X86Lowering->isScalarFPTypeInSSEReg(VT: DstVT); |
| 1412 | if (SrcIsSSE && DstIsSSE) |
| 1413 | continue; |
| 1414 | |
| 1415 | if (!SrcIsSSE && !DstIsSSE) { |
| 1416 | // If this is an FPStack extension, it is a noop. |
| 1417 | if (N->getOpcode() == ISD::FP_EXTEND) |
| 1418 | continue; |
| 1419 | // If this is a value-preserving FPStack truncation, it is a noop. |
| 1420 | if (N->getConstantOperandVal(Num: 1)) |
| 1421 | continue; |
| 1422 | } |
| 1423 | |
| 1424 | // Here we could have an FP stack truncation or an FPStack <-> SSE convert. |
| 1425 | // FPStack has extload and truncstore. SSE can fold direct loads into other |
| 1426 | // operations. Based on this, decide what we want to do. |
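|      |       // E.g. for an f64 -> f32 FP_ROUND the slot is f32: the truncating store
|      |       // performs the rounding and the load back is a plain f32 load. For
|      |       // FP_EXTEND the store is plain and the extending load widens the value.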
| 1427 | MVT MemVT = (N->getOpcode() == ISD::FP_ROUND) ? DstVT : SrcVT; |
| 1428 | SDValue MemTmp = CurDAG->CreateStackTemporary(VT: MemVT); |
| 1429 | int SPFI = cast<FrameIndexSDNode>(Val&: MemTmp)->getIndex(); |
| 1430 | MachinePointerInfo MPI = |
| 1431 | MachinePointerInfo::getFixedStack(MF&: CurDAG->getMachineFunction(), FI: SPFI); |
| 1432 | SDLoc dl(N); |
| 1433 | |
| 1434 | // FIXME: optimize the case where the src/dest is a load or store? |
| 1435 | |
| 1436 | SDValue Store = CurDAG->getTruncStore( |
| 1437 | Chain: CurDAG->getEntryNode(), dl, Val: N->getOperand(Num: 0), Ptr: MemTmp, PtrInfo: MPI, SVT: MemVT); |
| 1438 | SDValue Result = CurDAG->getExtLoad(ExtType: ISD::EXTLOAD, dl, VT: DstVT, Chain: Store, |
| 1439 | Ptr: MemTmp, PtrInfo: MPI, MemVT); |
| 1440 | |
| 1441 | // We're about to replace all uses of the FP_ROUND/FP_EXTEND with the |
| 1442 |       // extload we created. This will cause general havoc on the dag because
| 1443 | // anything below the conversion could be folded into other existing nodes. |
| 1444 | // To avoid invalidating 'I', back it up to the convert node. |
| 1445 | --I; |
| 1446 | CurDAG->ReplaceAllUsesOfValueWith(From: SDValue(N, 0), To: Result); |
| 1447 | break; |
| 1448 | } |
| 1449 | |
| 1450 |     // The sequence of events for lowering STRICT_FP versions of these nodes
| 1451 |     // requires dealing with the chain differently, as there is a preexisting chain.
| 1452 | case ISD::STRICT_FP_ROUND: |
| 1453 | case ISD::STRICT_FP_EXTEND: |
| 1454 | { |
| 1455 | MVT SrcVT = N->getOperand(Num: 1).getSimpleValueType(); |
| 1456 | MVT DstVT = N->getSimpleValueType(ResNo: 0); |
| 1457 | |
| 1458 |       // If either type is a vector, the FP stack is not involved.
| 1459 | if (SrcVT.isVector() || DstVT.isVector()) |
| 1460 | continue; |
| 1461 | |
| 1462 | // If the source and destination are SSE registers, then this is a legal |
| 1463 | // conversion that should not be lowered. |
| 1464 | const X86TargetLowering *X86Lowering = |
| 1465 | static_cast<const X86TargetLowering *>(TLI); |
| 1466 | bool SrcIsSSE = X86Lowering->isScalarFPTypeInSSEReg(VT: SrcVT); |
| 1467 | bool DstIsSSE = X86Lowering->isScalarFPTypeInSSEReg(VT: DstVT); |
| 1468 | if (SrcIsSSE && DstIsSSE) |
| 1469 | continue; |
| 1470 | |
| 1471 | if (!SrcIsSSE && !DstIsSSE) { |
| 1472 | // If this is an FPStack extension, it is a noop. |
| 1473 | if (N->getOpcode() == ISD::STRICT_FP_EXTEND) |
| 1474 | continue; |
| 1475 | // If this is a value-preserving FPStack truncation, it is a noop. |
| 1476 | if (N->getConstantOperandVal(Num: 2)) |
| 1477 | continue; |
| 1478 | } |
| 1479 | |
| 1480 | // Here we could have an FP stack truncation or an FPStack <-> SSE convert. |
| 1481 | // FPStack has extload and truncstore. SSE can fold direct loads into other |
| 1482 | // operations. Based on this, decide what we want to do. |
| 1483 | MVT MemVT = (N->getOpcode() == ISD::STRICT_FP_ROUND) ? DstVT : SrcVT; |
| 1484 | SDValue MemTmp = CurDAG->CreateStackTemporary(VT: MemVT); |
| 1485 | int SPFI = cast<FrameIndexSDNode>(Val&: MemTmp)->getIndex(); |
| 1486 | MachinePointerInfo MPI = |
| 1487 | MachinePointerInfo::getFixedStack(MF&: CurDAG->getMachineFunction(), FI: SPFI); |
| 1488 | SDLoc dl(N); |
| 1489 | |
| 1490 | // FIXME: optimize the case where the src/dest is a load or store? |
| 1491 | |
| 1492 |       // Since the operation is StrictFP, use the preexisting chain.
| 1493 | SDValue Store, Result; |
| 1494 | if (!SrcIsSSE) { |
| 1495 | SDVTList VTs = CurDAG->getVTList(VT: MVT::Other); |
| 1496 | SDValue Ops[] = {N->getOperand(Num: 0), N->getOperand(Num: 1), MemTmp}; |
| 1497 | Store = CurDAG->getMemIntrinsicNode(Opcode: X86ISD::FST, dl, VTList: VTs, Ops, MemVT, |
| 1498 | PtrInfo: MPI, /*Align*/ Alignment: std::nullopt, |
| 1499 | Flags: MachineMemOperand::MOStore); |
| 1500 | if (N->getFlags().hasNoFPExcept()) { |
| 1501 | SDNodeFlags Flags = Store->getFlags(); |
| 1502 | Flags.setNoFPExcept(true); |
| 1503 | Store->setFlags(Flags); |
| 1504 | } |
| 1505 | } else { |
| 1506 |         assert(SrcVT == MemVT && "Unexpected VT!");
| 1507 | Store = CurDAG->getStore(Chain: N->getOperand(Num: 0), dl, Val: N->getOperand(Num: 1), Ptr: MemTmp, |
| 1508 | PtrInfo: MPI); |
| 1509 | } |
| 1510 | |
| 1511 | if (!DstIsSSE) { |
| 1512 | SDVTList VTs = CurDAG->getVTList(VT1: DstVT, VT2: MVT::Other); |
| 1513 | SDValue Ops[] = {Store, MemTmp}; |
| 1514 | Result = CurDAG->getMemIntrinsicNode( |
| 1515 | Opcode: X86ISD::FLD, dl, VTList: VTs, Ops, MemVT, PtrInfo: MPI, |
| 1516 | /*Align*/ Alignment: std::nullopt, Flags: MachineMemOperand::MOLoad); |
| 1517 | if (N->getFlags().hasNoFPExcept()) { |
| 1518 | SDNodeFlags Flags = Result->getFlags(); |
| 1519 | Flags.setNoFPExcept(true); |
| 1520 | Result->setFlags(Flags); |
| 1521 | } |
| 1522 | } else { |
| 1523 |         assert(DstVT == MemVT && "Unexpected VT!");
| 1524 | Result = CurDAG->getLoad(VT: DstVT, dl, Chain: Store, Ptr: MemTmp, PtrInfo: MPI); |
| 1525 | } |
| 1526 | |
| 1527 | // We're about to replace all uses of the FP_ROUND/FP_EXTEND with the |
| 1528 |       // extload we created. This will cause general havoc on the dag because
| 1529 | // anything below the conversion could be folded into other existing nodes. |
| 1530 | // To avoid invalidating 'I', back it up to the convert node. |
| 1531 | --I; |
| 1532 | CurDAG->ReplaceAllUsesWith(From: N, To: Result.getNode()); |
| 1533 | break; |
| 1534 | } |
| 1535 | } |
| 1536 | |
| 1537 | |
| 1538 | // Now that we did that, the node is dead. Increment the iterator to the |
| 1539 | // next node to process, then delete N. |
| 1540 | ++I; |
| 1541 | MadeChange = true; |
| 1542 | } |
| 1543 | |
| 1544 | // Remove any dead nodes that may have been left behind. |
| 1545 | if (MadeChange) |
| 1546 | CurDAG->RemoveDeadNodes(); |
| 1547 | } |
| 1548 | |
| 1549 | // Look for a redundant movzx/movsx that can occur after an 8-bit divrem. |
| 1550 | bool X86DAGToDAGISel::tryOptimizeRem8Extend(SDNode *N) { |
| 1551 | unsigned Opc = N->getMachineOpcode(); |
| 1552 | if (Opc != X86::MOVZX32rr8 && Opc != X86::MOVSX32rr8 && |
| 1553 | Opc != X86::MOVSX64rr8) |
| 1554 | return false; |
| 1555 | |
| 1556 | SDValue N0 = N->getOperand(Num: 0); |
| 1557 | |
| 1558 |   // We need to be extracting the low byte of an extend.
| 1559 | if (!N0.isMachineOpcode() || |
| 1560 | N0.getMachineOpcode() != TargetOpcode::EXTRACT_SUBREG || |
| 1561 | N0.getConstantOperandVal(i: 1) != X86::sub_8bit) |
| 1562 | return false; |
| 1563 | |
| 1564 | // We're looking for either a movsx or movzx to match the original opcode. |
| 1565 | unsigned ExpectedOpc = Opc == X86::MOVZX32rr8 ? X86::MOVZX32rr8_NOREX |
| 1566 | : X86::MOVSX32rr8_NOREX; |
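|      |   // Note: for MOVSX64rr8 we also expect the 32-bit NOREX movsx here; the
|      |   // remaining 32 -> 64 extension is re-emitted below.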
| 1567 | SDValue N00 = N0.getOperand(i: 0); |
| 1568 | if (!N00.isMachineOpcode() || N00.getMachineOpcode() != ExpectedOpc) |
| 1569 | return false; |
| 1570 | |
| 1571 | if (Opc == X86::MOVSX64rr8) { |
| 1572 |     // We had a sign extend from 8 to 64 bits, so we still need to extend from
| 1573 |     // 32 to 64.
| 1574 | MachineSDNode *Extend = CurDAG->getMachineNode(Opcode: X86::MOVSX64rr32, dl: SDLoc(N), |
| 1575 | VT: MVT::i64, Op1: N00); |
| 1576 | ReplaceUses(F: N, T: Extend); |
| 1577 | } else { |
| 1578 |     // OK, we can drop this extend and just use the original extend.
| 1579 | ReplaceUses(F: N, T: N00.getNode()); |
| 1580 | } |
| 1581 | |
| 1582 | return true; |
| 1583 | } |
| 1584 | |
| 1585 | void X86DAGToDAGISel::PostprocessISelDAG() { |
| 1586 | // Skip peepholes at -O0. |
| 1587 | if (TM.getOptLevel() == CodeGenOptLevel::None) |
| 1588 | return; |
| 1589 | |
| 1590 | SelectionDAG::allnodes_iterator Position = CurDAG->allnodes_end(); |
| 1591 | |
| 1592 | bool MadeChange = false; |
| 1593 | while (Position != CurDAG->allnodes_begin()) { |
| 1594 | SDNode *N = &*--Position; |
| 1595 | // Skip dead nodes and any non-machine opcodes. |
| 1596 | if (N->use_empty() || !N->isMachineOpcode()) |
| 1597 | continue; |
| 1598 | |
| 1599 | if (tryOptimizeRem8Extend(N)) { |
| 1600 | MadeChange = true; |
| 1601 | continue; |
| 1602 | } |
| 1603 | |
| 1604 | unsigned Opc = N->getMachineOpcode(); |
| 1605 | switch (Opc) { |
| 1606 | default: |
| 1607 | continue; |
| 1608 |     // ANDrr/rm + TESTrr -> TESTrr/TESTmr
| 1609 | case X86::TEST8rr: |
| 1610 | case X86::TEST16rr: |
| 1611 | case X86::TEST32rr: |
| 1612 | case X86::TEST64rr: |
| 1613 | // ANDrr/rm + CTESTrr -> CTESTrr/CTESTmr |
| 1614 | case X86::CTEST8rr: |
| 1615 | case X86::CTEST16rr: |
| 1616 | case X86::CTEST32rr: |
| 1617 | case X86::CTEST64rr: { |
| 1618 | auto &Op0 = N->getOperand(Num: 0); |
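|      |       // The TEST must be comparing the AND result against itself, and those
|      |       // two operands must be the only uses of that value.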
| 1619 | if (Op0 != N->getOperand(Num: 1) || !Op0->hasNUsesOfValue(NUses: 2, Value: Op0.getResNo()) || |
| 1620 | !Op0.isMachineOpcode()) |
| 1621 | continue; |
| 1622 | SDValue And = N->getOperand(Num: 0); |
| 1623 | #define CASE_ND(OP) \ |
| 1624 | case X86::OP: \ |
| 1625 | case X86::OP##_ND: |
| 1626 | switch (And.getMachineOpcode()) { |
| 1627 | default: |
| 1628 | continue; |
| 1629 | CASE_ND(AND8rr) |
| 1630 | CASE_ND(AND16rr) |
| 1631 | CASE_ND(AND32rr) |
| 1632 | CASE_ND(AND64rr) { |
| 1633 | if (And->hasAnyUseOfValue(Value: 1)) |
| 1634 | continue; |
| 1635 | SmallVector<SDValue> Ops(N->op_values()); |
| 1636 | Ops[0] = And.getOperand(i: 0); |
| 1637 | Ops[1] = And.getOperand(i: 1); |
| 1638 | MachineSDNode *Test = |
| 1639 | CurDAG->getMachineNode(Opcode: Opc, dl: SDLoc(N), VT: MVT::i32, Ops); |
| 1640 | ReplaceUses(F: N, T: Test); |
| 1641 | MadeChange = true; |
| 1642 | continue; |
| 1643 | } |
| 1644 | CASE_ND(AND8rm) |
| 1645 | CASE_ND(AND16rm) |
| 1646 | CASE_ND(AND32rm) |
| 1647 | CASE_ND(AND64rm) { |
| 1648 | if (And->hasAnyUseOfValue(Value: 1)) |
| 1649 | continue; |
| 1650 | unsigned NewOpc; |
| 1651 | bool IsCTESTCC = X86::isCTESTCC(Opcode: Opc); |
| 1652 | #define FROM_TO(A, B) \ |
| 1653 | CASE_ND(A) NewOpc = IsCTESTCC ? X86::C##B : X86::B; \ |
| 1654 | break; |
| 1655 | switch (And.getMachineOpcode()) { |
| 1656 | FROM_TO(AND8rm, TEST8mr); |
| 1657 | FROM_TO(AND16rm, TEST16mr); |
| 1658 | FROM_TO(AND32rm, TEST32mr); |
| 1659 | FROM_TO(AND64rm, TEST64mr); |
| 1660 | } |
| 1661 | #undef FROM_TO |
| 1662 | #undef CASE_ND |
| 1663 |       // Need to swap the memory and register operands.
| 1664 | SmallVector<SDValue> Ops = {And.getOperand(i: 1), And.getOperand(i: 2), |
| 1665 | And.getOperand(i: 3), And.getOperand(i: 4), |
| 1666 | And.getOperand(i: 5), And.getOperand(i: 0)}; |
| 1667 | // CC, Cflags. |
| 1668 | if (IsCTESTCC) { |
| 1669 | Ops.push_back(Elt: N->getOperand(Num: 2)); |
| 1670 | Ops.push_back(Elt: N->getOperand(Num: 3)); |
| 1671 | } |
| 1672 | // Chain of memory load |
| 1673 | Ops.push_back(Elt: And.getOperand(i: 6)); |
| 1674 | // Glue |
| 1675 | if (IsCTESTCC) |
| 1676 | Ops.push_back(Elt: N->getOperand(Num: 4)); |
| 1677 | |
| 1678 | MachineSDNode *Test = CurDAG->getMachineNode( |
| 1679 | Opcode: NewOpc, dl: SDLoc(N), VT1: MVT::i32, VT2: MVT::Other, Ops); |
| 1680 | CurDAG->setNodeMemRefs( |
| 1681 | N: Test, NewMemRefs: cast<MachineSDNode>(Val: And.getNode())->memoperands()); |
| 1682 | ReplaceUses(F: And.getValue(R: 2), T: SDValue(Test, 1)); |
| 1683 | ReplaceUses(F: SDValue(N, 0), T: SDValue(Test, 0)); |
| 1684 | MadeChange = true; |
| 1685 | continue; |
| 1686 | } |
| 1687 | } |
| 1688 | } |
| 1689 | // Look for a KAND+KORTEST and turn it into KTEST if only the zero flag is |
| 1690 | // used. We're doing this late so we can prefer to fold the AND into masked |
| 1691 | // comparisons. Doing that can be better for the live range of the mask |
| 1692 | // register. |
| 1693 | case X86::KORTESTBkk: |
| 1694 | case X86::KORTESTWkk: |
| 1695 | case X86::KORTESTDkk: |
| 1696 | case X86::KORTESTQkk: { |
| 1697 | SDValue Op0 = N->getOperand(Num: 0); |
| 1698 | if (Op0 != N->getOperand(Num: 1) || !N->isOnlyUserOf(N: Op0.getNode()) || |
| 1699 | !Op0.isMachineOpcode() || !onlyUsesZeroFlag(Flags: SDValue(N, 0))) |
| 1700 | continue; |
| 1701 | #define CASE(A) \ |
| 1702 | case X86::A: \ |
| 1703 | break; |
| 1704 | switch (Op0.getMachineOpcode()) { |
| 1705 | default: |
| 1706 | continue; |
| 1707 | CASE(KANDBkk) |
| 1708 | CASE(KANDWkk) |
| 1709 | CASE(KANDDkk) |
| 1710 | CASE(KANDQkk) |
| 1711 | } |
| 1712 | unsigned NewOpc; |
| 1713 | #define FROM_TO(A, B) \ |
| 1714 | case X86::A: \ |
| 1715 | NewOpc = X86::B; \ |
| 1716 | break; |
| 1717 | switch (Opc) { |
| 1718 | FROM_TO(KORTESTBkk, KTESTBkk) |
| 1719 | FROM_TO(KORTESTWkk, KTESTWkk) |
| 1720 | FROM_TO(KORTESTDkk, KTESTDkk) |
| 1721 | FROM_TO(KORTESTQkk, KTESTQkk) |
| 1722 | } |
| 1723 | // KANDW is legal with AVX512F, but KTESTW requires AVX512DQ. The other |
| 1724 | // KAND instructions and KTEST use the same ISA feature. |
| 1725 | if (NewOpc == X86::KTESTWkk && !Subtarget->hasDQI()) |
| 1726 | continue; |
| 1727 | #undef FROM_TO |
| 1728 | MachineSDNode *KTest = CurDAG->getMachineNode( |
| 1729 | Opcode: NewOpc, dl: SDLoc(N), VT: MVT::i32, Op1: Op0.getOperand(i: 0), Op2: Op0.getOperand(i: 1)); |
| 1730 | ReplaceUses(F: N, T: KTest); |
| 1731 | MadeChange = true; |
| 1732 | continue; |
| 1733 | } |
| 1734 |     // Attempt to remove vector moves that were inserted to zero upper bits.
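|      |     // SUBREG_TO_REG asserts that the destination bits outside the inserted
|      |     // subregister are zero; VEX/EVEX/XOP-encoded producers already zero the
|      |     // upper part of the register, which is what makes the move removable.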
| 1735 | case TargetOpcode::SUBREG_TO_REG: { |
| 1736 | unsigned SubRegIdx = N->getConstantOperandVal(Num: 2); |
| 1737 | if (SubRegIdx != X86::sub_xmm && SubRegIdx != X86::sub_ymm) |
| 1738 | continue; |
| 1739 | |
| 1740 | SDValue Move = N->getOperand(Num: 1); |
| 1741 | if (!Move.isMachineOpcode()) |
| 1742 | continue; |
| 1743 | |
| 1744 |       // Make sure it's one of the move opcodes we recognize.
| 1745 | switch (Move.getMachineOpcode()) { |
| 1746 | default: |
| 1747 | continue; |
| 1748 | CASE(VMOVAPDrr) CASE(VMOVUPDrr) |
| 1749 | CASE(VMOVAPSrr) CASE(VMOVUPSrr) |
| 1750 | CASE(VMOVDQArr) CASE(VMOVDQUrr) |
| 1751 | CASE(VMOVAPDYrr) CASE(VMOVUPDYrr) |
| 1752 | CASE(VMOVAPSYrr) CASE(VMOVUPSYrr) |
| 1753 | CASE(VMOVDQAYrr) CASE(VMOVDQUYrr) |
| 1754 | CASE(VMOVAPDZ128rr) CASE(VMOVUPDZ128rr) |
| 1755 | CASE(VMOVAPSZ128rr) CASE(VMOVUPSZ128rr) |
| 1756 | CASE(VMOVDQA32Z128rr) CASE(VMOVDQU32Z128rr) |
| 1757 | CASE(VMOVDQA64Z128rr) CASE(VMOVDQU64Z128rr) |
| 1758 | CASE(VMOVAPDZ256rr) CASE(VMOVUPDZ256rr) |
| 1759 | CASE(VMOVAPSZ256rr) CASE(VMOVUPSZ256rr) |
| 1760 | CASE(VMOVDQA32Z256rr) CASE(VMOVDQU32Z256rr) |
| 1761 | CASE(VMOVDQA64Z256rr) CASE(VMOVDQU64Z256rr) |
| 1762 | } |
| 1763 | #undef CASE |
| 1764 | |
| 1765 | SDValue In = Move.getOperand(i: 0); |
| 1766 | if (!In.isMachineOpcode() || |
| 1767 | In.getMachineOpcode() <= TargetOpcode::GENERIC_OP_END) |
| 1768 | continue; |
| 1769 | |
| 1770 |       // Make sure the instruction has a VEX, XOP, or EVEX prefix. This excludes
| 1771 |       // the SHA instructions, which use a legacy encoding.
| 1772 | uint64_t TSFlags = getInstrInfo()->get(Opcode: In.getMachineOpcode()).TSFlags; |
| 1773 | if ((TSFlags & X86II::EncodingMask) != X86II::VEX && |
| 1774 | (TSFlags & X86II::EncodingMask) != X86II::EVEX && |
| 1775 | (TSFlags & X86II::EncodingMask) != X86II::XOP) |
| 1776 | continue; |
| 1777 | |
| 1778 |       // The producing instruction is another vector instruction, so we can drop
| 1779 |       // the move.
| 1780 | CurDAG->UpdateNodeOperands(N, Op1: N->getOperand(Num: 0), Op2: In, Op3: N->getOperand(Num: 2)); |
| 1781 | MadeChange = true; |
| 1782 | } |
| 1783 | } |
| 1784 | } |
| 1785 | |
| 1786 | if (MadeChange) |
| 1787 | CurDAG->RemoveDeadNodes(); |
| 1788 | } |
| 1789 | |
| 1790 | |
| 1791 | /// Emit any code that needs to be executed only in the main function. |
| 1792 | void X86DAGToDAGISel::emitSpecialCodeForMain() { |
| 1793 | if (Subtarget->isTargetCygMing()) { |
| 1794 | TargetLowering::ArgListTy Args; |
| 1795 | auto &DL = CurDAG->getDataLayout(); |
| 1796 | |
| 1797 | TargetLowering::CallLoweringInfo CLI(*CurDAG); |
| 1798 | CLI.setChain(CurDAG->getRoot()) |
| 1799 | .setCallee(CC: CallingConv::C, ResultType: Type::getVoidTy(C&: *CurDAG->getContext()), |
| 1800 | Target: CurDAG->getExternalSymbol(Sym: "__main" , VT: TLI->getPointerTy(DL)), |
| 1801 | ArgsList: std::move(Args)); |
| 1802 | const TargetLowering &TLI = CurDAG->getTargetLoweringInfo(); |
| 1803 | std::pair<SDValue, SDValue> Result = TLI.LowerCallTo(CLI); |
| 1804 | CurDAG->setRoot(Result.second); |
| 1805 | } |
| 1806 | } |
| 1807 | |
| 1808 | void X86DAGToDAGISel::emitFunctionEntryCode() { |
| 1809 | // If this is main, emit special code for main. |
| 1810 | const Function &F = MF->getFunction(); |
| 1811 |   if (F.hasExternalLinkage() && F.getName() == "main")
| 1812 | emitSpecialCodeForMain(); |
| 1813 | } |
| 1814 | |
| 1815 | static bool isDispSafeForFrameIndexOrRegBase(int64_t Val) { |
| 1816 | // We can run into an issue where a frame index or a register base |
| 1817 | // includes a displacement that, when added to the explicit displacement, |
| 1818 | // will overflow the displacement field. Assuming that the |
| 1819 | // displacement fits into a 31-bit integer (which is only slightly more |
| 1820 | // aggressive than the current fundamental assumption that it fits into |
| 1821 | // a 32-bit integer), a 31-bit disp should always be safe. |
| 1822 | return isInt<31>(x: Val); |
| 1823 | } |
| 1824 | |
| 1825 | bool X86DAGToDAGISel::foldOffsetIntoAddress(uint64_t Offset, |
| 1826 | X86ISelAddressMode &AM) { |
| 1827 | // We may have already matched a displacement and the caller just added the |
| 1828 | // symbolic displacement. So we still need to do the checks even if Offset |
| 1829 | // is zero. |
| 1830 | |
| 1831 | int64_t Val = AM.Disp + Offset; |
| 1832 | |
| 1833 | // Cannot combine ExternalSymbol displacements with integer offsets. |
| 1834 | if (Val != 0 && (AM.ES || AM.MCSym)) |
| 1835 | return true; |
| 1836 | |
| 1837 | CodeModel::Model M = TM.getCodeModel(); |
| 1838 | if (Subtarget->is64Bit()) { |
| 1839 | if (Val != 0 && |
| 1840 | !X86::isOffsetSuitableForCodeModel(Offset: Val, M, |
| 1841 | hasSymbolicDisplacement: AM.hasSymbolicDisplacement())) |
| 1842 | return true; |
| 1843 | // In addition to the checks required for a register base, check that |
| 1844 | // we do not try to use an unsafe Disp with a frame index. |
| 1845 | if (AM.BaseType == X86ISelAddressMode::FrameIndexBase && |
| 1846 | !isDispSafeForFrameIndexOrRegBase(Val)) |
| 1847 | return true; |
| 1848 | // In ILP32 (x32) mode, pointers are 32 bits and need to be zero-extended to |
| 1849 | // 64 bits. Instructions with 32-bit register addresses perform this zero |
| 1850 | // extension for us and we can safely ignore the high bits of Offset. |
| 1851 | // Instructions with only a 32-bit immediate address do not, though: they |
| 1852 |     // sign extend instead. This means only the low 2GB of the address space is
| 1853 |     // directly addressable; we need indirect addressing for the high 2GB of
| 1854 |     // address space.
| 1855 | // TODO: Some of the earlier checks may be relaxed for ILP32 mode as the |
| 1856 | // implicit zero extension of instructions would cover up any problem. |
| 1857 | // However, we have asserts elsewhere that get triggered if we do, so keep |
| 1858 | // the checks for now. |
| 1859 | // TODO: We would actually be able to accept these, as well as the same |
| 1860 | // addresses in LP64 mode, by adding the EIZ pseudo-register as an operand |
| 1861 | // to get an address size override to be emitted. However, this |
| 1862 | // pseudo-register is not part of any register class and therefore causes |
| 1863 | // MIR verification to fail. |
| 1864 | if (Subtarget->isTarget64BitILP32() && |
| 1865 | !isDispSafeForFrameIndexOrRegBase(Val: (uint32_t)Val) && |
| 1866 | !AM.hasBaseOrIndexReg()) |
| 1867 | return true; |
| 1868 | } else if (AM.hasBaseOrIndexReg() && !isDispSafeForFrameIndexOrRegBase(Val)) |
| 1869 | // For 32-bit X86, make sure the displacement still isn't close to the |
| 1870 | // expressible limit. |
| 1871 | return true; |
| 1872 | AM.Disp = Val; |
| 1873 | return false; |
| 1874 | } |
| 1875 | |
| 1876 | bool X86DAGToDAGISel::matchLoadInAddress(LoadSDNode *N, X86ISelAddressMode &AM, |
| 1877 | bool AllowSegmentRegForX32) { |
| 1878 | SDValue Address = N->getOperand(Num: 1); |
| 1879 | |
| 1880 | // load gs:0 -> GS segment register. |
| 1881 | // load fs:0 -> FS segment register. |
| 1882 | // |
| 1883 | // This optimization is generally valid because the GNU TLS model defines that |
| 1884 | // gs:0 (or fs:0 on X86-64) contains its own address. However, for X86-64 mode |
| 1885 | // with 32-bit registers, as we get in ILP32 mode, those registers are first |
| 1886 |   // zero-extended to 64 bits and then added to the base address, which gives
| 1887 | // unwanted results when the register holds a negative value. |
| 1888 | // For more information see http://people.redhat.com/drepper/tls.pdf |
| 1889 | if (isNullConstant(V: Address) && AM.Segment.getNode() == nullptr && |
| 1890 | !IndirectTlsSegRefs && |
| 1891 | (Subtarget->isTargetGlibc() || Subtarget->isTargetAndroid() || |
| 1892 | Subtarget->isTargetFuchsia())) { |
| 1893 | if (Subtarget->isTarget64BitILP32() && !AllowSegmentRegForX32) |
| 1894 | return true; |
| 1895 | switch (N->getPointerInfo().getAddrSpace()) { |
| 1896 | case X86AS::GS: |
| 1897 | AM.Segment = CurDAG->getRegister(Reg: X86::GS, VT: MVT::i16); |
| 1898 | return false; |
| 1899 | case X86AS::FS: |
| 1900 | AM.Segment = CurDAG->getRegister(Reg: X86::FS, VT: MVT::i16); |
| 1901 | return false; |
| 1902 | // Address space X86AS::SS is not handled here, because it is not used to |
| 1903 | // address TLS areas. |
| 1904 | } |
| 1905 | } |
| 1906 | |
| 1907 | return true; |
| 1908 | } |
| 1909 | |
| 1910 | /// Try to match X86ISD::Wrapper and X86ISD::WrapperRIP nodes into an addressing |
| 1911 | /// mode. These wrap things that will resolve down into a symbol reference. |
| 1912 | /// If no match is possible, this returns true, otherwise it returns false. |
| 1913 | bool X86DAGToDAGISel::matchWrapper(SDValue N, X86ISelAddressMode &AM) { |
| 1914 | // If the addressing mode already has a symbol as the displacement, we can |
| 1915 | // never match another symbol. |
| 1916 | if (AM.hasSymbolicDisplacement()) |
| 1917 | return true; |
| 1918 | |
| 1919 | bool IsRIPRelTLS = false; |
| 1920 | bool IsRIPRel = N.getOpcode() == X86ISD::WrapperRIP; |
| 1921 | if (IsRIPRel) { |
| 1922 | SDValue Val = N.getOperand(i: 0); |
| 1923 | if (Val.getOpcode() == ISD::TargetGlobalTLSAddress) |
| 1924 | IsRIPRelTLS = true; |
| 1925 | } |
| 1926 | |
| 1927 | // We can't use an addressing mode in the 64-bit large code model. |
| 1928 | // Global TLS addressing is an exception. In the medium code model, |
| 1929 |   // we can use such a mode when RIP wrappers are present.
| 1930 | // That signifies access to globals that are known to be "near", |
| 1931 | // such as the GOT itself. |
| 1932 | CodeModel::Model M = TM.getCodeModel(); |
| 1933 | if (Subtarget->is64Bit() && M == CodeModel::Large && !IsRIPRelTLS) |
| 1934 | return true; |
| 1935 | |
| 1936 | // Base and index reg must be 0 in order to use %rip as base. |
| 1937 | if (IsRIPRel && AM.hasBaseOrIndexReg()) |
| 1938 | return true; |
| 1939 | |
| 1940 | // Make a local copy in case we can't do this fold. |
| 1941 | X86ISelAddressMode Backup = AM; |
| 1942 | |
| 1943 | int64_t Offset = 0; |
| 1944 | SDValue N0 = N.getOperand(i: 0); |
| 1945 | if (auto *G = dyn_cast<GlobalAddressSDNode>(Val&: N0)) { |
| 1946 | AM.GV = G->getGlobal(); |
| 1947 | AM.SymbolFlags = G->getTargetFlags(); |
| 1948 | Offset = G->getOffset(); |
| 1949 | } else if (auto *CP = dyn_cast<ConstantPoolSDNode>(Val&: N0)) { |
| 1950 | AM.CP = CP->getConstVal(); |
| 1951 | AM.Alignment = CP->getAlign(); |
| 1952 | AM.SymbolFlags = CP->getTargetFlags(); |
| 1953 | Offset = CP->getOffset(); |
| 1954 | } else if (auto *S = dyn_cast<ExternalSymbolSDNode>(Val&: N0)) { |
| 1955 | AM.ES = S->getSymbol(); |
| 1956 | AM.SymbolFlags = S->getTargetFlags(); |
| 1957 | } else if (auto *S = dyn_cast<MCSymbolSDNode>(Val&: N0)) { |
| 1958 | AM.MCSym = S->getMCSymbol(); |
| 1959 | } else if (auto *J = dyn_cast<JumpTableSDNode>(Val&: N0)) { |
| 1960 | AM.JT = J->getIndex(); |
| 1961 | AM.SymbolFlags = J->getTargetFlags(); |
| 1962 | } else if (auto *BA = dyn_cast<BlockAddressSDNode>(Val&: N0)) { |
| 1963 | AM.BlockAddr = BA->getBlockAddress(); |
| 1964 | AM.SymbolFlags = BA->getTargetFlags(); |
| 1965 | Offset = BA->getOffset(); |
| 1966 | } else |
| 1967 |     llvm_unreachable("Unhandled symbol reference node.");
| 1968 | |
| 1969 | // Can't use an addressing mode with large globals. |
| 1970 | if (Subtarget->is64Bit() && !IsRIPRel && AM.GV && |
| 1971 | TM.isLargeGlobalValue(GV: AM.GV)) { |
| 1972 | AM = Backup; |
| 1973 | return true; |
| 1974 | } |
| 1975 | |
| 1976 | if (foldOffsetIntoAddress(Offset, AM)) { |
| 1977 | AM = Backup; |
| 1978 | return true; |
| 1979 | } |
| 1980 | |
| 1981 | if (IsRIPRel) |
| 1982 | AM.setBaseReg(CurDAG->getRegister(Reg: X86::RIP, VT: MVT::i64)); |
| 1983 | |
| 1984 | // Commit the changes now that we know this fold is safe. |
| 1985 | return false; |
| 1986 | } |
| 1987 | |
| 1988 | /// Add the specified node to the specified addressing mode, returning true if |
| 1989 | /// it cannot be done. This just pattern matches for the addressing mode. |
| 1990 | bool X86DAGToDAGISel::matchAddress(SDValue N, X86ISelAddressMode &AM) { |
| 1991 | if (matchAddressRecursively(N, AM, Depth: 0)) |
| 1992 | return true; |
| 1993 | |
| 1994 | // Post-processing: Make a second attempt to fold a load, if we now know |
| 1995 | // that there will not be any other register. This is only performed for |
| 1996 | // 64-bit ILP32 mode since 32-bit mode and 64-bit LP64 mode will have folded |
| 1997 | // any foldable load the first time. |
| 1998 | if (Subtarget->isTarget64BitILP32() && |
| 1999 | AM.BaseType == X86ISelAddressMode::RegBase && |
| 2000 | AM.Base_Reg.getNode() != nullptr && AM.IndexReg.getNode() == nullptr) { |
| 2001 | SDValue Save_Base_Reg = AM.Base_Reg; |
| 2002 | if (auto *LoadN = dyn_cast<LoadSDNode>(Val&: Save_Base_Reg)) { |
| 2003 | AM.Base_Reg = SDValue(); |
| 2004 | if (matchLoadInAddress(N: LoadN, AM, /*AllowSegmentRegForX32=*/true)) |
| 2005 | AM.Base_Reg = Save_Base_Reg; |
| 2006 | } |
| 2007 | } |
| 2008 | |
| 2009 | // Post-processing: Convert lea(,%reg,2) to lea(%reg,%reg), which has |
| 2010 | // a smaller encoding and avoids a scaled-index. |
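|      |   // (A scaled index with no base register must be encoded with a 32-bit
|      |   // displacement, so (%reg,%reg) is strictly smaller than (,%reg,2).)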
| 2011 | if (AM.Scale == 2 && |
| 2012 | AM.BaseType == X86ISelAddressMode::RegBase && |
| 2013 | AM.Base_Reg.getNode() == nullptr) { |
| 2014 | AM.Base_Reg = AM.IndexReg; |
| 2015 | AM.Scale = 1; |
| 2016 | } |
| 2017 | |
| 2018 | // Post-processing: Convert foo to foo(%rip), even in non-PIC mode, |
| 2019 | // because it has a smaller encoding. |
| 2020 | if (TM.getCodeModel() != CodeModel::Large && |
| 2021 | (!AM.GV || !TM.isLargeGlobalValue(GV: AM.GV)) && Subtarget->is64Bit() && |
| 2022 | AM.Scale == 1 && AM.BaseType == X86ISelAddressMode::RegBase && |
| 2023 | AM.Base_Reg.getNode() == nullptr && AM.IndexReg.getNode() == nullptr && |
| 2024 | AM.SymbolFlags == X86II::MO_NO_FLAG && AM.hasSymbolicDisplacement()) { |
| 2025 | // However, when GV is a local function symbol and in the same section as |
| 2026 | // the current instruction, and AM.Disp is negative and near INT32_MIN, |
| 2027 | // referencing GV+Disp generates a relocation referencing the section symbol |
| 2028 | // with an even smaller offset, which might underflow. We should bail out if |
| 2029 | // the negative offset is too close to INT32_MIN. Actually, we are more |
| 2030 | // conservative here, using a smaller magic number also used by |
| 2031 | // isOffsetSuitableForCodeModel. |
| 2032 | if (isa_and_nonnull<Function>(Val: AM.GV) && AM.Disp < -16 * 1024 * 1024) |
| 2033 | return true; |
| 2034 | |
| 2035 | AM.Base_Reg = CurDAG->getRegister(Reg: X86::RIP, VT: MVT::i64); |
| 2036 | } |
| 2037 | |
| 2038 | return false; |
| 2039 | } |
| 2040 | |
| 2041 | bool X86DAGToDAGISel::matchAdd(SDValue &N, X86ISelAddressMode &AM, |
| 2042 | unsigned Depth) { |
| 2043 | // Add an artificial use to this node so that we can keep track of |
| 2044 | // it if it gets CSE'd with a different node. |
| 2045 | HandleSDNode Handle(N); |
| 2046 | |
| 2047 | X86ISelAddressMode Backup = AM; |
| 2048 | if (!matchAddressRecursively(N: N.getOperand(i: 0), AM, Depth: Depth+1) && |
| 2049 | !matchAddressRecursively(N: Handle.getValue().getOperand(i: 1), AM, Depth: Depth+1)) |
| 2050 | return false; |
| 2051 | AM = Backup; |
| 2052 | |
| 2053 |   // Try again after commuting the operands.
| 2054 | if (!matchAddressRecursively(N: Handle.getValue().getOperand(i: 1), AM, |
| 2055 | Depth: Depth + 1) && |
| 2056 | !matchAddressRecursively(N: Handle.getValue().getOperand(i: 0), AM, Depth: Depth + 1)) |
| 2057 | return false; |
| 2058 | AM = Backup; |
| 2059 | |
| 2060 | // If we couldn't fold both operands into the address at the same time, |
| 2061 | // see if we can just put each operand into a register and fold at least |
| 2062 | // the add. |
| 2063 | if (AM.BaseType == X86ISelAddressMode::RegBase && |
| 2064 | !AM.Base_Reg.getNode() && |
| 2065 | !AM.IndexReg.getNode()) { |
| 2066 | N = Handle.getValue(); |
| 2067 | AM.Base_Reg = N.getOperand(i: 0); |
| 2068 | AM.IndexReg = N.getOperand(i: 1); |
| 2069 | AM.Scale = 1; |
| 2070 | return false; |
| 2071 | } |
| 2072 | N = Handle.getValue(); |
| 2073 | return true; |
| 2074 | } |
| 2075 | |
| 2076 | // Insert a node into the DAG at least before the Pos node's position. This |
| 2077 | // will reposition the node as needed, and will assign it a node ID that is <= |
| 2078 | // the Pos node's ID. Note that this does *not* preserve the uniqueness of node |
| 2079 | // IDs! The selection DAG must no longer depend on their uniqueness when this |
| 2080 | // is used. |
| 2081 | static void insertDAGNode(SelectionDAG &DAG, SDValue Pos, SDValue N) { |
| 2082 | if (N->getNodeId() == -1 || |
| 2083 | (SelectionDAGISel::getUninvalidatedNodeId(N: N.getNode()) > |
| 2084 | SelectionDAGISel::getUninvalidatedNodeId(N: Pos.getNode()))) { |
| 2085 | DAG.RepositionNode(Position: Pos->getIterator(), N: N.getNode()); |
| 2086 | // Mark Node as invalid for pruning as after this it may be a successor to a |
| 2087 |     // selected node but otherwise be in the same position as Pos.
| 2088 |     // Conservatively mark it with the same -abs(Id) to ensure the node id
| 2089 |     // invariant is preserved.
| 2090 | N->setNodeId(Pos->getNodeId()); |
| 2091 | SelectionDAGISel::InvalidateNodeId(N: N.getNode()); |
| 2092 | } |
| 2093 | } |
| 2094 | |
| 2095 | // Transform "(X >> (8-C1)) & (0xff << C1)" to "((X >> 8) & 0xff) << C1" if |
| 2096 | // safe. This allows us to convert the shift and AND into an h-register
| 2097 | // extract and a scaled index. Returns false if the simplification is |
| 2098 | // performed. |
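| | // For example, with C1 == 2 this turns "(x >> 6) & 0x3fc" into
| | // "((x >> 8) & 0xff) << 2": the srl+and is an h-register copy and the shl
| | // becomes a scale-by-4 index.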
| 2099 | static bool foldMaskAndShiftToExtract(SelectionDAG &DAG, SDValue N, |
| 2100 | uint64_t Mask, |
| 2101 | SDValue Shift, SDValue X, |
| 2102 | X86ISelAddressMode &AM) { |
| 2103 | if (Shift.getOpcode() != ISD::SRL || |
| 2104 | !isa<ConstantSDNode>(Val: Shift.getOperand(i: 1)) || |
| 2105 | !Shift.hasOneUse()) |
| 2106 | return true; |
| 2107 | |
| 2108 | int ScaleLog = 8 - Shift.getConstantOperandVal(i: 1); |
| 2109 | if (ScaleLog <= 0 || ScaleLog >= 4 || |
| 2110 | Mask != (0xffu << ScaleLog)) |
| 2111 | return true; |
| 2112 | |
| 2113 | MVT XVT = X.getSimpleValueType(); |
| 2114 | MVT VT = N.getSimpleValueType(); |
| 2115 | SDLoc DL(N); |
| 2116 | SDValue Eight = DAG.getConstant(Val: 8, DL, VT: MVT::i8); |
| 2117 | SDValue NewMask = DAG.getConstant(Val: 0xff, DL, VT: XVT); |
| 2118 | SDValue Srl = DAG.getNode(Opcode: ISD::SRL, DL, VT: XVT, N1: X, N2: Eight); |
| 2119 | SDValue And = DAG.getNode(Opcode: ISD::AND, DL, VT: XVT, N1: Srl, N2: NewMask); |
| 2120 | SDValue Ext = DAG.getZExtOrTrunc(Op: And, DL, VT); |
| 2121 | SDValue ShlCount = DAG.getConstant(Val: ScaleLog, DL, VT: MVT::i8); |
| 2122 | SDValue Shl = DAG.getNode(Opcode: ISD::SHL, DL, VT, N1: Ext, N2: ShlCount); |
| 2123 | |
| 2124 | // Insert the new nodes into the topological ordering. We must do this in |
| 2125 | // a valid topological ordering as nothing is going to go back and re-sort |
| 2126 | // these nodes. We continually insert before 'N' in sequence as this is |
| 2127 | // essentially a pre-flattened and pre-sorted sequence of nodes. There is no |
| 2128 | // hierarchy left to express. |
| 2129 | insertDAGNode(DAG, Pos: N, N: Eight); |
| 2130 | insertDAGNode(DAG, Pos: N, N: NewMask); |
| 2131 | insertDAGNode(DAG, Pos: N, N: Srl); |
| 2132 | insertDAGNode(DAG, Pos: N, N: And); |
| 2133 | insertDAGNode(DAG, Pos: N, N: Ext); |
| 2134 | insertDAGNode(DAG, Pos: N, N: ShlCount); |
| 2135 | insertDAGNode(DAG, Pos: N, N: Shl); |
| 2136 | DAG.ReplaceAllUsesWith(From: N, To: Shl); |
| 2137 | DAG.RemoveDeadNode(N: N.getNode()); |
| 2138 | AM.IndexReg = Ext; |
| 2139 | AM.Scale = (1 << ScaleLog); |
| 2140 | return false; |
| 2141 | } |
| 2142 | |
| 2143 | // Transforms "(X << C1) & C2" to "(X & (C2>>C1)) << C1" if safe and if this |
| 2144 | // allows us to fold the shift into this addressing mode. Returns false if the |
| 2145 | // transform succeeded. |
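| | // For example, "(x << 2) & 0x3fc" becomes "(x & 0xff) << 2", letting the
| | // shl fold into a scale-by-4 index.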
| 2146 | static bool foldMaskedShiftToScaledMask(SelectionDAG &DAG, SDValue N, |
| 2147 | X86ISelAddressMode &AM) { |
| 2148 | SDValue Shift = N.getOperand(i: 0); |
| 2149 | |
| 2150 | // Use a signed mask so that shifting right will insert sign bits. These |
| 2151 | // bits will be removed when we shift the result left so it doesn't matter |
| 2152 | // what we use. This might allow a smaller immediate encoding. |
| 2153 | int64_t Mask = cast<ConstantSDNode>(Val: N->getOperand(Num: 1))->getSExtValue(); |
| 2154 | |
| 2155 | // If we have an any_extend feeding the AND, look through it to see if there |
| 2156 | // is a shift behind it. But only if the AND doesn't use the extended bits. |
| 2157 |   // FIXME: Generalize this to ANY_EXTENDs other than i32 to i64?
| 2158 | bool FoundAnyExtend = false; |
| 2159 | if (Shift.getOpcode() == ISD::ANY_EXTEND && Shift.hasOneUse() && |
| 2160 | Shift.getOperand(i: 0).getSimpleValueType() == MVT::i32 && |
| 2161 | isUInt<32>(x: Mask)) { |
| 2162 | FoundAnyExtend = true; |
| 2163 | Shift = Shift.getOperand(i: 0); |
| 2164 | } |
| 2165 | |
| 2166 | if (Shift.getOpcode() != ISD::SHL || |
| 2167 | !isa<ConstantSDNode>(Val: Shift.getOperand(i: 1))) |
| 2168 | return true; |
| 2169 | |
| 2170 | SDValue X = Shift.getOperand(i: 0); |
| 2171 | |
| 2172 | // Not likely to be profitable if either the AND or SHIFT node has more |
| 2173 | // than one use (unless all uses are for address computation). Besides, |
| 2174 |   // the isel mechanism requires their node ids to be reused.
| 2175 | if (!N.hasOneUse() || !Shift.hasOneUse()) |
| 2176 | return true; |
| 2177 | |
| 2178 | // Verify that the shift amount is something we can fold. |
| 2179 | unsigned ShiftAmt = Shift.getConstantOperandVal(i: 1); |
| 2180 | if (ShiftAmt != 1 && ShiftAmt != 2 && ShiftAmt != 3) |
| 2181 | return true; |
| 2182 | |
| 2183 | MVT VT = N.getSimpleValueType(); |
| 2184 | SDLoc DL(N); |
| 2185 | if (FoundAnyExtend) { |
| 2186 | SDValue NewX = DAG.getNode(Opcode: ISD::ANY_EXTEND, DL, VT, Operand: X); |
| 2187 | insertDAGNode(DAG, Pos: N, N: NewX); |
| 2188 | X = NewX; |
| 2189 | } |
| 2190 | |
| 2191 | SDValue NewMask = DAG.getSignedConstant(Val: Mask >> ShiftAmt, DL, VT); |
| 2192 | SDValue NewAnd = DAG.getNode(Opcode: ISD::AND, DL, VT, N1: X, N2: NewMask); |
| 2193 | SDValue NewShift = DAG.getNode(Opcode: ISD::SHL, DL, VT, N1: NewAnd, N2: Shift.getOperand(i: 1)); |
| 2194 | |
| 2195 | // Insert the new nodes into the topological ordering. We must do this in |
| 2196 | // a valid topological ordering as nothing is going to go back and re-sort |
| 2197 | // these nodes. We continually insert before 'N' in sequence as this is |
| 2198 | // essentially a pre-flattened and pre-sorted sequence of nodes. There is no |
| 2199 | // hierarchy left to express. |
| 2200 | insertDAGNode(DAG, Pos: N, N: NewMask); |
| 2201 | insertDAGNode(DAG, Pos: N, N: NewAnd); |
| 2202 | insertDAGNode(DAG, Pos: N, N: NewShift); |
| 2203 | DAG.ReplaceAllUsesWith(From: N, To: NewShift); |
| 2204 | DAG.RemoveDeadNode(N: N.getNode()); |
| 2205 | |
| 2206 | AM.Scale = 1 << ShiftAmt; |
| 2207 | AM.IndexReg = NewAnd; |
| 2208 | return false; |
| 2209 | } |
| 2210 | |
| 2211 | // Implement some heroics to detect shifts of masked values where the mask can |
| 2212 | // be replaced by extending the shift and undoing that in the addressing mode |
| 2213 | // scale. Patterns such as (shl (srl x, c1), c2) are canonicalized into (and |
| 2214 | // (srl x, SHIFT), MASK) by DAGCombines that don't know the shl can be done in |
| 2215 | // the addressing mode. This results in code such as: |
| 2216 | // |
| 2217 | // int f(short *y, int *lookup_table) { |
| 2218 | // ... |
| 2219 | // return *y + lookup_table[*y >> 11]; |
| 2220 | // } |
| 2221 | // |
| 2222 | // Turning into: |
| 2223 | // movzwl (%rdi), %eax |
| 2224 | // movl %eax, %ecx |
| 2225 | // shrl $11, %ecx |
| 2226 | // addl (%rsi,%rcx,4), %eax |
| 2227 | // |
| 2228 | // Instead of: |
| 2229 | // movzwl (%rdi), %eax |
| 2230 | // movl %eax, %ecx |
| 2231 | // shrl $9, %ecx |
| 2232 | //   andl $124, %ecx
| 2233 | // addl (%rsi,%rcx), %eax |
| 2234 | // |
| 2235 | // Note that this function assumes the mask is provided as a mask *after* the |
| 2236 | // value is shifted. The input chain may or may not match that, but computing |
| 2237 | // such a mask is trivial. |
| 2238 | static bool foldMaskAndShiftToScale(SelectionDAG &DAG, SDValue N, |
| 2239 | uint64_t Mask, |
| 2240 | SDValue Shift, SDValue X, |
| 2241 | X86ISelAddressMode &AM) { |
| 2242 | if (Shift.getOpcode() != ISD::SRL || !Shift.hasOneUse() || |
| 2243 | !isa<ConstantSDNode>(Val: Shift.getOperand(i: 1))) |
| 2244 | return true; |
| 2245 | |
| 2246 |   // We need to ensure that the mask is a contiguous run of bits.
| 2247 | unsigned MaskIdx, MaskLen; |
| 2248 | if (!isShiftedMask_64(Value: Mask, MaskIdx, MaskLen)) |
| 2249 | return true; |
| 2250 | unsigned MaskLZ = 64 - (MaskIdx + MaskLen); |
| 2251 | |
| 2252 | unsigned ShiftAmt = Shift.getConstantOperandVal(i: 1); |
| 2253 | |
| 2254 | // The amount of shift we're trying to fit into the addressing mode is taken |
| 2255 | // from the shifted mask index (number of trailing zeros of the mask). |
| 2256 | unsigned AMShiftAmt = MaskIdx; |
| 2257 | |
| 2258 | // There is nothing we can do here unless the mask is removing some bits. |
| 2259 | // Also, the addressing mode can only represent shifts of 1, 2, or 3 bits. |
| 2260 | if (AMShiftAmt == 0 || AMShiftAmt > 3) return true; |
| 2261 | |
| 2262 | // Scale the leading zero count down based on the actual size of the value. |
| 2263 | // Also scale it down based on the size of the shift. |
| 2264 | unsigned ScaleDown = (64 - X.getSimpleValueType().getSizeInBits()) + ShiftAmt; |
| 2265 | if (MaskLZ < ScaleDown) |
| 2266 | return true; |
| 2267 | MaskLZ -= ScaleDown; |
| 2268 | |
| 2269 | // The final check is to ensure that any masked out high bits of X are |
| 2270 | // already known to be zero. Otherwise, the mask has a semantic impact |
| 2271 | // other than masking out a couple of low bits. Unfortunately, because of |
| 2272 | // the mask, zero extensions will be removed from operands in some cases. |
| 2273 | // This code works extra hard to look through extensions because we can |
| 2274 | // replace them with zero extensions cheaply if necessary. |
| 2275 | bool ReplacingAnyExtend = false; |
| 2276 | if (X.getOpcode() == ISD::ANY_EXTEND) { |
| 2277 | unsigned ExtendBits = X.getSimpleValueType().getSizeInBits() - |
| 2278 | X.getOperand(i: 0).getSimpleValueType().getSizeInBits(); |
| 2279 | // Assume that we'll replace the any-extend with a zero-extend, and |
| 2280 | // narrow the search to the extended value. |
| 2281 | X = X.getOperand(i: 0); |
| 2282 | MaskLZ = ExtendBits > MaskLZ ? 0 : MaskLZ - ExtendBits; |
| 2283 | ReplacingAnyExtend = true; |
| 2284 | } |
| 2285 | APInt MaskedHighBits = |
| 2286 | APInt::getHighBitsSet(numBits: X.getSimpleValueType().getSizeInBits(), hiBitsSet: MaskLZ); |
| 2287 | if (!DAG.MaskedValueIsZero(Op: X, Mask: MaskedHighBits)) |
| 2288 | return true; |
| 2289 | |
| 2290 | // We've identified a pattern that can be transformed into a single shift |
| 2291 | // and an addressing mode. Make it so. |
| 2292 | MVT VT = N.getSimpleValueType(); |
| 2293 | if (ReplacingAnyExtend) { |
| 2294 | assert(X.getValueType() != VT); |
| 2295 | // We looked through an ANY_EXTEND node, insert a ZERO_EXTEND. |
| 2296 | SDValue NewX = DAG.getNode(Opcode: ISD::ZERO_EXTEND, DL: SDLoc(X), VT, Operand: X); |
| 2297 | insertDAGNode(DAG, Pos: N, N: NewX); |
| 2298 | X = NewX; |
| 2299 | } |
| 2300 | |
| 2301 | MVT XVT = X.getSimpleValueType(); |
| 2302 | SDLoc DL(N); |
| 2303 | SDValue NewSRLAmt = DAG.getConstant(Val: ShiftAmt + AMShiftAmt, DL, VT: MVT::i8); |
| 2304 | SDValue NewSRL = DAG.getNode(Opcode: ISD::SRL, DL, VT: XVT, N1: X, N2: NewSRLAmt); |
| 2305 | SDValue NewExt = DAG.getZExtOrTrunc(Op: NewSRL, DL, VT); |
| 2306 | SDValue NewSHLAmt = DAG.getConstant(Val: AMShiftAmt, DL, VT: MVT::i8); |
| 2307 | SDValue NewSHL = DAG.getNode(Opcode: ISD::SHL, DL, VT, N1: NewExt, N2: NewSHLAmt); |
| 2308 | |
| 2309 | // Insert the new nodes into the topological ordering. We must do this in |
| 2310 | // a valid topological ordering as nothing is going to go back and re-sort |
| 2311 | // these nodes. We continually insert before 'N' in sequence as this is |
| 2312 | // essentially a pre-flattened and pre-sorted sequence of nodes. There is no |
| 2313 | // hierarchy left to express. |
| 2314 | insertDAGNode(DAG, Pos: N, N: NewSRLAmt); |
| 2315 | insertDAGNode(DAG, Pos: N, N: NewSRL); |
| 2316 | insertDAGNode(DAG, Pos: N, N: NewExt); |
| 2317 | insertDAGNode(DAG, Pos: N, N: NewSHLAmt); |
| 2318 | insertDAGNode(DAG, Pos: N, N: NewSHL); |
| 2319 | DAG.ReplaceAllUsesWith(From: N, To: NewSHL); |
| 2320 | DAG.RemoveDeadNode(N: N.getNode()); |
| 2321 | |
| 2322 | AM.Scale = 1 << AMShiftAmt; |
| 2323 | AM.IndexReg = NewExt; |
| 2324 | return false; |
| 2325 | } |
| 2326 | |
| 2327 | // Transform "(X >> SHIFT) & (MASK << C1)" to |
| 2328 | // "((X >> (SHIFT + C1)) & (MASK)) << C1". Everything before the SHL will be |
| 2329 | // matched to a BEXTR later. Returns false if the simplification is performed. |
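| | // For example, with SHIFT == 5, C1 == 2 and MASK == 0x7f, this turns
| | // "(x >> 5) & 0x1fc" into "((x >> 7) & 0x7f) << 2"; the srl+and matches
| | // BEXTR and the shl folds into the scale.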
| 2330 | static bool foldMaskedShiftToBEXTR(SelectionDAG &DAG, SDValue N, |
| 2331 | uint64_t Mask, |
| 2332 | SDValue Shift, SDValue X, |
| 2333 | X86ISelAddressMode &AM, |
| 2334 | const X86Subtarget &Subtarget) { |
| 2335 | if (Shift.getOpcode() != ISD::SRL || |
| 2336 | !isa<ConstantSDNode>(Val: Shift.getOperand(i: 1)) || |
| 2337 | !Shift.hasOneUse() || !N.hasOneUse()) |
| 2338 | return true; |
| 2339 | |
| 2340 | // Only do this if BEXTR will be matched by matchBEXTRFromAndImm. |
| 2341 | if (!Subtarget.hasTBM() && |
| 2342 | !(Subtarget.hasBMI() && Subtarget.hasFastBEXTR())) |
| 2343 | return true; |
| 2344 | |
| 2345 |   // We need to ensure that the mask is a contiguous run of bits.
| 2346 | unsigned MaskIdx, MaskLen; |
| 2347 | if (!isShiftedMask_64(Value: Mask, MaskIdx, MaskLen)) |
| 2348 | return true; |
| 2349 | |
| 2350 | unsigned ShiftAmt = Shift.getConstantOperandVal(i: 1); |
| 2351 | |
| 2352 | // The amount of shift we're trying to fit into the addressing mode is taken |
| 2353 | // from the shifted mask index (number of trailing zeros of the mask). |
| 2354 | unsigned AMShiftAmt = MaskIdx; |
| 2355 | |
| 2356 | // There is nothing we can do here unless the mask is removing some bits. |
| 2357 | // Also, the addressing mode can only represent shifts of 1, 2, or 3 bits. |
| 2358 | if (AMShiftAmt == 0 || AMShiftAmt > 3) return true; |
| 2359 | |
| 2360 | MVT XVT = X.getSimpleValueType(); |
| 2361 | MVT VT = N.getSimpleValueType(); |
| 2362 | SDLoc DL(N); |
| 2363 | SDValue NewSRLAmt = DAG.getConstant(Val: ShiftAmt + AMShiftAmt, DL, VT: MVT::i8); |
| 2364 | SDValue NewSRL = DAG.getNode(Opcode: ISD::SRL, DL, VT: XVT, N1: X, N2: NewSRLAmt); |
| 2365 | SDValue NewMask = DAG.getConstant(Val: Mask >> AMShiftAmt, DL, VT: XVT); |
| 2366 | SDValue NewAnd = DAG.getNode(Opcode: ISD::AND, DL, VT: XVT, N1: NewSRL, N2: NewMask); |
| 2367 | SDValue NewExt = DAG.getZExtOrTrunc(Op: NewAnd, DL, VT); |
| 2368 | SDValue NewSHLAmt = DAG.getConstant(Val: AMShiftAmt, DL, VT: MVT::i8); |
| 2369 | SDValue NewSHL = DAG.getNode(Opcode: ISD::SHL, DL, VT, N1: NewExt, N2: NewSHLAmt); |
| 2370 | |
| 2371 | // Insert the new nodes into the topological ordering. We must do this in |
| 2372 | // a valid topological ordering as nothing is going to go back and re-sort |
| 2373 | // these nodes. We continually insert before 'N' in sequence as this is |
| 2374 | // essentially a pre-flattened and pre-sorted sequence of nodes. There is no |
| 2375 | // hierarchy left to express. |
| 2376 | insertDAGNode(DAG, Pos: N, N: NewSRLAmt); |
| 2377 | insertDAGNode(DAG, Pos: N, N: NewSRL); |
| 2378 | insertDAGNode(DAG, Pos: N, N: NewMask); |
| 2379 | insertDAGNode(DAG, Pos: N, N: NewAnd); |
| 2380 | insertDAGNode(DAG, Pos: N, N: NewExt); |
| 2381 | insertDAGNode(DAG, Pos: N, N: NewSHLAmt); |
| 2382 | insertDAGNode(DAG, Pos: N, N: NewSHL); |
| 2383 | DAG.ReplaceAllUsesWith(From: N, To: NewSHL); |
| 2384 | DAG.RemoveDeadNode(N: N.getNode()); |
| 2385 | |
| 2386 | AM.Scale = 1 << AMShiftAmt; |
| 2387 | AM.IndexReg = NewExt; |
| 2388 | return false; |
| 2389 | } |
| 2390 | |
| 2391 | // Attempt to peek further into a scaled index register, collecting additional |
| 2392 | // extensions / offsets / etc. Returns \p N if we can't peek any further.
| 2393 | SDValue X86DAGToDAGISel::matchIndexRecursively(SDValue N, |
| 2394 | X86ISelAddressMode &AM, |
| 2395 | unsigned Depth) { |
| 2396 |   assert(AM.IndexReg.getNode() == nullptr && "IndexReg already matched");
| 2397 | assert((AM.Scale == 1 || AM.Scale == 2 || AM.Scale == 4 || AM.Scale == 8) && |
| 2398 |          "Illegal index scale");
| 2399 | |
| 2400 | // Limit recursion. |
| 2401 | if (Depth >= SelectionDAG::MaxRecursionDepth) |
| 2402 | return N; |
| 2403 | |
| 2404 | EVT VT = N.getValueType(); |
| 2405 | unsigned Opc = N.getOpcode(); |
| 2406 | |
| 2407 | // index: add(x,c) -> index: x, disp + c |
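|      |   // The index register is multiplied by the scale, so the displacement being
|      |   // folded out of it must be scaled too: (x + c)*Scale == x*Scale + c*Scale.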
| 2408 | if (CurDAG->isBaseWithConstantOffset(Op: N)) { |
| 2409 | auto *AddVal = cast<ConstantSDNode>(Val: N.getOperand(i: 1)); |
| 2410 | uint64_t Offset = (uint64_t)AddVal->getSExtValue() * AM.Scale; |
| 2411 | if (!foldOffsetIntoAddress(Offset, AM)) |
| 2412 | return matchIndexRecursively(N: N.getOperand(i: 0), AM, Depth: Depth + 1); |
| 2413 | } |
| 2414 | |
| 2415 | // index: add(x,x) -> index: x, scale * 2 |
| 2416 | if (Opc == ISD::ADD && N.getOperand(i: 0) == N.getOperand(i: 1)) { |
| 2417 | if (AM.Scale <= 4) { |
| 2418 | AM.Scale *= 2; |
| 2419 | return matchIndexRecursively(N: N.getOperand(i: 0), AM, Depth: Depth + 1); |
| 2420 | } |
| 2421 | } |
| 2422 | |
| 2423 | // index: shl(x,i) -> index: x, scale * (1 << i) |
| 2424 | if (Opc == X86ISD::VSHLI) { |
| 2425 | uint64_t ShiftAmt = N.getConstantOperandVal(i: 1); |
| 2426 | uint64_t ScaleAmt = 1ULL << ShiftAmt; |
| 2427 | if ((AM.Scale * ScaleAmt) <= 8) { |
| 2428 | AM.Scale *= ScaleAmt; |
| 2429 | return matchIndexRecursively(N: N.getOperand(i: 0), AM, Depth: Depth + 1); |
| 2430 | } |
| 2431 | } |
| 2432 | |
| 2433 | // index: sext(add_nsw(x,c)) -> index: sext(x), disp + sext(c) |
| 2434 | // TODO: call matchIndexRecursively(AddSrc) if we won't corrupt sext? |
| 2435 | if (Opc == ISD::SIGN_EXTEND && !VT.isVector() && N.hasOneUse()) { |
| 2436 | SDValue Src = N.getOperand(i: 0); |
| 2437 | if (Src.getOpcode() == ISD::ADD && Src->getFlags().hasNoSignedWrap() && |
| 2438 | Src.hasOneUse()) { |
| 2439 | if (CurDAG->isBaseWithConstantOffset(Op: Src)) { |
| 2440 | SDValue AddSrc = Src.getOperand(i: 0); |
| 2441 | auto *AddVal = cast<ConstantSDNode>(Val: Src.getOperand(i: 1)); |
| 2442 | int64_t Offset = AddVal->getSExtValue(); |
| 2443 | if (!foldOffsetIntoAddress(Offset: (uint64_t)Offset * AM.Scale, AM)) { |
| 2444 | SDLoc DL(N); |
| 2445 | SDValue ExtSrc = CurDAG->getNode(Opcode: Opc, DL, VT, Operand: AddSrc); |
| 2446 | SDValue ExtVal = CurDAG->getSignedConstant(Val: Offset, DL, VT); |
| 2447 | SDValue ExtAdd = CurDAG->getNode(Opcode: ISD::ADD, DL, VT, N1: ExtSrc, N2: ExtVal); |
| 2448 | insertDAGNode(DAG&: *CurDAG, Pos: N, N: ExtSrc); |
| 2449 | insertDAGNode(DAG&: *CurDAG, Pos: N, N: ExtVal); |
| 2450 | insertDAGNode(DAG&: *CurDAG, Pos: N, N: ExtAdd); |
| 2451 | CurDAG->ReplaceAllUsesWith(From: N, To: ExtAdd); |
| 2452 | CurDAG->RemoveDeadNode(N: N.getNode()); |
| 2453 | return ExtSrc; |
| 2454 | } |
| 2455 | } |
| 2456 | } |
| 2457 | } |
| 2458 | |
| 2459 | // index: zext(add_nuw(x,c)) -> index: zext(x), disp + zext(c) |
| 2460 | // index: zext(addlike(x,c)) -> index: zext(x), disp + zext(c) |
| 2461 |   // TODO: call matchIndexRecursively(AddSrc) if we won't corrupt the zext?
| 2462 | if (Opc == ISD::ZERO_EXTEND && !VT.isVector() && N.hasOneUse()) { |
| 2463 | SDValue Src = N.getOperand(i: 0); |
| 2464 | unsigned SrcOpc = Src.getOpcode(); |
| 2465 | if (((SrcOpc == ISD::ADD && Src->getFlags().hasNoUnsignedWrap()) || |
| 2466 | CurDAG->isADDLike(Op: Src, /*NoWrap=*/true)) && |
| 2467 | Src.hasOneUse()) { |
| 2468 | if (CurDAG->isBaseWithConstantOffset(Op: Src)) { |
| 2469 | SDValue AddSrc = Src.getOperand(i: 0); |
| 2470 | uint64_t Offset = Src.getConstantOperandVal(i: 1); |
| 2471 | if (!foldOffsetIntoAddress(Offset: Offset * AM.Scale, AM)) { |
| 2472 | SDLoc DL(N); |
| 2473 | SDValue Res; |
| 2474 | // If we're also scaling, see if we can use that as well. |
| 2475 | if (AddSrc.getOpcode() == ISD::SHL && |
| 2476 | isa<ConstantSDNode>(Val: AddSrc.getOperand(i: 1))) { |
| 2477 | SDValue ShVal = AddSrc.getOperand(i: 0); |
| 2478 | uint64_t ShAmt = AddSrc.getConstantOperandVal(i: 1); |
| 2479 | APInt HiBits = |
| 2480 | APInt::getHighBitsSet(numBits: AddSrc.getScalarValueSizeInBits(), hiBitsSet: ShAmt); |
| 2481 | uint64_t ScaleAmt = 1ULL << ShAmt; |
| 2482 | if ((AM.Scale * ScaleAmt) <= 8 && |
| 2483 | (AddSrc->getFlags().hasNoUnsignedWrap() || |
| 2484 | CurDAG->MaskedValueIsZero(Op: ShVal, Mask: HiBits))) { |
| 2485 | AM.Scale *= ScaleAmt; |
| 2486 | SDValue ExtShVal = CurDAG->getNode(Opcode: Opc, DL, VT, Operand: ShVal); |
| 2487 | SDValue ExtShift = CurDAG->getNode(Opcode: ISD::SHL, DL, VT, N1: ExtShVal, |
| 2488 | N2: AddSrc.getOperand(i: 1)); |
| 2489 | insertDAGNode(DAG&: *CurDAG, Pos: N, N: ExtShVal); |
| 2490 | insertDAGNode(DAG&: *CurDAG, Pos: N, N: ExtShift); |
| 2491 | AddSrc = ExtShift; |
| 2492 | Res = ExtShVal; |
| 2493 | } |
| 2494 | } |
| 2495 | SDValue ExtSrc = CurDAG->getNode(Opcode: Opc, DL, VT, Operand: AddSrc); |
| 2496 | SDValue ExtVal = CurDAG->getConstant(Val: Offset, DL, VT); |
| 2497 | SDValue ExtAdd = CurDAG->getNode(Opcode: SrcOpc, DL, VT, N1: ExtSrc, N2: ExtVal); |
| 2498 | insertDAGNode(DAG&: *CurDAG, Pos: N, N: ExtSrc); |
| 2499 | insertDAGNode(DAG&: *CurDAG, Pos: N, N: ExtVal); |
| 2500 | insertDAGNode(DAG&: *CurDAG, Pos: N, N: ExtAdd); |
| 2501 | CurDAG->ReplaceAllUsesWith(From: N, To: ExtAdd); |
| 2502 | CurDAG->RemoveDeadNode(N: N.getNode()); |
| 2503 | return Res ? Res : ExtSrc; |
| 2504 | } |
| 2505 | } |
| 2506 | } |
| 2507 | } |
| 2508 | |
| 2509 | // TODO: Handle extensions, shifted masks etc. |
| 2510 | return N; |
| 2511 | } |
| 2512 | |
| 2513 | bool X86DAGToDAGISel::matchAddressRecursively(SDValue N, X86ISelAddressMode &AM, |
| 2514 | unsigned Depth) { |
| 2515 | LLVM_DEBUG({ |
| 2516 |     dbgs() << "MatchAddress: ";
| 2517 | AM.dump(CurDAG); |
| 2518 | }); |
| 2519 | // Limit recursion. |
| 2520 | if (Depth >= SelectionDAG::MaxRecursionDepth) |
| 2521 | return matchAddressBase(N, AM); |
| 2522 | |
| 2523 | // If this is already a %rip relative address, we can only merge immediates |
| 2524 | // into it. Instead of handling this in every case, we handle it here. |
| 2525 | // RIP relative addressing: %rip + 32-bit displacement! |
| 2526 | if (AM.isRIPRelative()) { |
| 2527 |     // FIXME: JumpTable and ExternalSymbol addresses currently don't like
| 2528 | // displacements. It isn't very important, but this should be fixed for |
| 2529 | // consistency. |
| 2530 | if (!(AM.ES || AM.MCSym) && AM.JT != -1) |
| 2531 | return true; |
| 2532 | |
| 2533 | if (auto *Cst = dyn_cast<ConstantSDNode>(Val&: N)) |
| 2534 | if (!foldOffsetIntoAddress(Offset: Cst->getSExtValue(), AM)) |
| 2535 | return false; |
| 2536 | return true; |
| 2537 | } |
| 2538 | |
| 2539 | switch (N.getOpcode()) { |
| 2540 | default: break; |
| 2541 | case ISD::LOCAL_RECOVER: { |
| 2542 | if (!AM.hasSymbolicDisplacement() && AM.Disp == 0) |
| 2543 | if (const auto *ESNode = dyn_cast<MCSymbolSDNode>(Val: N.getOperand(i: 0))) { |
| 2544 | // Use the symbol and don't prefix it. |
| 2545 | AM.MCSym = ESNode->getMCSymbol(); |
| 2546 | return false; |
| 2547 | } |
| 2548 | break; |
| 2549 | } |
| 2550 | case ISD::Constant: { |
| 2551 | uint64_t Val = cast<ConstantSDNode>(Val&: N)->getSExtValue(); |
| 2552 | if (!foldOffsetIntoAddress(Offset: Val, AM)) |
| 2553 | return false; |
| 2554 | break; |
| 2555 | } |
| 2556 | |
| 2557 | case X86ISD::Wrapper: |
| 2558 | case X86ISD::WrapperRIP: |
| 2559 | if (!matchWrapper(N, AM)) |
| 2560 | return false; |
| 2561 | break; |
| 2562 | |
| 2563 | case ISD::LOAD: |
| 2564 | if (!matchLoadInAddress(N: cast<LoadSDNode>(Val&: N), AM)) |
| 2565 | return false; |
| 2566 | break; |
| 2567 | |
| 2568 | case ISD::FrameIndex: |
| 2569 | if (AM.BaseType == X86ISelAddressMode::RegBase && |
| 2570 | AM.Base_Reg.getNode() == nullptr && |
| 2571 | (!Subtarget->is64Bit() || isDispSafeForFrameIndexOrRegBase(Val: AM.Disp))) { |
| 2572 | AM.BaseType = X86ISelAddressMode::FrameIndexBase; |
| 2573 | AM.Base_FrameIndex = cast<FrameIndexSDNode>(Val&: N)->getIndex(); |
| 2574 | return false; |
| 2575 | } |
| 2576 | break; |
| 2577 | |
| 2578 | case ISD::SHL: |
| 2579 | if (AM.IndexReg.getNode() != nullptr || AM.Scale != 1) |
| 2580 | break; |
| 2581 | |
| 2582 | if (auto *CN = dyn_cast<ConstantSDNode>(Val: N.getOperand(i: 1))) { |
| 2583 | unsigned Val = CN->getZExtValue(); |
| 2584 | // Note that we handle x<<1 as (,x,2) rather than (x,x) here so |
| 2585 | // that the base operand remains free for further matching. If |
| 2586 | // the base doesn't end up getting used, a post-processing step |
| 2587 | // in MatchAddress turns (,x,2) into (x,x), which is cheaper. |
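// E.g. (shl %x, 3) is matched here as (,%x,8), keeping the base register
// free for anything else feeding the address.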
| 2588 | if (Val == 1 || Val == 2 || Val == 3) { |
| 2589 | SDValue ShVal = N.getOperand(i: 0); |
| 2590 | AM.Scale = 1 << Val; |
| 2591 | AM.IndexReg = matchIndexRecursively(N: ShVal, AM, Depth: Depth + 1); |
| 2592 | return false; |
| 2593 | } |
| 2594 | } |
| 2595 | break; |
| 2596 | |
| 2597 | case ISD::SRL: { |
| 2598 | // Scale must not be used already. |
| 2599 | if (AM.IndexReg.getNode() != nullptr || AM.Scale != 1) break; |
| 2600 | |
| 2601 | // We only handle up to 64-bit values here as those are what matter for |
| 2602 | // addressing mode optimizations. |
| 2603 | assert(N.getSimpleValueType().getSizeInBits() <= 64 && |
| 2604 | "Unexpected value size!" ); |
| 2605 | |
| 2606 | SDValue And = N.getOperand(i: 0); |
| 2607 | if (And.getOpcode() != ISD::AND) break; |
| 2608 | SDValue X = And.getOperand(i: 0); |
| 2609 | |
| 2610 | // The mask used for the transform is expected to be post-shift, but we |
| 2611 | // found the shift first so just apply the shift to the mask before passing |
| 2612 | // it down. |
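// E.g. for (srl (and %x, 0xF0), 4) the post-shift mask passed down is
// 0xF0 >> 4 == 0xF.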
| 2613 | if (!isa<ConstantSDNode>(Val: N.getOperand(i: 1)) || |
| 2614 | !isa<ConstantSDNode>(Val: And.getOperand(i: 1))) |
| 2615 | break; |
| 2616 | uint64_t Mask = And.getConstantOperandVal(i: 1) >> N.getConstantOperandVal(i: 1); |
| 2617 | |
| 2618 | // Try to fold the mask and shift into the scale, and return false if we |
| 2619 | // succeed. |
| 2620 | if (!foldMaskAndShiftToScale(DAG&: *CurDAG, N, Mask, Shift: N, X, AM)) |
| 2621 | return false; |
| 2622 | break; |
| 2623 | } |
| 2624 | |
| 2625 | case ISD::SMUL_LOHI: |
| 2626 | case ISD::UMUL_LOHI: |
| 2627 | // A mul_lohi where we need the low part can be folded as a plain multiply. |
| 2628 | if (N.getResNo() != 0) break; |
| 2629 | [[fallthrough]]; |
| 2630 | case ISD::MUL: |
| 2631 | case X86ISD::MUL_IMM: |
| 2632 | // X*[3,5,9] -> X+X*[2,4,8] |
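// E.g. X*9 becomes Base = X, Index = X, Scale = 8, which selects to a
// single "leaq (%rax,%rax,8), %rcx".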
| 2633 | if (AM.BaseType == X86ISelAddressMode::RegBase && |
| 2634 | AM.Base_Reg.getNode() == nullptr && |
| 2635 | AM.IndexReg.getNode() == nullptr) { |
| 2636 | if (auto *CN = dyn_cast<ConstantSDNode>(Val: N.getOperand(i: 1))) |
| 2637 | if (CN->getZExtValue() == 3 || CN->getZExtValue() == 5 || |
| 2638 | CN->getZExtValue() == 9) { |
| 2639 | AM.Scale = unsigned(CN->getZExtValue())-1; |
| 2640 | |
| 2641 | SDValue MulVal = N.getOperand(i: 0); |
| 2642 | SDValue Reg; |
| 2643 | |
| 2644 | // Okay, we know that we have a scale by now. However, if the scaled |
| 2645 | // value is an add of something and a constant, we can fold the |
| 2646 | // constant into the disp field here. |
| 2647 | if (MulVal.getNode()->getOpcode() == ISD::ADD && MulVal.hasOneUse() && |
| 2648 | isa<ConstantSDNode>(Val: MulVal.getOperand(i: 1))) { |
| 2649 | Reg = MulVal.getOperand(i: 0); |
| 2650 | auto *AddVal = cast<ConstantSDNode>(Val: MulVal.getOperand(i: 1)); |
| 2651 | uint64_t Disp = AddVal->getSExtValue() * CN->getZExtValue(); |
| 2652 | if (foldOffsetIntoAddress(Offset: Disp, AM)) |
| 2653 | Reg = N.getOperand(i: 0); |
| 2654 | } else { |
| 2655 | Reg = N.getOperand(i: 0); |
| 2656 | } |
| 2657 | |
| 2658 | AM.IndexReg = AM.Base_Reg = Reg; |
| 2659 | return false; |
| 2660 | } |
| 2661 | } |
| 2662 | break; |
| 2663 | |
| 2664 | case ISD::SUB: { |
// Given A-B, if A can be completely folded into the address (leaving the
// index field unused), use -B as the index. This is a win if A has
// multiple parts that can be folded into the address. It also saves a mov
// if the base register has other uses, since it avoids a two-address sub
// instruction; however, it costs an additional mov if the index register
// has other uses.
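// E.g. matching (GV + 4) - %x folds both GV and the 4 into the
// displacement and uses the negated %x as the scale-1 index; folding two
// such parts is what makes the cost check below succeed.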
| 2671 | |
| 2672 | // Add an artificial use to this node so that we can keep track of |
| 2673 | // it if it gets CSE'd with a different node. |
| 2674 | HandleSDNode Handle(N); |
| 2675 | |
| 2676 | // Test if the LHS of the sub can be folded. |
| 2677 | X86ISelAddressMode Backup = AM; |
| 2678 | if (matchAddressRecursively(N: N.getOperand(i: 0), AM, Depth: Depth+1)) { |
| 2679 | N = Handle.getValue(); |
| 2680 | AM = Backup; |
| 2681 | break; |
| 2682 | } |
| 2683 | N = Handle.getValue(); |
| 2684 | // Test if the index field is free for use. |
| 2685 | if (AM.IndexReg.getNode() || AM.isRIPRelative()) { |
| 2686 | AM = Backup; |
| 2687 | break; |
| 2688 | } |
| 2689 | |
| 2690 | int Cost = 0; |
| 2691 | SDValue RHS = N.getOperand(i: 1); |
| 2692 | // If the RHS involves a register with multiple uses, this |
| 2693 | // transformation incurs an extra mov, due to the neg instruction |
| 2694 | // clobbering its operand. |
| 2695 | if (!RHS.getNode()->hasOneUse() || |
| 2696 | RHS.getNode()->getOpcode() == ISD::CopyFromReg || |
| 2697 | RHS.getNode()->getOpcode() == ISD::TRUNCATE || |
| 2698 | RHS.getNode()->getOpcode() == ISD::ANY_EXTEND || |
| 2699 | (RHS.getNode()->getOpcode() == ISD::ZERO_EXTEND && |
| 2700 | RHS.getOperand(i: 0).getValueType() == MVT::i32)) |
| 2701 | ++Cost; |
| 2702 | // If the base is a register with multiple uses, this |
| 2703 | // transformation may save a mov. |
| 2704 | if ((AM.BaseType == X86ISelAddressMode::RegBase && AM.Base_Reg.getNode() && |
| 2705 | !AM.Base_Reg.getNode()->hasOneUse()) || |
| 2706 | AM.BaseType == X86ISelAddressMode::FrameIndexBase) |
| 2707 | --Cost; |
| 2708 | // If the folded LHS was interesting, this transformation saves |
| 2709 | // address arithmetic. |
| 2710 | if ((AM.hasSymbolicDisplacement() && !Backup.hasSymbolicDisplacement()) + |
| 2711 | ((AM.Disp != 0) && (Backup.Disp == 0)) + |
| 2712 | (AM.Segment.getNode() && !Backup.Segment.getNode()) >= 2) |
| 2713 | --Cost; |
| 2714 | // If it doesn't look like it may be an overall win, don't do it. |
| 2715 | if (Cost >= 0) { |
| 2716 | AM = Backup; |
| 2717 | break; |
| 2718 | } |
| 2719 | |
| 2720 | // Ok, the transformation is legal and appears profitable. Go for it. |
| 2721 | // Negation will be emitted later to avoid creating dangling nodes if this |
| 2722 | // was an unprofitable LEA. |
| 2723 | AM.IndexReg = RHS; |
| 2724 | AM.NegateIndex = true; |
| 2725 | AM.Scale = 1; |
| 2726 | return false; |
| 2727 | } |
| 2728 | |
| 2729 | case ISD::OR: |
| 2730 | case ISD::XOR: |
| 2731 | // See if we can treat the OR/XOR node as an ADD node. |
| 2732 | if (!CurDAG->isADDLike(Op: N)) |
| 2733 | break; |
| 2734 | [[fallthrough]]; |
| 2735 | case ISD::ADD: |
| 2736 | if (!matchAdd(N, AM, Depth)) |
| 2737 | return false; |
| 2738 | break; |
| 2739 | |
| 2740 | case ISD::AND: { |
| 2741 | // Perform some heroic transforms on an and of a constant-count shift |
| 2742 | // with a constant to enable use of the scaled offset field. |
| 2743 | |
| 2744 | // Scale must not be used already. |
| 2745 | if (AM.IndexReg.getNode() != nullptr || AM.Scale != 1) break; |
| 2746 | |
| 2747 | // We only handle up to 64-bit values here as those are what matter for |
| 2748 | // addressing mode optimizations. |
| 2749 | assert(N.getSimpleValueType().getSizeInBits() <= 64 && |
| 2750 | "Unexpected value size!" ); |
| 2751 | |
| 2752 | if (!isa<ConstantSDNode>(Val: N.getOperand(i: 1))) |
| 2753 | break; |
| 2754 | |
| 2755 | if (N.getOperand(i: 0).getOpcode() == ISD::SRL) { |
| 2756 | SDValue Shift = N.getOperand(i: 0); |
| 2757 | SDValue X = Shift.getOperand(i: 0); |
| 2758 | |
| 2759 | uint64_t Mask = N.getConstantOperandVal(i: 1); |
| 2760 | |
| 2761 | // Try to fold the mask and shift into an extract and scale. |
| 2762 | if (!foldMaskAndShiftToExtract(DAG&: *CurDAG, N, Mask, Shift, X, AM)) |
| 2763 | return false; |
| 2764 | |
| 2765 | // Try to fold the mask and shift directly into the scale. |
| 2766 | if (!foldMaskAndShiftToScale(DAG&: *CurDAG, N, Mask, Shift, X, AM)) |
| 2767 | return false; |
| 2768 | |
| 2769 | // Try to fold the mask and shift into BEXTR and scale. |
| 2770 | if (!foldMaskedShiftToBEXTR(DAG&: *CurDAG, N, Mask, Shift, X, AM, Subtarget: *Subtarget)) |
| 2771 | return false; |
| 2772 | } |
| 2773 | |
| 2774 | // Try to swap the mask and shift to place shifts which can be done as |
| 2775 | // a scale on the outside of the mask. |
| 2776 | if (!foldMaskedShiftToScaledMask(DAG&: *CurDAG, N, AM)) |
| 2777 | return false; |
| 2778 | |
| 2779 | break; |
| 2780 | } |
| 2781 | case ISD::ZERO_EXTEND: { |
| 2782 | // Try to widen a zexted shift left to the same size as its use, so we can |
| 2783 | // match the shift as a scale factor. |
| 2784 | if (AM.IndexReg.getNode() != nullptr || AM.Scale != 1) |
| 2785 | break; |
| 2786 | |
| 2787 | SDValue Src = N.getOperand(i: 0); |
| 2788 | |
| 2789 | // See if we can match a zext(addlike(x,c)). |
| 2790 | // TODO: Move more ZERO_EXTEND patterns into matchIndexRecursively. |
| 2791 | if (Src.getOpcode() == ISD::ADD || Src.getOpcode() == ISD::OR) |
| 2792 | if (SDValue Index = matchIndexRecursively(N, AM, Depth: Depth + 1)) |
| 2793 | if (Index != N) { |
| 2794 | AM.IndexReg = Index; |
| 2795 | return false; |
| 2796 | } |
| 2797 | |
| 2798 | // Peek through mask: zext(and(shl(x,c1),c2)) |
| 2799 | APInt Mask = APInt::getAllOnes(numBits: Src.getScalarValueSizeInBits()); |
| 2800 | if (Src.getOpcode() == ISD::AND && Src.hasOneUse()) |
| 2801 | if (auto *MaskC = dyn_cast<ConstantSDNode>(Val: Src.getOperand(i: 1))) { |
| 2802 | Mask = MaskC->getAPIntValue(); |
| 2803 | Src = Src.getOperand(i: 0); |
| 2804 | } |
| 2805 | |
| 2806 | if (Src.getOpcode() == ISD::SHL && Src.hasOneUse() && N->hasOneUse()) { |
| 2807 | // Give up if the shift is not a valid scale factor [1,2,3]. |
| 2808 | SDValue ShlSrc = Src.getOperand(i: 0); |
| 2809 | SDValue ShlAmt = Src.getOperand(i: 1); |
| 2810 | auto *ShAmtC = dyn_cast<ConstantSDNode>(Val&: ShlAmt); |
| 2811 | if (!ShAmtC) |
| 2812 | break; |
| 2813 | unsigned ShAmtV = ShAmtC->getZExtValue(); |
| 2814 | if (ShAmtV > 3) |
| 2815 | break; |
| 2816 | |
| 2817 | // The narrow shift must only shift out zero bits (it must be 'nuw'). |
| 2818 | // That makes it safe to widen to the destination type. |
| 2819 | APInt HighZeros = |
| 2820 | APInt::getHighBitsSet(numBits: ShlSrc.getValueSizeInBits(), hiBitsSet: ShAmtV); |
| 2821 | if (!Src->getFlags().hasNoUnsignedWrap() && |
| 2822 | !CurDAG->MaskedValueIsZero(Op: ShlSrc, Mask: HighZeros & Mask)) |
| 2823 | break; |
| 2824 | |
| 2825 | // zext (shl nuw i8 %x, C1) to i32 |
| 2826 | // --> shl (zext i8 %x to i32), (zext C1) |
| 2827 | // zext (and (shl nuw i8 %x, C1), C2) to i32 |
| 2828 | // --> shl (zext i8 (and %x, C2 >> C1) to i32), (zext C1) |
| 2829 | MVT SrcVT = ShlSrc.getSimpleValueType(); |
| 2830 | MVT VT = N.getSimpleValueType(); |
| 2831 | SDLoc DL(N); |
| 2832 | |
| 2833 | SDValue Res = ShlSrc; |
| 2834 | if (!Mask.isAllOnes()) { |
| 2835 | Res = CurDAG->getConstant(Val: Mask.lshr(shiftAmt: ShAmtV), DL, VT: SrcVT); |
| 2836 | insertDAGNode(DAG&: *CurDAG, Pos: N, N: Res); |
| 2837 | Res = CurDAG->getNode(Opcode: ISD::AND, DL, VT: SrcVT, N1: ShlSrc, N2: Res); |
| 2838 | insertDAGNode(DAG&: *CurDAG, Pos: N, N: Res); |
| 2839 | } |
| 2840 | SDValue Zext = CurDAG->getNode(Opcode: ISD::ZERO_EXTEND, DL, VT, Operand: Res); |
| 2841 | insertDAGNode(DAG&: *CurDAG, Pos: N, N: Zext); |
| 2842 | SDValue NewShl = CurDAG->getNode(Opcode: ISD::SHL, DL, VT, N1: Zext, N2: ShlAmt); |
| 2843 | insertDAGNode(DAG&: *CurDAG, Pos: N, N: NewShl); |
| 2844 | CurDAG->ReplaceAllUsesWith(From: N, To: NewShl); |
| 2845 | CurDAG->RemoveDeadNode(N: N.getNode()); |
| 2846 | |
| 2847 | // Convert the shift to scale factor. |
| 2848 | AM.Scale = 1 << ShAmtV; |
// matchIndexRecursively must be called here; otherwise Zext may be
// replaced by other nodes but still be passed to a builder method later.
| 2852 | AM.IndexReg = matchIndexRecursively(N: Zext, AM, Depth: Depth + 1); |
| 2853 | return false; |
| 2854 | } |
| 2855 | |
| 2856 | if (Src.getOpcode() == ISD::SRL && !Mask.isAllOnes()) { |
| 2857 | // Try to fold the mask and shift into an extract and scale. |
| 2858 | if (!foldMaskAndShiftToExtract(DAG&: *CurDAG, N, Mask: Mask.getZExtValue(), Shift: Src, |
| 2859 | X: Src.getOperand(i: 0), AM)) |
| 2860 | return false; |
| 2861 | |
| 2862 | // Try to fold the mask and shift directly into the scale. |
| 2863 | if (!foldMaskAndShiftToScale(DAG&: *CurDAG, N, Mask: Mask.getZExtValue(), Shift: Src, |
| 2864 | X: Src.getOperand(i: 0), AM)) |
| 2865 | return false; |
| 2866 | |
| 2867 | // Try to fold the mask and shift into BEXTR and scale. |
| 2868 | if (!foldMaskedShiftToBEXTR(DAG&: *CurDAG, N, Mask: Mask.getZExtValue(), Shift: Src, |
| 2869 | X: Src.getOperand(i: 0), AM, Subtarget: *Subtarget)) |
| 2870 | return false; |
| 2871 | } |
| 2872 | |
| 2873 | break; |
| 2874 | } |
| 2875 | } |
| 2876 | |
| 2877 | return matchAddressBase(N, AM); |
| 2878 | } |
| 2879 | |
| 2880 | /// Helper for MatchAddress. Add the specified node to the |
| 2881 | /// specified addressing mode without any further recursion. |
| 2882 | bool X86DAGToDAGISel::matchAddressBase(SDValue N, X86ISelAddressMode &AM) { |
| 2883 | // Is the base register already occupied? |
| 2884 | if (AM.BaseType != X86ISelAddressMode::RegBase || AM.Base_Reg.getNode()) { |
| 2885 | // If so, check to see if the scale index register is set. |
| 2886 | if (!AM.IndexReg.getNode()) { |
| 2887 | AM.IndexReg = N; |
| 2888 | AM.Scale = 1; |
| 2889 | return false; |
| 2890 | } |
| 2891 | |
| 2892 | // Otherwise, we cannot select it. |
| 2893 | return true; |
| 2894 | } |
| 2895 | |
| 2896 | // Default, generate it as a register. |
| 2897 | AM.BaseType = X86ISelAddressMode::RegBase; |
| 2898 | AM.Base_Reg = N; |
| 2899 | return false; |
| 2900 | } |
| 2901 | |
| 2902 | bool X86DAGToDAGISel::matchVectorAddressRecursively(SDValue N, |
| 2903 | X86ISelAddressMode &AM, |
| 2904 | unsigned Depth) { |
| 2905 | LLVM_DEBUG({ |
| 2906 | dbgs() << "MatchVectorAddress: " ; |
| 2907 | AM.dump(CurDAG); |
| 2908 | }); |
| 2909 | // Limit recursion. |
| 2910 | if (Depth >= SelectionDAG::MaxRecursionDepth) |
| 2911 | return matchAddressBase(N, AM); |
| 2912 | |
| 2913 | // TODO: Support other operations. |
| 2914 | switch (N.getOpcode()) { |
| 2915 | case ISD::Constant: { |
| 2916 | uint64_t Val = cast<ConstantSDNode>(Val&: N)->getSExtValue(); |
| 2917 | if (!foldOffsetIntoAddress(Offset: Val, AM)) |
| 2918 | return false; |
| 2919 | break; |
| 2920 | } |
| 2921 | case X86ISD::Wrapper: |
| 2922 | if (!matchWrapper(N, AM)) |
| 2923 | return false; |
| 2924 | break; |
| 2925 | case ISD::ADD: { |
| 2926 | // Add an artificial use to this node so that we can keep track of |
| 2927 | // it if it gets CSE'd with a different node. |
| 2928 | HandleSDNode Handle(N); |
| 2929 | |
| 2930 | X86ISelAddressMode Backup = AM; |
| 2931 | if (!matchVectorAddressRecursively(N: N.getOperand(i: 0), AM, Depth: Depth + 1) && |
| 2932 | !matchVectorAddressRecursively(N: Handle.getValue().getOperand(i: 1), AM, |
| 2933 | Depth: Depth + 1)) |
| 2934 | return false; |
| 2935 | AM = Backup; |
| 2936 | |
| 2937 | // Try again after commuting the operands. |
| 2938 | if (!matchVectorAddressRecursively(N: Handle.getValue().getOperand(i: 1), AM, |
| 2939 | Depth: Depth + 1) && |
| 2940 | !matchVectorAddressRecursively(N: Handle.getValue().getOperand(i: 0), AM, |
| 2941 | Depth: Depth + 1)) |
| 2942 | return false; |
| 2943 | AM = Backup; |
| 2944 | |
| 2945 | N = Handle.getValue(); |
| 2946 | break; |
| 2947 | } |
| 2948 | } |
| 2949 | |
| 2950 | return matchAddressBase(N, AM); |
| 2951 | } |
| 2952 | |
| 2953 | /// Helper for selectVectorAddr. Handles things that can be folded into a |
| 2954 | /// gather/scatter address. The index register and scale should have already |
| 2955 | /// been handled. |
| 2956 | bool X86DAGToDAGISel::matchVectorAddress(SDValue N, X86ISelAddressMode &AM) { |
| 2957 | return matchVectorAddressRecursively(N, AM, Depth: 0); |
| 2958 | } |
| 2959 | |
| 2960 | bool X86DAGToDAGISel::selectVectorAddr(MemSDNode *Parent, SDValue BasePtr, |
| 2961 | SDValue IndexOp, SDValue ScaleOp, |
| 2962 | SDValue &Base, SDValue &Scale, |
| 2963 | SDValue &Index, SDValue &Disp, |
| 2964 | SDValue &Segment) { |
| 2965 | X86ISelAddressMode AM; |
| 2966 | AM.Scale = ScaleOp->getAsZExtVal(); |
| 2967 | |
| 2968 | // Attempt to match index patterns, as long as we're not relying on implicit |
| 2969 | // sign-extension, which is performed BEFORE scale. |
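// E.g. a vXi32 index with an i64 base pointer is sign-extended per element
// before scaling, so rewriting the narrow index here could change the
// computed addresses.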
| 2970 | if (IndexOp.getScalarValueSizeInBits() == BasePtr.getScalarValueSizeInBits()) |
| 2971 | AM.IndexReg = matchIndexRecursively(N: IndexOp, AM, Depth: 0); |
| 2972 | else |
| 2973 | AM.IndexReg = IndexOp; |
| 2974 | |
| 2975 | unsigned AddrSpace = Parent->getPointerInfo().getAddrSpace(); |
| 2976 | if (AddrSpace == X86AS::GS) |
| 2977 | AM.Segment = CurDAG->getRegister(Reg: X86::GS, VT: MVT::i16); |
| 2978 | if (AddrSpace == X86AS::FS) |
| 2979 | AM.Segment = CurDAG->getRegister(Reg: X86::FS, VT: MVT::i16); |
| 2980 | if (AddrSpace == X86AS::SS) |
| 2981 | AM.Segment = CurDAG->getRegister(Reg: X86::SS, VT: MVT::i16); |
| 2982 | |
| 2983 | SDLoc DL(BasePtr); |
| 2984 | MVT VT = BasePtr.getSimpleValueType(); |
| 2985 | |
| 2986 | // Try to match into the base and displacement fields. |
| 2987 | if (matchVectorAddress(N: BasePtr, AM)) |
| 2988 | return false; |
| 2989 | |
| 2990 | getAddressOperands(AM, DL, VT, Base, Scale, Index, Disp, Segment); |
| 2991 | return true; |
| 2992 | } |
| 2993 | |
| 2994 | /// Returns true if it is able to pattern match an addressing mode. |
| 2995 | /// It returns the operands which make up the maximal addressing mode it can |
| 2996 | /// match by reference. |
| 2997 | /// |
| 2998 | /// Parent is the parent node of the addr operand that is being matched. It |
| 2999 | /// is always a load, store, atomic node, or null. It is only null when |
| 3000 | /// checking memory operands for inline asm nodes. |
| 3001 | bool X86DAGToDAGISel::selectAddr(SDNode *Parent, SDValue N, SDValue &Base, |
| 3002 | SDValue &Scale, SDValue &Index, |
| 3003 | SDValue &Disp, SDValue &Segment) { |
| 3004 | X86ISelAddressMode AM; |
| 3005 | |
| 3006 | if (Parent && |
| 3007 | // This list of opcodes are all the nodes that have an "addr:$ptr" operand |
| 3008 | // that are not a MemSDNode, and thus don't have proper addrspace info. |
| 3009 | Parent->getOpcode() != ISD::INTRINSIC_W_CHAIN && // unaligned loads, fixme |
| 3010 | Parent->getOpcode() != ISD::INTRINSIC_VOID && // nontemporal stores |
| 3011 | Parent->getOpcode() != X86ISD::TLSCALL && // Fixme |
| 3012 | Parent->getOpcode() != X86ISD::ENQCMD && // Fixme |
| 3013 | Parent->getOpcode() != X86ISD::ENQCMDS && // Fixme |
| 3014 | Parent->getOpcode() != X86ISD::EH_SJLJ_SETJMP && // setjmp |
| 3015 | Parent->getOpcode() != X86ISD::EH_SJLJ_LONGJMP) { // longjmp |
| 3016 | unsigned AddrSpace = |
| 3017 | cast<MemSDNode>(Val: Parent)->getPointerInfo().getAddrSpace(); |
| 3018 | if (AddrSpace == X86AS::GS) |
| 3019 | AM.Segment = CurDAG->getRegister(Reg: X86::GS, VT: MVT::i16); |
| 3020 | if (AddrSpace == X86AS::FS) |
| 3021 | AM.Segment = CurDAG->getRegister(Reg: X86::FS, VT: MVT::i16); |
| 3022 | if (AddrSpace == X86AS::SS) |
| 3023 | AM.Segment = CurDAG->getRegister(Reg: X86::SS, VT: MVT::i16); |
| 3024 | } |
| 3025 | |
| 3026 | // Save the DL and VT before calling matchAddress, it can invalidate N. |
| 3027 | SDLoc DL(N); |
| 3028 | MVT VT = N.getSimpleValueType(); |
| 3029 | |
| 3030 | if (matchAddress(N, AM)) |
| 3031 | return false; |
| 3032 | |
| 3033 | getAddressOperands(AM, DL, VT, Base, Scale, Index, Disp, Segment); |
| 3034 | return true; |
| 3035 | } |
| 3036 | |
| 3037 | bool X86DAGToDAGISel::selectMOV64Imm32(SDValue N, SDValue &Imm) { |
| 3038 | // Cannot use 32 bit constants to reference objects in kernel/large code |
| 3039 | // model. |
| 3040 | if (TM.getCodeModel() == CodeModel::Kernel || |
| 3041 | TM.getCodeModel() == CodeModel::Large) |
| 3042 | return false; |
| 3043 | |
// In static codegen with the small code model, we can get the address of a
// label into a register with 'movl'.
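// E.g. "movl $sym, %eax" implicitly zero-extends the 32-bit immediate into
// %rax, which is valid whenever sym is known to fit in 32 bits.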
| 3046 | if (N->getOpcode() != X86ISD::Wrapper) |
| 3047 | return false; |
| 3048 | |
| 3049 | N = N.getOperand(i: 0); |
| 3050 | |
| 3051 | // At least GNU as does not accept 'movl' for TPOFF relocations. |
| 3052 | // FIXME: We could use 'movl' when we know we are targeting MC. |
| 3053 | if (N->getOpcode() == ISD::TargetGlobalTLSAddress) |
| 3054 | return false; |
| 3055 | |
| 3056 | Imm = N; |
| 3057 | // Small/medium code model can reference non-TargetGlobalAddress objects with |
| 3058 | // 32 bit constants. |
| 3059 | if (N->getOpcode() != ISD::TargetGlobalAddress) { |
| 3060 | return TM.getCodeModel() == CodeModel::Small || |
| 3061 | TM.getCodeModel() == CodeModel::Medium; |
| 3062 | } |
| 3063 | |
| 3064 | const GlobalValue *GV = cast<GlobalAddressSDNode>(Val&: N)->getGlobal(); |
| 3065 | if (std::optional<ConstantRange> CR = GV->getAbsoluteSymbolRange()) |
| 3066 | return CR->getUnsignedMax().ult(RHS: 1ull << 32); |
| 3067 | |
| 3068 | return !TM.isLargeGlobalValue(GV); |
| 3069 | } |
| 3070 | |
| 3071 | bool X86DAGToDAGISel::selectLEA64_Addr(SDValue N, SDValue &Base, SDValue &Scale, |
| 3072 | SDValue &Index, SDValue &Disp, |
| 3073 | SDValue &Segment) { |
| 3074 | // Save the debug loc before calling selectLEAAddr, in case it invalidates N. |
| 3075 | SDLoc DL(N); |
| 3076 | |
| 3077 | if (!selectLEAAddr(N, Base, Scale, Index, Disp, Segment)) |
| 3078 | return false; |
| 3079 | |
| 3080 | EVT BaseType = Base.getValueType(); |
| 3081 | unsigned SubReg; |
| 3082 | if (BaseType == MVT::i8) |
| 3083 | SubReg = X86::sub_8bit; |
| 3084 | else if (BaseType == MVT::i16) |
| 3085 | SubReg = X86::sub_16bit; |
| 3086 | else |
| 3087 | SubReg = X86::sub_32bit; |
| 3088 | |
| 3089 | auto *RN = dyn_cast<RegisterSDNode>(Val&: Base); |
| 3090 | if (RN && RN->getReg() == 0) |
| 3091 | Base = CurDAG->getRegister(Reg: 0, VT: MVT::i64); |
| 3092 | else if ((BaseType == MVT::i8 || BaseType == MVT::i16 || |
| 3093 | BaseType == MVT::i32) && |
| 3094 | !isa<FrameIndexSDNode>(Val: Base)) { |
| 3095 | // Base could already be %rip, particularly in the x32 ABI. |
| 3096 | SDValue ImplDef = SDValue(CurDAG->getMachineNode(Opcode: X86::IMPLICIT_DEF, dl: DL, |
| 3097 | VT: MVT::i64), 0); |
| 3098 | Base = CurDAG->getTargetInsertSubreg(SRIdx: SubReg, DL, VT: MVT::i64, Operand: ImplDef, Subreg: Base); |
| 3099 | } |
| 3100 | |
| 3101 | [[maybe_unused]] EVT IndexType = Index.getValueType(); |
| 3102 | RN = dyn_cast<RegisterSDNode>(Val&: Index); |
| 3103 | if (RN && RN->getReg() == 0) |
| 3104 | Index = CurDAG->getRegister(Reg: 0, VT: MVT::i64); |
| 3105 | else { |
| 3106 | assert((IndexType == BaseType) && |
| 3107 | "Expect to be extending 8/16/32-bit registers for use in LEA" ); |
| 3108 | SDValue ImplDef = SDValue(CurDAG->getMachineNode(Opcode: X86::IMPLICIT_DEF, dl: DL, |
| 3109 | VT: MVT::i64), 0); |
| 3110 | Index = CurDAG->getTargetInsertSubreg(SRIdx: SubReg, DL, VT: MVT::i64, Operand: ImplDef, Subreg: Index); |
| 3111 | } |
| 3112 | |
| 3113 | return true; |
| 3114 | } |
| 3115 | |
| 3116 | /// Calls SelectAddr and determines if the maximal addressing |
| 3117 | /// mode it matches can be cost effectively emitted as an LEA instruction. |
| 3118 | bool X86DAGToDAGISel::selectLEAAddr(SDValue N, |
| 3119 | SDValue &Base, SDValue &Scale, |
| 3120 | SDValue &Index, SDValue &Disp, |
| 3121 | SDValue &Segment) { |
| 3122 | X86ISelAddressMode AM; |
| 3123 | |
| 3124 | // Save the DL and VT before calling matchAddress, it can invalidate N. |
| 3125 | SDLoc DL(N); |
| 3126 | MVT VT = N.getSimpleValueType(); |
| 3127 | |
| 3128 | // Set AM.Segment to prevent MatchAddress from using one. LEA doesn't support |
| 3129 | // segments. |
| 3130 | SDValue Copy = AM.Segment; |
| 3131 | SDValue T = CurDAG->getRegister(Reg: 0, VT: MVT::i32); |
| 3132 | AM.Segment = T; |
| 3133 | if (matchAddress(N, AM)) |
| 3134 | return false; |
assert(T == AM.Segment);
| 3136 | AM.Segment = Copy; |
| 3137 | |
| 3138 | unsigned Complexity = 0; |
| 3139 | if (AM.BaseType == X86ISelAddressMode::RegBase && AM.Base_Reg.getNode()) |
| 3140 | Complexity = 1; |
| 3141 | else if (AM.BaseType == X86ISelAddressMode::FrameIndexBase) |
| 3142 | Complexity = 4; |
| 3143 | |
| 3144 | if (AM.IndexReg.getNode()) |
| 3145 | Complexity++; |
| 3146 | |
// Don't match just leal(,%reg,2). It's cheaper to do addl %reg, %reg, or a
// simple shift.
| 3149 | if (AM.Scale > 1) |
| 3150 | Complexity++; |
| 3151 | |
| 3152 | // FIXME: We are artificially lowering the criteria to turn ADD %reg, $GA |
| 3153 | // to a LEA. This is determined with some experimentation but is by no means |
| 3154 | // optimal (especially for code size consideration). LEA is nice because of |
| 3155 | // its three-address nature. Tweak the cost function again when we can run |
| 3156 | // convertToThreeAddress() at register allocation time. |
| 3157 | if (AM.hasSymbolicDisplacement()) { |
| 3158 | // For X86-64, always use LEA to materialize RIP-relative addresses. |
| 3159 | if (Subtarget->is64Bit()) |
| 3160 | Complexity = 4; |
| 3161 | else |
| 3162 | Complexity += 2; |
| 3163 | } |
| 3164 | |
| 3165 | // Heuristic: try harder to form an LEA from ADD if the operands set flags. |
| 3166 | // Unlike ADD, LEA does not affect flags, so we will be less likely to require |
| 3167 | // duplicating flag-producing instructions later in the pipeline. |
| 3168 | if (N.getOpcode() == ISD::ADD) { |
| 3169 | auto isMathWithFlags = [](SDValue V) { |
| 3170 | switch (V.getOpcode()) { |
| 3171 | case X86ISD::ADD: |
| 3172 | case X86ISD::SUB: |
| 3173 | case X86ISD::ADC: |
| 3174 | case X86ISD::SBB: |
| 3175 | case X86ISD::SMUL: |
| 3176 | case X86ISD::UMUL: |
| 3177 | /* TODO: These opcodes can be added safely, but we may want to justify |
| 3178 | their inclusion for different reasons (better for reg-alloc). |
| 3179 | case X86ISD::OR: |
| 3180 | case X86ISD::XOR: |
| 3181 | case X86ISD::AND: |
| 3182 | */ |
| 3183 | // Value 1 is the flag output of the node - verify it's not dead. |
| 3184 | return !SDValue(V.getNode(), 1).use_empty(); |
| 3185 | default: |
| 3186 | return false; |
| 3187 | } |
| 3188 | }; |
| 3189 | // TODO: We might want to factor in whether there's a load folding |
| 3190 | // opportunity for the math op that disappears with LEA. |
| 3191 | if (isMathWithFlags(N.getOperand(i: 0)) || isMathWithFlags(N.getOperand(i: 1))) |
| 3192 | Complexity++; |
| 3193 | } |
| 3194 | |
| 3195 | if (AM.Disp) |
| 3196 | Complexity++; |
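// E.g. "leaq 8(%rax,%rcx,4), %rdx" scores base + index + scale + disp = 4
// and is kept; a bare "lea (%rax)" stays at 1 and is rejected below.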
| 3197 | |
| 3198 | // If it isn't worth using an LEA, reject it. |
| 3199 | if (Complexity <= 2) |
| 3200 | return false; |
| 3201 | |
| 3202 | getAddressOperands(AM, DL, VT, Base, Scale, Index, Disp, Segment); |
| 3203 | return true; |
| 3204 | } |
| 3205 | |
| 3206 | /// This is only run on TargetGlobalTLSAddress nodes. |
| 3207 | bool X86DAGToDAGISel::selectTLSADDRAddr(SDValue N, SDValue &Base, |
| 3208 | SDValue &Scale, SDValue &Index, |
| 3209 | SDValue &Disp, SDValue &Segment) { |
| 3210 | assert(N.getOpcode() == ISD::TargetGlobalTLSAddress || |
| 3211 | N.getOpcode() == ISD::TargetExternalSymbol); |
| 3212 | |
| 3213 | X86ISelAddressMode AM; |
| 3214 | if (auto *GA = dyn_cast<GlobalAddressSDNode>(Val&: N)) { |
| 3215 | AM.GV = GA->getGlobal(); |
| 3216 | AM.Disp += GA->getOffset(); |
| 3217 | AM.SymbolFlags = GA->getTargetFlags(); |
| 3218 | } else { |
| 3219 | auto *SA = cast<ExternalSymbolSDNode>(Val&: N); |
| 3220 | AM.ES = SA->getSymbol(); |
| 3221 | AM.SymbolFlags = SA->getTargetFlags(); |
| 3222 | } |
| 3223 | |
| 3224 | if (Subtarget->is32Bit()) { |
| 3225 | AM.Scale = 1; |
| 3226 | AM.IndexReg = CurDAG->getRegister(Reg: X86::EBX, VT: MVT::i32); |
| 3227 | } |
| 3228 | |
| 3229 | MVT VT = N.getSimpleValueType(); |
| 3230 | getAddressOperands(AM, DL: SDLoc(N), VT, Base, Scale, Index, Disp, Segment); |
| 3231 | return true; |
| 3232 | } |
| 3233 | |
| 3234 | bool X86DAGToDAGISel::selectRelocImm(SDValue N, SDValue &Op) { |
| 3235 | // Keep track of the original value type and whether this value was |
| 3236 | // truncated. If we see a truncation from pointer type to VT that truncates |
| 3237 | // bits that are known to be zero, we can use a narrow reference. |
| 3238 | EVT VT = N.getValueType(); |
| 3239 | bool WasTruncated = false; |
| 3240 | if (N.getOpcode() == ISD::TRUNCATE) { |
| 3241 | WasTruncated = true; |
| 3242 | N = N.getOperand(i: 0); |
| 3243 | } |
| 3244 | |
| 3245 | if (N.getOpcode() != X86ISD::Wrapper) |
| 3246 | return false; |
| 3247 | |
| 3248 | // We can only use non-GlobalValues as immediates if they were not truncated, |
| 3249 | // as we do not have any range information. If we have a GlobalValue and the |
| 3250 | // address was not truncated, we can select it as an operand directly. |
| 3251 | unsigned Opc = N.getOperand(i: 0)->getOpcode(); |
| 3252 | if (Opc != ISD::TargetGlobalAddress || !WasTruncated) { |
| 3253 | Op = N.getOperand(i: 0); |
| 3254 | // We can only select the operand directly if we didn't have to look past a |
| 3255 | // truncate. |
| 3256 | return !WasTruncated; |
| 3257 | } |
| 3258 | |
| 3259 | // Check that the global's range fits into VT. |
| 3260 | auto *GA = cast<GlobalAddressSDNode>(Val: N.getOperand(i: 0)); |
| 3261 | std::optional<ConstantRange> CR = GA->getGlobal()->getAbsoluteSymbolRange(); |
| 3262 | if (!CR || CR->getUnsignedMax().uge(RHS: 1ull << VT.getSizeInBits())) |
| 3263 | return false; |
| 3264 | |
| 3265 | // Okay, we can use a narrow reference. |
| 3266 | Op = CurDAG->getTargetGlobalAddress(GV: GA->getGlobal(), DL: SDLoc(N), VT, |
| 3267 | offset: GA->getOffset(), TargetFlags: GA->getTargetFlags()); |
| 3268 | return true; |
| 3269 | } |
| 3270 | |
| 3271 | bool X86DAGToDAGISel::tryFoldLoad(SDNode *Root, SDNode *P, SDValue N, |
| 3272 | SDValue &Base, SDValue &Scale, |
| 3273 | SDValue &Index, SDValue &Disp, |
| 3274 | SDValue &Segment) { |
| 3275 | assert(Root && P && "Unknown root/parent nodes" ); |
| 3276 | if (!ISD::isNON_EXTLoad(N: N.getNode()) || |
| 3277 | !IsProfitableToFold(N, U: P, Root) || |
| 3278 | !IsLegalToFold(N, U: P, Root, OptLevel)) |
| 3279 | return false; |
| 3280 | |
| 3281 | return selectAddr(Parent: N.getNode(), |
| 3282 | N: N.getOperand(i: 1), Base, Scale, Index, Disp, Segment); |
| 3283 | } |
| 3284 | |
| 3285 | bool X86DAGToDAGISel::tryFoldBroadcast(SDNode *Root, SDNode *P, SDValue N, |
| 3286 | SDValue &Base, SDValue &Scale, |
| 3287 | SDValue &Index, SDValue &Disp, |
| 3288 | SDValue &Segment) { |
| 3289 | assert(Root && P && "Unknown root/parent nodes" ); |
| 3290 | if (N->getOpcode() != X86ISD::VBROADCAST_LOAD || |
| 3291 | !IsProfitableToFold(N, U: P, Root) || |
| 3292 | !IsLegalToFold(N, U: P, Root, OptLevel)) |
| 3293 | return false; |
| 3294 | |
| 3295 | return selectAddr(Parent: N.getNode(), |
| 3296 | N: N.getOperand(i: 1), Base, Scale, Index, Disp, Segment); |
| 3297 | } |
| 3298 | |
| 3299 | /// Return an SDNode that returns the value of the global base register. |
| 3300 | /// Output instructions required to initialize the global base register, |
| 3301 | /// if necessary. |
| 3302 | SDNode *X86DAGToDAGISel::getGlobalBaseReg() { |
| 3303 | Register GlobalBaseReg = getInstrInfo()->getGlobalBaseReg(MF); |
| 3304 | auto &DL = MF->getDataLayout(); |
| 3305 | return CurDAG->getRegister(Reg: GlobalBaseReg, VT: TLI->getPointerTy(DL)).getNode(); |
| 3306 | } |
| 3307 | |
| 3308 | bool X86DAGToDAGISel::isSExtAbsoluteSymbolRef(unsigned Width, SDNode *N) const { |
| 3309 | if (N->getOpcode() == ISD::TRUNCATE) |
| 3310 | N = N->getOperand(Num: 0).getNode(); |
| 3311 | if (N->getOpcode() != X86ISD::Wrapper) |
| 3312 | return false; |
| 3313 | |
| 3314 | auto *GA = dyn_cast<GlobalAddressSDNode>(Val: N->getOperand(Num: 0)); |
| 3315 | if (!GA) |
| 3316 | return false; |
| 3317 | |
| 3318 | auto *GV = GA->getGlobal(); |
| 3319 | std::optional<ConstantRange> CR = GV->getAbsoluteSymbolRange(); |
| 3320 | if (CR) |
| 3321 | return CR->getSignedMin().sge(RHS: -1ull << Width) && |
| 3322 | CR->getSignedMax().slt(RHS: 1ull << Width); |
| 3323 | // In the kernel code model, globals are in the negative 2GB of the address |
| 3324 | // space, so globals can be a sign extended 32-bit immediate. |
| 3325 | // In other code models, small globals are in the low 2GB of the address |
| 3326 | // space, so sign extending them is equivalent to zero extending them. |
| 3327 | return Width == 32 && !TM.isLargeGlobalValue(GV); |
| 3328 | } |
| 3329 | |
| 3330 | X86::CondCode X86DAGToDAGISel::getCondFromNode(SDNode *N) const { |
| 3331 | assert(N->isMachineOpcode() && "Unexpected node" ); |
| 3332 | unsigned Opc = N->getMachineOpcode(); |
| 3333 | const MCInstrDesc &MCID = getInstrInfo()->get(Opcode: Opc); |
| 3334 | int CondNo = X86::getCondSrcNoFromDesc(MCID); |
| 3335 | if (CondNo < 0) |
| 3336 | return X86::COND_INVALID; |
| 3337 | |
| 3338 | return static_cast<X86::CondCode>(N->getConstantOperandVal(Num: CondNo)); |
| 3339 | } |
| 3340 | |
/// Test whether the given X86ISD::CMP node has any users that use a flag
/// other than ZF; returns true only if ZF is the sole flag consumed.
| 3343 | bool X86DAGToDAGISel::onlyUsesZeroFlag(SDValue Flags) const { |
| 3344 | // Examine each user of the node. |
| 3345 | for (SDUse &Use : Flags->uses()) { |
| 3346 | // Only check things that use the flags. |
| 3347 | if (Use.getResNo() != Flags.getResNo()) |
| 3348 | continue; |
| 3349 | SDNode *User = Use.getUser(); |
| 3350 | // Only examine CopyToReg uses that copy to EFLAGS. |
| 3351 | if (User->getOpcode() != ISD::CopyToReg || |
| 3352 | cast<RegisterSDNode>(Val: User->getOperand(Num: 1))->getReg() != X86::EFLAGS) |
| 3353 | return false; |
| 3354 | // Examine each user of the CopyToReg use. |
| 3355 | for (SDUse &FlagUse : User->uses()) { |
| 3356 | // Only examine the Flag result. |
| 3357 | if (FlagUse.getResNo() != 1) |
| 3358 | continue; |
| 3359 | // Anything unusual: assume conservatively. |
| 3360 | if (!FlagUse.getUser()->isMachineOpcode()) |
| 3361 | return false; |
| 3362 | // Examine the condition code of the user. |
| 3363 | X86::CondCode CC = getCondFromNode(N: FlagUse.getUser()); |
| 3364 | |
| 3365 | switch (CC) { |
| 3366 | // Comparisons which only use the zero flag. |
| 3367 | case X86::COND_E: case X86::COND_NE: |
| 3368 | continue; |
| 3369 | // Anything else: assume conservatively. |
| 3370 | default: |
| 3371 | return false; |
| 3372 | } |
| 3373 | } |
| 3374 | } |
| 3375 | return true; |
| 3376 | } |
| 3377 | |
/// Test whether the given X86ISD::CMP node has any uses which require the SF
/// flag to be accurate; returns true only if there are none.
| 3380 | bool X86DAGToDAGISel::hasNoSignFlagUses(SDValue Flags) const { |
| 3381 | // Examine each user of the node. |
| 3382 | for (SDUse &Use : Flags->uses()) { |
| 3383 | // Only check things that use the flags. |
| 3384 | if (Use.getResNo() != Flags.getResNo()) |
| 3385 | continue; |
| 3386 | SDNode *User = Use.getUser(); |
| 3387 | // Only examine CopyToReg uses that copy to EFLAGS. |
| 3388 | if (User->getOpcode() != ISD::CopyToReg || |
| 3389 | cast<RegisterSDNode>(Val: User->getOperand(Num: 1))->getReg() != X86::EFLAGS) |
| 3390 | return false; |
| 3391 | // Examine each user of the CopyToReg use. |
| 3392 | for (SDUse &FlagUse : User->uses()) { |
| 3393 | // Only examine the Flag result. |
| 3394 | if (FlagUse.getResNo() != 1) |
| 3395 | continue; |
| 3396 | // Anything unusual: assume conservatively. |
| 3397 | if (!FlagUse.getUser()->isMachineOpcode()) |
| 3398 | return false; |
| 3399 | // Examine the condition code of the user. |
| 3400 | X86::CondCode CC = getCondFromNode(N: FlagUse.getUser()); |
| 3401 | |
| 3402 | switch (CC) { |
| 3403 | // Comparisons which don't examine the SF flag. |
| 3404 | case X86::COND_A: case X86::COND_AE: |
| 3405 | case X86::COND_B: case X86::COND_BE: |
| 3406 | case X86::COND_E: case X86::COND_NE: |
| 3407 | case X86::COND_O: case X86::COND_NO: |
| 3408 | case X86::COND_P: case X86::COND_NP: |
| 3409 | continue; |
| 3410 | // Anything else: assume conservatively. |
| 3411 | default: |
| 3412 | return false; |
| 3413 | } |
| 3414 | } |
| 3415 | } |
| 3416 | return true; |
| 3417 | } |
| 3418 | |
| 3419 | static bool mayUseCarryFlag(X86::CondCode CC) { |
| 3420 | switch (CC) { |
| 3421 | // Comparisons which don't examine the CF flag. |
| 3422 | case X86::COND_O: case X86::COND_NO: |
| 3423 | case X86::COND_E: case X86::COND_NE: |
| 3424 | case X86::COND_S: case X86::COND_NS: |
| 3425 | case X86::COND_P: case X86::COND_NP: |
| 3426 | case X86::COND_L: case X86::COND_GE: |
| 3427 | case X86::COND_G: case X86::COND_LE: |
| 3428 | return false; |
| 3429 | // Anything else: assume conservatively. |
| 3430 | default: |
| 3431 | return true; |
| 3432 | } |
| 3433 | } |
| 3434 | |
/// Test whether the given node which sets flags has any uses which require the
/// CF flag to be accurate; returns true only if there are none.
| 3437 | bool X86DAGToDAGISel::hasNoCarryFlagUses(SDValue Flags) const { |
| 3438 | // Examine each user of the node. |
| 3439 | for (SDUse &Use : Flags->uses()) { |
| 3440 | // Only check things that use the flags. |
| 3441 | if (Use.getResNo() != Flags.getResNo()) |
| 3442 | continue; |
| 3443 | |
| 3444 | SDNode *User = Use.getUser(); |
| 3445 | unsigned UserOpc = User->getOpcode(); |
| 3446 | |
| 3447 | if (UserOpc == ISD::CopyToReg) { |
| 3448 | // Only examine CopyToReg uses that copy to EFLAGS. |
| 3449 | if (cast<RegisterSDNode>(Val: User->getOperand(Num: 1))->getReg() != X86::EFLAGS) |
| 3450 | return false; |
| 3451 | // Examine each user of the CopyToReg use. |
| 3452 | for (SDUse &FlagUse : User->uses()) { |
| 3453 | // Only examine the Flag result. |
| 3454 | if (FlagUse.getResNo() != 1) |
| 3455 | continue; |
| 3456 | // Anything unusual: assume conservatively. |
| 3457 | if (!FlagUse.getUser()->isMachineOpcode()) |
| 3458 | return false; |
| 3459 | // Examine the condition code of the user. |
| 3460 | X86::CondCode CC = getCondFromNode(N: FlagUse.getUser()); |
| 3461 | |
| 3462 | if (mayUseCarryFlag(CC)) |
| 3463 | return false; |
| 3464 | } |
| 3465 | |
| 3466 | // This CopyToReg is ok. Move on to the next user. |
| 3467 | continue; |
| 3468 | } |
| 3469 | |
| 3470 | // This might be an unselected node. So look for the pre-isel opcodes that |
| 3471 | // use flags. |
| 3472 | unsigned CCOpNo; |
| 3473 | switch (UserOpc) { |
| 3474 | default: |
| 3475 | // Something unusual. Be conservative. |
| 3476 | return false; |
| 3477 | case X86ISD::SETCC: CCOpNo = 0; break; |
| 3478 | case X86ISD::SETCC_CARRY: CCOpNo = 0; break; |
| 3479 | case X86ISD::CMOV: CCOpNo = 2; break; |
| 3480 | case X86ISD::BRCOND: CCOpNo = 2; break; |
| 3481 | } |
| 3482 | |
| 3483 | X86::CondCode CC = (X86::CondCode)User->getConstantOperandVal(Num: CCOpNo); |
| 3484 | if (mayUseCarryFlag(CC)) |
| 3485 | return false; |
| 3486 | } |
| 3487 | return true; |
| 3488 | } |
| 3489 | |
| 3490 | /// Check whether or not the chain ending in StoreNode is suitable for doing |
| 3491 | /// the {load; op; store} to modify transformation. |
| 3492 | static bool isFusableLoadOpStorePattern(StoreSDNode *StoreNode, |
| 3493 | SDValue StoredVal, SelectionDAG *CurDAG, |
| 3494 | unsigned LoadOpNo, |
| 3495 | LoadSDNode *&LoadNode, |
| 3496 | SDValue &InputChain) { |
| 3497 | // Is the stored value result 0 of the operation? |
| 3498 | if (StoredVal.getResNo() != 0) return false; |
| 3499 | |
| 3500 | // Are there other uses of the operation other than the store? |
| 3501 | if (!StoredVal.getNode()->hasNUsesOfValue(NUses: 1, Value: 0)) return false; |
| 3502 | |
| 3503 | // Is the store non-extending and non-indexed? |
| 3504 | if (!ISD::isNormalStore(N: StoreNode) || StoreNode->isNonTemporal()) |
| 3505 | return false; |
| 3506 | |
| 3507 | SDValue Load = StoredVal->getOperand(Num: LoadOpNo); |
| 3508 | // Is the stored value a non-extending and non-indexed load? |
| 3509 | if (!ISD::isNormalLoad(N: Load.getNode())) return false; |
| 3510 | |
| 3511 | // Return LoadNode by reference. |
| 3512 | LoadNode = cast<LoadSDNode>(Val&: Load); |
| 3513 | |
| 3514 | // Is store the only read of the loaded value? |
| 3515 | if (!Load.hasOneUse()) |
| 3516 | return false; |
| 3517 | |
| 3518 | // Is the address of the store the same as the load? |
| 3519 | if (LoadNode->getBasePtr() != StoreNode->getBasePtr() || |
| 3520 | LoadNode->getOffset() != StoreNode->getOffset()) |
| 3521 | return false; |
| 3522 | |
| 3523 | bool FoundLoad = false; |
| 3524 | SmallVector<SDValue, 4> ChainOps; |
| 3525 | SmallVector<const SDNode *, 4> LoopWorklist; |
| 3526 | SmallPtrSet<const SDNode *, 16> Visited; |
| 3527 | const unsigned int Max = 1024; |
| 3528 | |
| 3529 | // Visualization of Load-Op-Store fusion: |
| 3530 | // ------------------------- |
| 3531 | // Legend: |
| 3532 | // *-lines = Chain operand dependencies. |
| 3533 | // |-lines = Normal operand dependencies. |
| 3534 | // Dependencies flow down and right. n-suffix references multiple nodes. |
| 3535 | // |
//        C                        Xn  C
//        *                         *  *
//        *                         *  *
//  Xn  A-LD    Yn                  TF          Yn
//   *    * \   |                    *          |
//    *   *  \  |                    *          |
//     *  *   \ |        =>        A--LD_OP_ST
//      * *    \|                    \
//       TF    OP                     \
//        *    | \                     Zn
//        *    |  \
//       A-ST     Zn
| 3548 | // |
| 3549 | |
// This merge induces dependences: #1: Xn -> LD, OP, Zn
//                                 #2: Yn -> LD
//                                 #3: ST -> Zn
| 3553 | |
| 3554 | // Ensure the transform is safe by checking for the dual |
| 3555 | // dependencies to make sure we do not induce a loop. |
| 3556 | |
| 3557 | // As LD is a predecessor to both OP and ST we can do this by checking: |
| 3558 | // a). if LD is a predecessor to a member of Xn or Yn. |
| 3559 | // b). if a Zn is a predecessor to ST. |
| 3560 | |
| 3561 | // However, (b) can only occur through being a chain predecessor to |
| 3562 | // ST, which is the same as Zn being a member or predecessor of Xn, |
| 3563 | // which is a subset of LD being a predecessor of Xn. So it's |
| 3564 | // subsumed by check (a). |
| 3565 | |
| 3566 | SDValue Chain = StoreNode->getChain(); |
| 3567 | |
| 3568 | // Gather X elements in ChainOps. |
| 3569 | if (Chain == Load.getValue(R: 1)) { |
| 3570 | FoundLoad = true; |
| 3571 | ChainOps.push_back(Elt: Load.getOperand(i: 0)); |
| 3572 | } else if (Chain.getOpcode() == ISD::TokenFactor) { |
| 3573 | for (unsigned i = 0, e = Chain.getNumOperands(); i != e; ++i) { |
| 3574 | SDValue Op = Chain.getOperand(i); |
| 3575 | if (Op == Load.getValue(R: 1)) { |
| 3576 | FoundLoad = true; |
| 3577 | // Drop Load, but keep its chain. No cycle check necessary. |
| 3578 | ChainOps.push_back(Elt: Load.getOperand(i: 0)); |
| 3579 | continue; |
| 3580 | } |
| 3581 | LoopWorklist.push_back(Elt: Op.getNode()); |
| 3582 | ChainOps.push_back(Elt: Op); |
| 3583 | } |
| 3584 | } |
| 3585 | |
| 3586 | if (!FoundLoad) |
| 3587 | return false; |
| 3588 | |
| 3589 | // Worklist is currently Xn. Add Yn to worklist. |
| 3590 | for (SDValue Op : StoredVal->ops()) |
| 3591 | if (Op.getNode() != LoadNode) |
| 3592 | LoopWorklist.push_back(Elt: Op.getNode()); |
| 3593 | |
| 3594 | // Check (a) if Load is a predecessor to Xn + Yn |
| 3595 | if (SDNode::hasPredecessorHelper(N: Load.getNode(), Visited, Worklist&: LoopWorklist, MaxSteps: Max, |
| 3596 | TopologicalPrune: true)) |
| 3597 | return false; |
| 3598 | |
| 3599 | InputChain = |
| 3600 | CurDAG->getNode(Opcode: ISD::TokenFactor, DL: SDLoc(Chain), VT: MVT::Other, Ops: ChainOps); |
| 3601 | return true; |
| 3602 | } |
| 3603 | |
| 3604 | // Change a chain of {load; op; store} of the same value into a simple op |
| 3605 | // through memory of that value, if the uses of the modified value and its |
| 3606 | // address are suitable. |
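// E.g. "movl (%rdi), %eax; addl %esi, %eax; movl %eax, (%rdi)" becomes the
// single read-modify-write instruction "addl %esi, (%rdi)".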
| 3607 | // |
| 3608 | // The tablegen pattern memory operand pattern is currently not able to match |
| 3609 | // the case where the EFLAGS on the original operation are used. |
| 3610 | // |
| 3611 | // To move this to tablegen, we'll need to improve tablegen to allow flags to |
| 3612 | // be transferred from a node in the pattern to the result node, probably with |
| 3613 | // a new keyword. For example, we have this |
| 3614 | // def DEC64m : RI<0xFF, MRM1m, (outs), (ins i64mem:$dst), "dec{q}\t$dst", |
| 3615 | // [(store (add (loadi64 addr:$dst), -1), addr:$dst)]>; |
| 3616 | // but maybe need something like this |
| 3617 | // def DEC64m : RI<0xFF, MRM1m, (outs), (ins i64mem:$dst), "dec{q}\t$dst", |
| 3618 | // [(store (X86add_flag (loadi64 addr:$dst), -1), addr:$dst), |
| 3619 | // (transferrable EFLAGS)]>; |
| 3620 | // |
| 3621 | // Until then, we manually fold these and instruction select the operation |
| 3622 | // here. |
| 3623 | bool X86DAGToDAGISel::foldLoadStoreIntoMemOperand(SDNode *Node) { |
| 3624 | auto *StoreNode = cast<StoreSDNode>(Val: Node); |
| 3625 | SDValue StoredVal = StoreNode->getOperand(Num: 1); |
| 3626 | unsigned Opc = StoredVal->getOpcode(); |
| 3627 | |
// Before we try to select anything, make sure this is a memory operand size
// and opcode we can handle. Note that this must match the code below that
// actually lowers the opcodes.
| 3631 | EVT MemVT = StoreNode->getMemoryVT(); |
| 3632 | if (MemVT != MVT::i64 && MemVT != MVT::i32 && MemVT != MVT::i16 && |
| 3633 | MemVT != MVT::i8) |
| 3634 | return false; |
| 3635 | |
| 3636 | bool IsCommutable = false; |
| 3637 | bool IsNegate = false; |
| 3638 | switch (Opc) { |
| 3639 | default: |
| 3640 | return false; |
| 3641 | case X86ISD::SUB: |
| 3642 | IsNegate = isNullConstant(V: StoredVal.getOperand(i: 0)); |
| 3643 | break; |
| 3644 | case X86ISD::SBB: |
| 3645 | break; |
| 3646 | case X86ISD::ADD: |
| 3647 | case X86ISD::ADC: |
| 3648 | case X86ISD::AND: |
| 3649 | case X86ISD::OR: |
| 3650 | case X86ISD::XOR: |
| 3651 | IsCommutable = true; |
| 3652 | break; |
| 3653 | } |
| 3654 | |
| 3655 | unsigned LoadOpNo = IsNegate ? 1 : 0; |
| 3656 | LoadSDNode *LoadNode = nullptr; |
| 3657 | SDValue InputChain; |
| 3658 | if (!isFusableLoadOpStorePattern(StoreNode, StoredVal, CurDAG, LoadOpNo, |
| 3659 | LoadNode, InputChain)) { |
| 3660 | if (!IsCommutable) |
| 3661 | return false; |
| 3662 | |
| 3663 | // This operation is commutable, try the other operand. |
| 3664 | LoadOpNo = 1; |
| 3665 | if (!isFusableLoadOpStorePattern(StoreNode, StoredVal, CurDAG, LoadOpNo, |
| 3666 | LoadNode, InputChain)) |
| 3667 | return false; |
| 3668 | } |
| 3669 | |
| 3670 | SDValue Base, Scale, Index, Disp, Segment; |
| 3671 | if (!selectAddr(Parent: LoadNode, N: LoadNode->getBasePtr(), Base, Scale, Index, Disp, |
| 3672 | Segment)) |
| 3673 | return false; |
| 3674 | |
| 3675 | auto SelectOpcode = [&](unsigned Opc64, unsigned Opc32, unsigned Opc16, |
| 3676 | unsigned Opc8) { |
| 3677 | switch (MemVT.getSimpleVT().SimpleTy) { |
| 3678 | case MVT::i64: |
| 3679 | return Opc64; |
| 3680 | case MVT::i32: |
| 3681 | return Opc32; |
| 3682 | case MVT::i16: |
| 3683 | return Opc16; |
| 3684 | case MVT::i8: |
| 3685 | return Opc8; |
| 3686 | default: |
| 3687 | llvm_unreachable("Invalid size!" ); |
| 3688 | } |
| 3689 | }; |
| 3690 | |
| 3691 | MachineSDNode *Result; |
| 3692 | switch (Opc) { |
| 3693 | case X86ISD::SUB: |
| 3694 | // Handle negate. |
| 3695 | if (IsNegate) { |
| 3696 | unsigned NewOpc = SelectOpcode(X86::NEG64m, X86::NEG32m, X86::NEG16m, |
| 3697 | X86::NEG8m); |
| 3698 | const SDValue Ops[] = {Base, Scale, Index, Disp, Segment, InputChain}; |
| 3699 | Result = CurDAG->getMachineNode(Opcode: NewOpc, dl: SDLoc(Node), VT1: MVT::i32, |
| 3700 | VT2: MVT::Other, Ops); |
| 3701 | break; |
| 3702 | } |
| 3703 | [[fallthrough]]; |
| 3704 | case X86ISD::ADD: |
| 3705 | // Try to match inc/dec. |
| 3706 | if (!Subtarget->slowIncDec() || CurDAG->shouldOptForSize()) { |
| 3707 | bool IsOne = isOneConstant(V: StoredVal.getOperand(i: 1)); |
| 3708 | bool IsNegOne = isAllOnesConstant(V: StoredVal.getOperand(i: 1)); |
// ADD/SUB with 1/-1 can use inc/dec when the carry flag isn't used.
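// E.g. "addl $1, (%rdi)" becomes "incl (%rdi)", dropping the immediate
// byte from the encoding.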
| 3710 | if ((IsOne || IsNegOne) && hasNoCarryFlagUses(Flags: StoredVal.getValue(R: 1))) { |
| 3711 | unsigned NewOpc = |
| 3712 | ((Opc == X86ISD::ADD) == IsOne) |
| 3713 | ? SelectOpcode(X86::INC64m, X86::INC32m, X86::INC16m, X86::INC8m) |
| 3714 | : SelectOpcode(X86::DEC64m, X86::DEC32m, X86::DEC16m, X86::DEC8m); |
| 3715 | const SDValue Ops[] = {Base, Scale, Index, Disp, Segment, InputChain}; |
| 3716 | Result = CurDAG->getMachineNode(Opcode: NewOpc, dl: SDLoc(Node), VT1: MVT::i32, |
| 3717 | VT2: MVT::Other, Ops); |
| 3718 | break; |
| 3719 | } |
| 3720 | } |
| 3721 | [[fallthrough]]; |
| 3722 | case X86ISD::ADC: |
| 3723 | case X86ISD::SBB: |
| 3724 | case X86ISD::AND: |
| 3725 | case X86ISD::OR: |
| 3726 | case X86ISD::XOR: { |
| 3727 | auto SelectRegOpcode = [SelectOpcode](unsigned Opc) { |
| 3728 | switch (Opc) { |
| 3729 | case X86ISD::ADD: |
| 3730 | return SelectOpcode(X86::ADD64mr, X86::ADD32mr, X86::ADD16mr, |
| 3731 | X86::ADD8mr); |
| 3732 | case X86ISD::ADC: |
| 3733 | return SelectOpcode(X86::ADC64mr, X86::ADC32mr, X86::ADC16mr, |
| 3734 | X86::ADC8mr); |
| 3735 | case X86ISD::SUB: |
| 3736 | return SelectOpcode(X86::SUB64mr, X86::SUB32mr, X86::SUB16mr, |
| 3737 | X86::SUB8mr); |
| 3738 | case X86ISD::SBB: |
| 3739 | return SelectOpcode(X86::SBB64mr, X86::SBB32mr, X86::SBB16mr, |
| 3740 | X86::SBB8mr); |
| 3741 | case X86ISD::AND: |
| 3742 | return SelectOpcode(X86::AND64mr, X86::AND32mr, X86::AND16mr, |
| 3743 | X86::AND8mr); |
| 3744 | case X86ISD::OR: |
| 3745 | return SelectOpcode(X86::OR64mr, X86::OR32mr, X86::OR16mr, X86::OR8mr); |
| 3746 | case X86ISD::XOR: |
| 3747 | return SelectOpcode(X86::XOR64mr, X86::XOR32mr, X86::XOR16mr, |
| 3748 | X86::XOR8mr); |
| 3749 | default: |
| 3750 | llvm_unreachable("Invalid opcode!" ); |
| 3751 | } |
| 3752 | }; |
| 3753 | auto SelectImmOpcode = [SelectOpcode](unsigned Opc) { |
| 3754 | switch (Opc) { |
| 3755 | case X86ISD::ADD: |
| 3756 | return SelectOpcode(X86::ADD64mi32, X86::ADD32mi, X86::ADD16mi, |
| 3757 | X86::ADD8mi); |
| 3758 | case X86ISD::ADC: |
| 3759 | return SelectOpcode(X86::ADC64mi32, X86::ADC32mi, X86::ADC16mi, |
| 3760 | X86::ADC8mi); |
| 3761 | case X86ISD::SUB: |
| 3762 | return SelectOpcode(X86::SUB64mi32, X86::SUB32mi, X86::SUB16mi, |
| 3763 | X86::SUB8mi); |
| 3764 | case X86ISD::SBB: |
| 3765 | return SelectOpcode(X86::SBB64mi32, X86::SBB32mi, X86::SBB16mi, |
| 3766 | X86::SBB8mi); |
| 3767 | case X86ISD::AND: |
| 3768 | return SelectOpcode(X86::AND64mi32, X86::AND32mi, X86::AND16mi, |
| 3769 | X86::AND8mi); |
| 3770 | case X86ISD::OR: |
| 3771 | return SelectOpcode(X86::OR64mi32, X86::OR32mi, X86::OR16mi, |
| 3772 | X86::OR8mi); |
| 3773 | case X86ISD::XOR: |
| 3774 | return SelectOpcode(X86::XOR64mi32, X86::XOR32mi, X86::XOR16mi, |
| 3775 | X86::XOR8mi); |
| 3776 | default: |
| 3777 | llvm_unreachable("Invalid opcode!" ); |
| 3778 | } |
| 3779 | }; |
| 3780 | |
| 3781 | unsigned NewOpc = SelectRegOpcode(Opc); |
| 3782 | SDValue Operand = StoredVal->getOperand(Num: 1-LoadOpNo); |
| 3783 | |
| 3784 | // See if the operand is a constant that we can fold into an immediate |
| 3785 | // operand. |
| 3786 | if (auto *OperandC = dyn_cast<ConstantSDNode>(Val&: Operand)) { |
| 3787 | int64_t OperandV = OperandC->getSExtValue(); |
| 3788 | |
| 3789 | // Check if we can shrink the operand enough to fit in an immediate (or |
| 3790 | // fit into a smaller immediate) by negating it and switching the |
| 3791 | // operation. |
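// E.g. ADD $128 does not fit in an imm8, but the equivalent SUB $-128
// does; likewise a 64-bit ADD of 0x80000000 can become a SUB of
// -0x80000000, which fits in a sign-extended imm32.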
| 3792 | if ((Opc == X86ISD::ADD || Opc == X86ISD::SUB) && |
| 3793 | ((MemVT != MVT::i8 && !isInt<8>(x: OperandV) && isInt<8>(x: -OperandV)) || |
| 3794 | (MemVT == MVT::i64 && !isInt<32>(x: OperandV) && |
| 3795 | isInt<32>(x: -OperandV))) && |
| 3796 | hasNoCarryFlagUses(Flags: StoredVal.getValue(R: 1))) { |
| 3797 | OperandV = -OperandV; |
| 3798 | Opc = Opc == X86ISD::ADD ? X86ISD::SUB : X86ISD::ADD; |
| 3799 | } |
| 3800 | |
| 3801 | if (MemVT != MVT::i64 || isInt<32>(x: OperandV)) { |
| 3802 | Operand = CurDAG->getSignedTargetConstant(Val: OperandV, DL: SDLoc(Node), VT: MemVT); |
| 3803 | NewOpc = SelectImmOpcode(Opc); |
| 3804 | } |
| 3805 | } |
| 3806 | |
| 3807 | if (Opc == X86ISD::ADC || Opc == X86ISD::SBB) { |
| 3808 | SDValue CopyTo = |
| 3809 | CurDAG->getCopyToReg(Chain: InputChain, dl: SDLoc(Node), Reg: X86::EFLAGS, |
| 3810 | N: StoredVal.getOperand(i: 2), Glue: SDValue()); |
| 3811 | |
| 3812 | const SDValue Ops[] = {Base, Scale, Index, Disp, |
| 3813 | Segment, Operand, CopyTo, CopyTo.getValue(R: 1)}; |
| 3814 | Result = CurDAG->getMachineNode(Opcode: NewOpc, dl: SDLoc(Node), VT1: MVT::i32, VT2: MVT::Other, |
| 3815 | Ops); |
| 3816 | } else { |
| 3817 | const SDValue Ops[] = {Base, Scale, Index, Disp, |
| 3818 | Segment, Operand, InputChain}; |
| 3819 | Result = CurDAG->getMachineNode(Opcode: NewOpc, dl: SDLoc(Node), VT1: MVT::i32, VT2: MVT::Other, |
| 3820 | Ops); |
| 3821 | } |
| 3822 | break; |
| 3823 | } |
| 3824 | default: |
| 3825 | llvm_unreachable("Invalid opcode!" ); |
| 3826 | } |
| 3827 | |
| 3828 | MachineMemOperand *MemOps[] = {StoreNode->getMemOperand(), |
| 3829 | LoadNode->getMemOperand()}; |
| 3830 | CurDAG->setNodeMemRefs(N: Result, NewMemRefs: MemOps); |
| 3831 | |
| 3832 | // Update Load Chain uses as well. |
| 3833 | ReplaceUses(F: SDValue(LoadNode, 1), T: SDValue(Result, 1)); |
| 3834 | ReplaceUses(F: SDValue(StoreNode, 0), T: SDValue(Result, 1)); |
| 3835 | ReplaceUses(F: SDValue(StoredVal.getNode(), 1), T: SDValue(Result, 0)); |
| 3836 | CurDAG->RemoveDeadNode(N: Node); |
| 3837 | return true; |
| 3838 | } |
| 3839 | |
| 3840 | // See if this is an X & Mask that we can match to BEXTR/BZHI. |
| 3841 | // Where Mask is one of the following patterns: |
| 3842 | // a) x & (1 << nbits) - 1 |
| 3843 | // b) x & ~(-1 << nbits) |
| 3844 | // c) x & (-1 >> (32 - y)) |
| 3845 | // d) x << (32 - y) >> (32 - y) |
| 3846 | // e) (1 << nbits) - 1 |
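// E.g. pattern (a) with nbits == 8 is "x & 0xFF"; BZHI computes this
// directly from the width operand without materializing the mask.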
bool X86DAGToDAGISel::matchBitExtract(SDNode *Node) {
| 3848 | assert( |
| 3849 | (Node->getOpcode() == ISD::ADD || Node->getOpcode() == ISD::AND || |
| 3850 | Node->getOpcode() == ISD::SRL) && |
| 3851 | "Should be either an and-mask, or right-shift after clearing high bits." ); |
| 3852 | |
| 3853 | // BEXTR is BMI instruction, BZHI is BMI2 instruction. We need at least one. |
| 3854 | if (!Subtarget->hasBMI() && !Subtarget->hasBMI2()) |
| 3855 | return false; |
| 3856 | |
| 3857 | MVT NVT = Node->getSimpleValueType(ResNo: 0); |
| 3858 | |
| 3859 | // Only supported for 32 and 64 bits. |
| 3860 | if (NVT != MVT::i32 && NVT != MVT::i64) |
| 3861 | return false; |
| 3862 | |
| 3863 | SDValue NBits; |
| 3864 | bool NegateNBits; |
| 3865 | |
// If we have BMI2's BZHI, we are ok with multi-use patterns.
| 3867 | // Else, if we only have BMI1's BEXTR, we require one-use. |
const bool AllowExtraUsesByDefault = Subtarget->hasBMI2();
| 3869 | auto checkUses = [AllowExtraUsesByDefault]( |
| 3870 | SDValue Op, unsigned NUses, |
std::optional<bool> AllowExtraUses) {
| 3872 | return AllowExtraUses.value_or(u: AllowExtraUsesByDefault) || |
| 3873 | Op.getNode()->hasNUsesOfValue(NUses, Value: Op.getResNo()); |
| 3874 | }; |
| 3875 | auto checkOneUse = [checkUses](SDValue Op, |
| 3876 | std::optional<bool> AllowExtraUses =
| 3877 | std::nullopt) {
| 3878 | return checkUses(Op, 1, AllowExtraUses); |
| 3879 | }; |
| 3880 | auto checkTwoUse = [checkUses](SDValue Op, |
| 3881 | std::optional<bool> AllowExtraUses =
| 3882 | std::nullopt) {
| 3883 | return checkUses(Op, 2, AllowExtraUses); |
| 3884 | }; |
| 3885 | |
| 3886 | auto peekThroughOneUseTruncation = [checkOneUse](SDValue V) { |
| 3887 | if (V->getOpcode() == ISD::TRUNCATE && checkOneUse(V)) { |
| 3888 | assert(V.getSimpleValueType() == MVT::i32 && |
| 3889 | V.getOperand(0).getSimpleValueType() == MVT::i64 && |
| 3890 | "Expected i64 -> i32 truncation" ); |
| 3891 | V = V.getOperand(i: 0); |
| 3892 | } |
| 3893 | return V; |
| 3894 | }; |
| 3895 | |
| 3896 | // a) x & ((1 << nbits) + (-1)) |
| 3897 | auto matchPatternA = [checkOneUse, peekThroughOneUseTruncation, &NBits, |
| 3898 | &NegateNBits](SDValue Mask) -> bool { |
| 3899 | // Match `add`. Must only have one use! |
| 3900 | if (Mask->getOpcode() != ISD::ADD || !checkOneUse(Mask)) |
| 3901 | return false; |
| 3902 | // We should be adding an all-ones constant (i.e. subtracting one).
| 3903 | if (!isAllOnesConstant(V: Mask->getOperand(Num: 1))) |
| 3904 | return false; |
| 3905 | // Match `1 << nbits`. Might be truncated. Must only have one use! |
| 3906 | SDValue M0 = peekThroughOneUseTruncation(Mask->getOperand(Num: 0)); |
| 3907 | if (M0->getOpcode() != ISD::SHL || !checkOneUse(M0)) |
| 3908 | return false; |
| 3909 | if (!isOneConstant(V: M0->getOperand(Num: 0))) |
| 3910 | return false; |
| 3911 | NBits = M0->getOperand(Num: 1); |
| 3912 | NegateNBits = false; |
| 3913 | return true; |
| 3914 | }; |
| 3915 | |
| 3916 | auto isAllOnes = [this, peekThroughOneUseTruncation, NVT](SDValue V) { |
| 3917 | V = peekThroughOneUseTruncation(V); |
| 3918 | return CurDAG->MaskedValueIsAllOnes( |
| 3919 | Op: V, Mask: APInt::getLowBitsSet(numBits: V.getSimpleValueType().getSizeInBits(), |
| 3920 | loBitsSet: NVT.getSizeInBits())); |
| 3921 | }; |
| 3922 | |
| 3923 | // b) x & ~(-1 << nbits) |
| 3924 | auto matchPatternB = [checkOneUse, isAllOnes, peekThroughOneUseTruncation, |
| 3925 | &NBits, &NegateNBits](SDValue Mask) -> bool { |
| 3926 | // Match `~()`. Must only have one use! |
| 3927 | if (Mask.getOpcode() != ISD::XOR || !checkOneUse(Mask)) |
| 3928 | return false; |
| 3929 | // The -1 only has to be all-ones for the final Node's NVT. |
| 3930 | if (!isAllOnes(Mask->getOperand(Num: 1))) |
| 3931 | return false; |
| 3932 | // Match `-1 << nbits`. Might be truncated. Must only have one use! |
| 3933 | SDValue M0 = peekThroughOneUseTruncation(Mask->getOperand(Num: 0)); |
| 3934 | if (M0->getOpcode() != ISD::SHL || !checkOneUse(M0)) |
| 3935 | return false; |
| 3936 | // The -1 only has to be all-ones for the final Node's NVT. |
| 3937 | if (!isAllOnes(M0->getOperand(Num: 0))) |
| 3938 | return false; |
| 3939 | NBits = M0->getOperand(Num: 1); |
| 3940 | NegateNBits = false; |
| 3941 | return true; |
| 3942 | }; |
| 3943 | |
| 3944 | // Try to match potentially-truncated shift amount as `(bitwidth - y)`, |
| 3945 | // or leave the shift amount as-is, but then we'll have to negate it. |
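| | // e.g. for pattern c) on i32, `x & (-1 >> (32 - y))` keeps the low y bits;
| | // if the amount literally has the (32 - y) shape we take NBits = y directly,
| | // otherwise we record z and compute (32 - z) ourselves later.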
| 3946 | auto canonicalizeShiftAmt = [&NBits, &NegateNBits](SDValue ShiftAmt, |
| 3947 | unsigned Bitwidth) { |
| 3948 | NBits = ShiftAmt; |
| 3949 | NegateNBits = true; |
| 3950 | // Skip over a truncate of the shift amount, if any. |
| 3951 | if (NBits.getOpcode() == ISD::TRUNCATE) |
| 3952 | NBits = NBits.getOperand(i: 0); |
| 3953 | // Try to match the shift amount as (bitwidth - y). It should go away, too. |
| 3954 | // If it doesn't match, that's fine, we'll just negate it ourselves. |
| 3955 | if (NBits.getOpcode() != ISD::SUB) |
| 3956 | return; |
| 3957 | auto *V0 = dyn_cast<ConstantSDNode>(Val: NBits.getOperand(i: 0)); |
| 3958 | if (!V0 || V0->getZExtValue() != Bitwidth) |
| 3959 | return; |
| 3960 | NBits = NBits.getOperand(i: 1); |
| 3961 | NegateNBits = false; |
| 3962 | }; |
| 3963 | |
| 3964 | // c) x & (-1 >> z) but then we'll have to subtract z from bitwidth |
| 3965 | // or |
| 3966 | // c) x & (-1 >> (32 - y)) |
| 3967 | auto matchPatternC = [checkOneUse, peekThroughOneUseTruncation, &NegateNBits, |
| 3968 | canonicalizeShiftAmt](SDValue Mask) -> bool { |
| 3969 | // The mask itself may be truncated. |
| 3970 | Mask = peekThroughOneUseTruncation(Mask); |
| 3971 | unsigned Bitwidth = Mask.getSimpleValueType().getSizeInBits(); |
| 3972 | // Match `l>>`. Must only have one use! |
| 3973 | if (Mask.getOpcode() != ISD::SRL || !checkOneUse(Mask)) |
| 3974 | return false; |
| 3975 | // We should be shifting a truly all-ones constant.
| 3976 | if (!isAllOnesConstant(V: Mask.getOperand(i: 0))) |
| 3977 | return false; |
| 3978 | SDValue M1 = Mask.getOperand(i: 1); |
| 3979 | // The shift amount should not be used externally. |
| 3980 | if (!checkOneUse(M1)) |
| 3981 | return false; |
| 3982 | canonicalizeShiftAmt(M1, Bitwidth); |
| 3983 | // Pattern c. is non-canonical, and is expanded into pattern d. iff there |
| 3984 | // is no extra use of the mask. Clearly, there was one since we are here. |
| 3985 | // But at the same time, if we need to negate the shift amount, |
| 3986 | // then we don't want the mask to stick around, else it's unprofitable. |
| 3987 | return !NegateNBits; |
| 3988 | }; |
| 3989 | |
| 3990 | SDValue X; |
| 3991 | |
| 3992 | // d) x << z >> z but then we'll have to subtract z from bitwidth |
| 3993 | // or |
| 3994 | // d) x << (32 - y) >> (32 - y) |
| 3995 | auto matchPatternD = [checkOneUse, checkTwoUse, canonicalizeShiftAmt, |
| 3996 | AllowExtraUsesByDefault, &NegateNBits, |
| 3997 | &X](SDNode *Node) -> bool { |
| 3998 | if (Node->getOpcode() != ISD::SRL) |
| 3999 | return false; |
| 4000 | SDValue N0 = Node->getOperand(Num: 0); |
| 4001 | if (N0->getOpcode() != ISD::SHL) |
| 4002 | return false; |
| 4003 | unsigned Bitwidth = N0.getSimpleValueType().getSizeInBits(); |
| 4004 | SDValue N1 = Node->getOperand(Num: 1); |
| 4005 | SDValue N01 = N0->getOperand(Num: 1); |
| 4006 | // Both of the shifts must be by the exact same value. |
| 4007 | if (N1 != N01) |
| 4008 | return false; |
| 4009 | canonicalizeShiftAmt(N1, Bitwidth); |
| 4010 | // There should not be any external uses of the inner shift / shift amount. |
| 4011 | // Note that while we are generally okay with external uses given BMI2, |
| 4012 | // iff we need to negate the shift amount, we are not okay with extra uses. |
| 4013 | const bool AllowExtraUses = AllowExtraUsesByDefault && !NegateNBits;
| 4014 | if (!checkOneUse(N0, AllowExtraUses) || !checkTwoUse(N1, AllowExtraUses)) |
| 4015 | return false; |
| 4016 | X = N0->getOperand(Num: 0); |
| 4017 | return true; |
| 4018 | }; |
| 4019 | |
| 4020 | auto matchLowBitMask = [matchPatternA, matchPatternB, |
| 4021 | matchPatternC](SDValue Mask) -> bool { |
| 4022 | return matchPatternA(Mask) || matchPatternB(Mask) || matchPatternC(Mask); |
| 4023 | }; |
| 4024 | |
| 4025 | if (Node->getOpcode() == ISD::AND) { |
| 4026 | X = Node->getOperand(Num: 0); |
| 4027 | SDValue Mask = Node->getOperand(Num: 1); |
| 4028 | |
| 4029 | if (matchLowBitMask(Mask)) { |
| 4030 | // Great. |
| 4031 | } else { |
| 4032 | std::swap(a&: X, b&: Mask); |
| 4033 | if (!matchLowBitMask(Mask)) |
| 4034 | return false; |
| 4035 | } |
| 4036 | } else if (matchLowBitMask(SDValue(Node, 0))) { |
| 4037 | X = CurDAG->getAllOnesConstant(DL: SDLoc(Node), VT: NVT); |
| 4038 | } else if (!matchPatternD(Node)) |
| 4039 | return false; |
| 4040 | |
| 4041 | // If we need to negate the shift amount, require BMI2 BZHI support. |
| 4042 | // It's just too unprofitable for BMI1 BEXTR. |
| 4043 | if (NegateNBits && !Subtarget->hasBMI2()) |
| 4044 | return false; |
| 4045 | |
| 4046 | SDLoc DL(Node); |
| 4047 | |
| 4048 | // Truncate the shift amount. |
| 4049 | NBits = CurDAG->getNode(Opcode: ISD::TRUNCATE, DL, VT: MVT::i8, Operand: NBits); |
| 4050 | insertDAGNode(DAG&: *CurDAG, Pos: SDValue(Node, 0), N: NBits); |
| 4051 | |
| 4052 | // Insert 8-bit NBits into lowest 8 bits of 32-bit register. |
| 4053 | // All the other bits are undefined, we do not care about them. |
| 4054 | SDValue ImplDef = SDValue( |
| 4055 | CurDAG->getMachineNode(Opcode: TargetOpcode::IMPLICIT_DEF, dl: DL, VT: MVT::i32), 0); |
| 4056 | insertDAGNode(DAG&: *CurDAG, Pos: SDValue(Node, 0), N: ImplDef); |
| 4057 | |
| 4058 | SDValue SRIdxVal = CurDAG->getTargetConstant(Val: X86::sub_8bit, DL, VT: MVT::i32); |
| 4059 | insertDAGNode(DAG&: *CurDAG, Pos: SDValue(Node, 0), N: SRIdxVal); |
| 4060 | NBits = SDValue(CurDAG->getMachineNode(Opcode: TargetOpcode::INSERT_SUBREG, dl: DL, |
| 4061 | VT: MVT::i32, Op1: ImplDef, Op2: NBits, Op3: SRIdxVal), |
| 4062 | 0); |
| 4063 | insertDAGNode(DAG&: *CurDAG, Pos: SDValue(Node, 0), N: NBits); |
| 4064 | |
| 4065 | // We might have matched the amount of high bits to be cleared, |
| 4066 | // but we want the amount of low bits to be kept, so negate it then. |
| 4067 | if (NegateNBits) { |
| 4068 | SDValue BitWidthC = CurDAG->getConstant(Val: NVT.getSizeInBits(), DL, VT: MVT::i32); |
| 4069 | insertDAGNode(DAG&: *CurDAG, Pos: SDValue(Node, 0), N: BitWidthC); |
| 4070 | |
| 4071 | NBits = CurDAG->getNode(Opcode: ISD::SUB, DL, VT: MVT::i32, N1: BitWidthC, N2: NBits); |
| 4072 | insertDAGNode(DAG&: *CurDAG, Pos: SDValue(Node, 0), N: NBits); |
| 4073 | } |
| 4074 | |
| 4075 | if (Subtarget->hasBMI2()) { |
| 4076 | // Great, just emit the BZHI.
| 4077 | if (NVT != MVT::i32) { |
| 4078 | // But have to place the bit count into the wide-enough register first. |
| 4079 | NBits = CurDAG->getNode(Opcode: ISD::ANY_EXTEND, DL, VT: NVT, Operand: NBits); |
| 4080 | insertDAGNode(DAG&: *CurDAG, Pos: SDValue(Node, 0), N: NBits); |
| 4081 | } |
| 4082 | |
| 4083 | SDValue Extract = CurDAG->getNode(Opcode: X86ISD::BZHI, DL, VT: NVT, N1: X, N2: NBits);
| 4084 | ReplaceNode(F: Node, T: Extract.getNode()); |
| 4085 | SelectCode(N: Extract.getNode()); |
| 4086 | return true; |
| 4087 | } |
| 4088 | |
| 4089 | // Else, if we do *NOT* have BMI2, let's find out if 'X' is *logically*
| 4090 | // shifted (potentially with a one-use trunc in between), and if the
| 4091 | // truncation was the only use of the shift; if so, look past the
| 4092 | // one-use truncation.
| 4093 | { |
| 4094 | SDValue RealX = peekThroughOneUseTruncation(X); |
| 4095 | // FIXME: only if the shift is one-use? |
| 4096 | if (RealX != X && RealX.getOpcode() == ISD::SRL) |
| 4097 | X = RealX; |
| 4098 | } |
| 4099 | |
| 4100 | MVT XVT = X.getSimpleValueType(); |
| 4101 | |
| 4102 | // Else, emitting BEXTR requires one more step. |
| 4103 | // The 'control' of BEXTR has the pattern of: |
| 4104 | // [15...8 bit][ 7...0 bit] location |
| 4105 | // [ bit count][ shift] name |
| 4106 | // I.e. 0b000000011'00000001 means (x >> 0b1) & 0b11 |
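| | // For example (illustrative values): extracting 3 bits starting at bit 5
| | // uses control = (3 << 8) | 5 = 0x0305, so BEXTR(x, 0x0305) == (x >> 5) & 0b111.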
| 4107 | |
| 4108 | // Shift NBits left by 8 bits, thus producing 'control'. |
| 4109 | // This leaves the low 8 bits zero.
| 4110 | SDValue C8 = CurDAG->getConstant(Val: 8, DL, VT: MVT::i8); |
| 4111 | insertDAGNode(DAG&: *CurDAG, Pos: SDValue(Node, 0), N: C8); |
| 4112 | SDValue Control = CurDAG->getNode(Opcode: ISD::SHL, DL, VT: MVT::i32, N1: NBits, N2: C8); |
| 4113 | insertDAGNode(DAG&: *CurDAG, Pos: SDValue(Node, 0), N: Control); |
| 4114 | |
| 4115 | // If the 'X' is *logically* shifted, we can fold that shift into 'control'. |
| 4116 | // FIXME: only if the shift is one-use? |
| 4117 | if (X.getOpcode() == ISD::SRL) { |
| 4118 | SDValue ShiftAmt = X.getOperand(i: 1); |
| 4119 | X = X.getOperand(i: 0); |
| 4120 | |
| 4121 | assert(ShiftAmt.getValueType() == MVT::i8 && |
| 4122 | "Expected shift amount to be i8" ); |
| 4123 | |
| 4124 | // Now, *zero*-extend the shift amount. The bits 8...15 *must* be zero! |
| 4125 | // We could zext to i16 in some form, but we intentionally don't do that. |
| 4126 | SDValue OrigShiftAmt = ShiftAmt; |
| 4127 | ShiftAmt = CurDAG->getNode(Opcode: ISD::ZERO_EXTEND, DL, VT: MVT::i32, Operand: ShiftAmt); |
| 4128 | insertDAGNode(DAG&: *CurDAG, Pos: OrigShiftAmt, N: ShiftAmt); |
| 4129 | |
| 4130 | // And now 'or' these low 8 bits of shift amount into the 'control'. |
| 4131 | Control = CurDAG->getNode(Opcode: ISD::OR, DL, VT: MVT::i32, N1: Control, N2: ShiftAmt); |
| 4132 | insertDAGNode(DAG&: *CurDAG, Pos: SDValue(Node, 0), N: Control); |
| 4133 | } |
| 4134 | |
| 4135 | // But have to place the 'control' into the wide-enough register first. |
| 4136 | if (XVT != MVT::i32) { |
| 4137 | Control = CurDAG->getNode(Opcode: ISD::ANY_EXTEND, DL, VT: XVT, Operand: Control); |
| 4138 | insertDAGNode(DAG&: *CurDAG, Pos: SDValue(Node, 0), N: Control); |
| 4139 | } |
| 4140 | |
| 4141 | // And finally, form the BEXTR itself. |
| 4142 | SDValue Extract = CurDAG->getNode(Opcode: X86ISD::BEXTR, DL, VT: XVT, N1: X, N2: Control);
| 4143 | |
| 4144 | // The 'X' was originally truncated. Do that now. |
| 4145 | if (XVT != NVT) { |
| 4146 | insertDAGNode(DAG&: *CurDAG, Pos: SDValue(Node, 0), N: Extract); |
| 4147 | Extract = CurDAG->getNode(Opcode: ISD::TRUNCATE, DL, VT: NVT, Operand: Extract); |
| 4148 | } |
| 4149 | |
| 4150 | ReplaceNode(F: Node, T: Extract.getNode()); |
| 4151 | SelectCode(N: Extract.getNode()); |
| 4152 | |
| 4153 | return true; |
| 4154 | } |
| 4155 | |
| 4156 | // See if this is an (X >> C1) & C2 that we can match to BEXTR/BEXTRI. |
| 4157 | MachineSDNode *X86DAGToDAGISel::matchBEXTRFromAndImm(SDNode *Node) { |
| 4158 | MVT NVT = Node->getSimpleValueType(ResNo: 0); |
| 4159 | SDLoc dl(Node); |
| 4160 | |
| 4161 | SDValue N0 = Node->getOperand(Num: 0); |
| 4162 | SDValue N1 = Node->getOperand(Num: 1); |
| 4163 | |
| 4164 | // If we have TBM we can use an immediate for the control. If we have BMI |
| 4165 | // we should only do this if the BEXTR instruction is implemented well. |
| 4166 | // Otherwise moving the control into a register makes this more costly. |
| 4167 | // TODO: Maybe load folding, greater than 32-bit masks, or a guarantee of LICM |
| 4168 | // hoisting the move immediate would make it worthwhile with a less optimal |
| 4169 | // BEXTR? |
| 4170 | bool PreferBEXTR = |
| 4171 | Subtarget->hasTBM() || (Subtarget->hasBMI() && Subtarget->hasFastBEXTR()); |
| 4172 | if (!PreferBEXTR && !Subtarget->hasBMI2()) |
| 4173 | return nullptr; |
| 4174 | |
| 4175 | // Must have a shift right. |
| 4176 | if (N0->getOpcode() != ISD::SRL && N0->getOpcode() != ISD::SRA) |
| 4177 | return nullptr; |
| 4178 | |
| 4179 | // Shift can't have additional users. |
| 4180 | if (!N0->hasOneUse()) |
| 4181 | return nullptr; |
| 4182 | |
| 4183 | // Only supported for 32 and 64 bits. |
| 4184 | if (NVT != MVT::i32 && NVT != MVT::i64) |
| 4185 | return nullptr; |
| 4186 | |
| 4187 | // Shift amount and RHS of and must be constant. |
| 4188 | auto *MaskCst = dyn_cast<ConstantSDNode>(Val&: N1); |
| 4189 | auto *ShiftCst = dyn_cast<ConstantSDNode>(Val: N0->getOperand(Num: 1)); |
| 4190 | if (!MaskCst || !ShiftCst) |
| 4191 | return nullptr; |
| 4192 | |
| 4193 | // And RHS must be a mask. |
| 4194 | uint64_t Mask = MaskCst->getZExtValue(); |
| 4195 | if (!isMask_64(Value: Mask)) |
| 4196 | return nullptr; |
| 4197 | |
| 4198 | uint64_t Shift = ShiftCst->getZExtValue(); |
| 4199 | uint64_t MaskSize = llvm::popcount(Value: Mask); |
| 4200 | |
| 4201 | // Don't interfere with something that can be handled by extracting AH. |
| 4202 | // TODO: If we are able to fold a load, BEXTR might still be better than AH. |
| 4203 | if (Shift == 8 && MaskSize == 8) |
| 4204 | return nullptr; |
| 4205 | |
| 4206 | // Make sure we are only using bits that were in the original value, not |
| 4207 | // shifted in. |
| 4208 | if (Shift + MaskSize > NVT.getSizeInBits()) |
| 4209 | return nullptr; |
| 4210 | |
| 4211 | // BZHI, if available, is always fast, unlike BEXTR. But even if we decide |
| 4212 | // that we can't use BEXTR, it is only worthwhile using BZHI if the mask |
| 4213 | // does not fit into 32 bits. Load folding is not a sufficient reason. |
| 4214 | if (!PreferBEXTR && MaskSize <= 32) |
| 4215 | return nullptr; |
| 4216 | |
| 4217 | SDValue Control; |
| 4218 | unsigned ROpc, MOpc; |
| 4219 | |
| 4220 | #define GET_EGPR_IF_ENABLED(OPC) (Subtarget->hasEGPR() ? OPC##_EVEX : OPC) |
| 4221 | if (!PreferBEXTR) { |
| 4222 | assert(Subtarget->hasBMI2() && "We must have BMI2's BZHI then." ); |
| 4223 | // If we can't make use of BEXTR then we can't fuse shift+mask stages. |
| 4224 | // Let's perform the mask first, and apply the shift later. Note that we
| 4225 | // need to widen the mask to account for the shift we'll apply afterwards!
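| | // e.g. for (x >> 4) & 0xFF (Shift = 4, MaskSize = 8), BZHI with a bound of
| | // 12 keeps the low 12 bits, and the SHR by 4 emitted below leaves the
| | // desired 8 bits.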
| 4226 | Control = CurDAG->getTargetConstant(Val: Shift + MaskSize, DL: dl, VT: NVT); |
| 4227 | ROpc = NVT == MVT::i64 ? GET_EGPR_IF_ENABLED(X86::BZHI64rr) |
| 4228 | : GET_EGPR_IF_ENABLED(X86::BZHI32rr); |
| 4229 | MOpc = NVT == MVT::i64 ? GET_EGPR_IF_ENABLED(X86::BZHI64rm) |
| 4230 | : GET_EGPR_IF_ENABLED(X86::BZHI32rm); |
| 4231 | unsigned NewOpc = NVT == MVT::i64 ? X86::MOV32ri64 : X86::MOV32ri; |
| 4232 | Control = SDValue(CurDAG->getMachineNode(Opcode: NewOpc, dl, VT: NVT, Op1: Control), 0); |
| 4233 | } else { |
| 4234 | // The 'control' of BEXTR has the pattern of: |
| 4235 | // [15...8 bit][ 7...0 bit] location |
| 4236 | // [ bit count][ shift] name |
| 4237 | // I.e. 0b000000011'00000001 means (x >> 0b1) & 0b11 |
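| | // e.g. (x >> 4) & 0xFF gives Shift = 4, MaskSize = 8, Control = 0x0804.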
| 4238 | Control = CurDAG->getTargetConstant(Val: Shift | (MaskSize << 8), DL: dl, VT: NVT); |
| 4239 | if (Subtarget->hasTBM()) { |
| 4240 | ROpc = NVT == MVT::i64 ? X86::BEXTRI64ri : X86::BEXTRI32ri; |
| 4241 | MOpc = NVT == MVT::i64 ? X86::BEXTRI64mi : X86::BEXTRI32mi; |
| 4242 | } else { |
| 4243 | assert(Subtarget->hasBMI() && "We must have BMI1's BEXTR then." ); |
| 4244 | // BMI requires the immediate to be placed in a register.
| 4245 | ROpc = NVT == MVT::i64 ? GET_EGPR_IF_ENABLED(X86::BEXTR64rr) |
| 4246 | : GET_EGPR_IF_ENABLED(X86::BEXTR32rr); |
| 4247 | MOpc = NVT == MVT::i64 ? GET_EGPR_IF_ENABLED(X86::BEXTR64rm) |
| 4248 | : GET_EGPR_IF_ENABLED(X86::BEXTR32rm); |
| 4249 | unsigned NewOpc = NVT == MVT::i64 ? X86::MOV32ri64 : X86::MOV32ri; |
| 4250 | Control = SDValue(CurDAG->getMachineNode(Opcode: NewOpc, dl, VT: NVT, Op1: Control), 0); |
| 4251 | } |
| 4252 | } |
| 4253 | |
| 4254 | MachineSDNode *NewNode; |
| 4255 | SDValue Input = N0->getOperand(Num: 0); |
| 4256 | SDValue Tmp0, Tmp1, Tmp2, Tmp3, Tmp4; |
| 4257 | if (tryFoldLoad(Root: Node, P: N0.getNode(), N: Input, Base&: Tmp0, Scale&: Tmp1, Index&: Tmp2, Disp&: Tmp3, Segment&: Tmp4)) { |
| 4258 | SDValue Ops[] = { |
| 4259 | Tmp0, Tmp1, Tmp2, Tmp3, Tmp4, Control, Input.getOperand(i: 0)}; |
| 4260 | SDVTList VTs = CurDAG->getVTList(VT1: NVT, VT2: MVT::i32, VT3: MVT::Other); |
| 4261 | NewNode = CurDAG->getMachineNode(Opcode: MOpc, dl, VTs, Ops); |
| 4262 | // Update the chain. |
| 4263 | ReplaceUses(F: Input.getValue(R: 1), T: SDValue(NewNode, 2)); |
| 4264 | // Record the mem-refs |
| 4265 | CurDAG->setNodeMemRefs(N: NewNode, NewMemRefs: {cast<LoadSDNode>(Val&: Input)->getMemOperand()}); |
| 4266 | } else { |
| 4267 | NewNode = CurDAG->getMachineNode(Opcode: ROpc, dl, VT1: NVT, VT2: MVT::i32, Op1: Input, Op2: Control); |
| 4268 | } |
| 4269 | |
| 4270 | if (!PreferBEXTR) { |
| 4271 | // We still need to apply the shift. |
| 4272 | SDValue ShAmt = CurDAG->getTargetConstant(Val: Shift, DL: dl, VT: NVT); |
| 4273 | unsigned NewOpc = NVT == MVT::i64 ? GET_ND_IF_ENABLED(X86::SHR64ri) |
| 4274 | : GET_ND_IF_ENABLED(X86::SHR32ri); |
| 4275 | NewNode = |
| 4276 | CurDAG->getMachineNode(Opcode: NewOpc, dl, VT: NVT, Op1: SDValue(NewNode, 0), Op2: ShAmt); |
| 4277 | } |
| 4278 | |
| 4279 | return NewNode; |
| 4280 | } |
| 4281 | |
| 4282 | // Emit a PCMPISTR(I/M) instruction.
| 4283 | MachineSDNode *X86DAGToDAGISel::emitPCMPISTR(unsigned ROpc, unsigned MOpc, |
| 4284 | bool MayFoldLoad, const SDLoc &dl, |
| 4285 | MVT VT, SDNode *Node) { |
| 4286 | SDValue N0 = Node->getOperand(Num: 0); |
| 4287 | SDValue N1 = Node->getOperand(Num: 1); |
| 4288 | SDValue Imm = Node->getOperand(Num: 2); |
| 4289 | auto *Val = cast<ConstantSDNode>(Val&: Imm)->getConstantIntValue(); |
| 4290 | Imm = CurDAG->getTargetConstant(Val: *Val, DL: SDLoc(Node), VT: Imm.getValueType()); |
| 4291 | |
| 4292 | // Try to fold a load. No need to check alignment. |
| 4293 | SDValue Tmp0, Tmp1, Tmp2, Tmp3, Tmp4; |
| 4294 | if (MayFoldLoad && tryFoldLoad(P: Node, N: N1, Base&: Tmp0, Scale&: Tmp1, Index&: Tmp2, Disp&: Tmp3, Segment&: Tmp4)) { |
| 4295 | SDValue Ops[] = { N0, Tmp0, Tmp1, Tmp2, Tmp3, Tmp4, Imm, |
| 4296 | N1.getOperand(i: 0) }; |
| 4297 | SDVTList VTs = CurDAG->getVTList(VT1: VT, VT2: MVT::i32, VT3: MVT::Other); |
| 4298 | MachineSDNode *CNode = CurDAG->getMachineNode(Opcode: MOpc, dl, VTs, Ops); |
| 4299 | // Update the chain. |
| 4300 | ReplaceUses(F: N1.getValue(R: 1), T: SDValue(CNode, 2)); |
| 4301 | // Record the mem-refs |
| 4302 | CurDAG->setNodeMemRefs(N: CNode, NewMemRefs: {cast<LoadSDNode>(Val&: N1)->getMemOperand()}); |
| 4303 | return CNode; |
| 4304 | } |
| 4305 | |
| 4306 | SDValue Ops[] = { N0, N1, Imm }; |
| 4307 | SDVTList VTs = CurDAG->getVTList(VT1: VT, VT2: MVT::i32); |
| 4308 | MachineSDNode *CNode = CurDAG->getMachineNode(Opcode: ROpc, dl, VTs, Ops); |
| 4309 | return CNode; |
| 4310 | } |
| 4311 | |
| 4312 | // Emit a PCMPESTR(I/M) instruction. Also return the Glue result in case we need
| 4313 | // to emit a second instruction after this one. This is needed since we have two |
| 4314 | // copyToReg nodes glued before this and we need to continue that glue through. |
| 4315 | MachineSDNode *X86DAGToDAGISel::emitPCMPESTR(unsigned ROpc, unsigned MOpc, |
| 4316 | bool MayFoldLoad, const SDLoc &dl, |
| 4317 | MVT VT, SDNode *Node, |
| 4318 | SDValue &InGlue) { |
| 4319 | SDValue N0 = Node->getOperand(Num: 0); |
| 4320 | SDValue N2 = Node->getOperand(Num: 2); |
| 4321 | SDValue Imm = Node->getOperand(Num: 4); |
| 4322 | auto *Val = cast<ConstantSDNode>(Val&: Imm)->getConstantIntValue(); |
| 4323 | Imm = CurDAG->getTargetConstant(Val: *Val, DL: SDLoc(Node), VT: Imm.getValueType()); |
| 4324 | |
| 4325 | // Try to fold a load. No need to check alignment. |
| 4326 | SDValue Tmp0, Tmp1, Tmp2, Tmp3, Tmp4; |
| 4327 | if (MayFoldLoad && tryFoldLoad(P: Node, N: N2, Base&: Tmp0, Scale&: Tmp1, Index&: Tmp2, Disp&: Tmp3, Segment&: Tmp4)) { |
| 4328 | SDValue Ops[] = { N0, Tmp0, Tmp1, Tmp2, Tmp3, Tmp4, Imm, |
| 4329 | N2.getOperand(i: 0), InGlue }; |
| 4330 | SDVTList VTs = CurDAG->getVTList(VT1: VT, VT2: MVT::i32, VT3: MVT::Other, VT4: MVT::Glue); |
| 4331 | MachineSDNode *CNode = CurDAG->getMachineNode(Opcode: MOpc, dl, VTs, Ops); |
| 4332 | InGlue = SDValue(CNode, 3); |
| 4333 | // Update the chain. |
| 4334 | ReplaceUses(F: N2.getValue(R: 1), T: SDValue(CNode, 2)); |
| 4335 | // Record the mem-refs |
| 4336 | CurDAG->setNodeMemRefs(N: CNode, NewMemRefs: {cast<LoadSDNode>(Val&: N2)->getMemOperand()}); |
| 4337 | return CNode; |
| 4338 | } |
| 4339 | |
| 4340 | SDValue Ops[] = { N0, N2, Imm, InGlue }; |
| 4341 | SDVTList VTs = CurDAG->getVTList(VT1: VT, VT2: MVT::i32, VT3: MVT::Glue); |
| 4342 | MachineSDNode *CNode = CurDAG->getMachineNode(Opcode: ROpc, dl, VTs, Ops); |
| 4343 | InGlue = SDValue(CNode, 2); |
| 4344 | return CNode; |
| 4345 | } |
| 4346 | |
| 4347 | bool X86DAGToDAGISel::tryShiftAmountMod(SDNode *N) { |
| 4348 | EVT VT = N->getValueType(ResNo: 0); |
| 4349 | |
| 4350 | // Only handle scalar shifts. |
| 4351 | if (VT.isVector()) |
| 4352 | return false; |
| 4353 | |
| 4354 | // Narrower shifts only mask to 5 bits in hardware. |
| 4355 | unsigned Size = VT == MVT::i64 ? 64 : 32; |
| 4356 | |
| 4357 | SDValue OrigShiftAmt = N->getOperand(Num: 1); |
| 4358 | SDValue ShiftAmt = OrigShiftAmt; |
| 4359 | SDLoc DL(N); |
| 4360 | |
| 4361 | // Skip over a truncate of the shift amount. |
| 4362 | if (ShiftAmt->getOpcode() == ISD::TRUNCATE) |
| 4363 | ShiftAmt = ShiftAmt->getOperand(Num: 0); |
| 4364 | |
| 4365 | // This function is called after X86DAGToDAGISel::matchBitExtract(),
| 4366 | // so we are not afraid that we might mess up the BZHI/BEXTR patterns.
| 4367 | |
| 4368 | SDValue NewShiftAmt; |
| 4369 | if (ShiftAmt->getOpcode() == ISD::ADD || ShiftAmt->getOpcode() == ISD::SUB || |
| 4370 | ShiftAmt->getOpcode() == ISD::XOR) { |
| 4371 | SDValue Add0 = ShiftAmt->getOperand(Num: 0); |
| 4372 | SDValue Add1 = ShiftAmt->getOperand(Num: 1); |
| 4373 | auto *Add0C = dyn_cast<ConstantSDNode>(Val&: Add0); |
| 4374 | auto *Add1C = dyn_cast<ConstantSDNode>(Val&: Add1); |
| 4375 | // If we are shifting by X+/-/^N where N == 0 mod Size, then just shift by X |
| 4376 | // to avoid the ADD/SUB/XOR. |
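| | // e.g. a 64-bit shift by (x + 64) is the same as a shift by x, because the
| | // hardware only consumes the low 6 bits of the amount.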
| 4377 | if (Add1C && Add1C->getAPIntValue().urem(RHS: Size) == 0) { |
| 4378 | NewShiftAmt = Add0; |
| 4379 | |
| 4380 | } else if (ShiftAmt->getOpcode() != ISD::ADD && ShiftAmt.hasOneUse() && |
| 4381 | ((Add0C && Add0C->getAPIntValue().urem(RHS: Size) == Size - 1) || |
| 4382 | (Add1C && Add1C->getAPIntValue().urem(RHS: Size) == Size - 1))) { |
| 4383 | // If we are doing a NOT on just the lower bits with (Size*N-1) -/^ X |
| 4384 | // we can replace it with a NOT. In the XOR case it may save some code |
| 4385 | // size, in the SUB case it also may save a move. |
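| | // e.g. for a 64-bit shift, 63 - x and 63 ^ x both agree with ~x in the low
| | // 6 bits (the only bits the shift consumes), since -1 - x == ~x.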
| 4386 | assert(Add0C == nullptr || Add1C == nullptr); |
| 4387 | |
| 4388 | // We can only do N-X, not X-N |
| 4389 | if (ShiftAmt->getOpcode() == ISD::SUB && Add0C == nullptr) |
| 4390 | return false; |
| 4391 | |
| 4392 | EVT OpVT = ShiftAmt.getValueType(); |
| 4393 | |
| 4394 | SDValue AllOnes = CurDAG->getAllOnesConstant(DL, VT: OpVT); |
| 4395 | NewShiftAmt = CurDAG->getNode(Opcode: ISD::XOR, DL, VT: OpVT, |
| 4396 | N1: Add0C == nullptr ? Add0 : Add1, N2: AllOnes); |
| 4397 | insertDAGNode(DAG&: *CurDAG, Pos: OrigShiftAmt, N: AllOnes); |
| 4398 | insertDAGNode(DAG&: *CurDAG, Pos: OrigShiftAmt, N: NewShiftAmt); |
| 4399 | // If we are shifting by N-X where N == 0 mod Size, then just shift by |
| 4400 | // -X to generate a NEG instead of a SUB of a constant. |
| 4401 | } else if (ShiftAmt->getOpcode() == ISD::SUB && Add0C && |
| 4402 | Add0C->getZExtValue() != 0) { |
| 4403 | EVT SubVT = ShiftAmt.getValueType(); |
| 4404 | SDValue X; |
| 4405 | if (Add0C->getZExtValue() % Size == 0) |
| 4406 | X = Add1; |
| 4407 | else if (ShiftAmt.hasOneUse() && Size == 64 && |
| 4408 | Add0C->getZExtValue() % 32 == 0) { |
| 4409 | // We have a 64-bit shift by (n*32-x), turn it into -(x+n*32). |
| 4410 | // This is mainly beneficial if we already compute (x+n*32). |
| 4411 | if (Add1.getOpcode() == ISD::TRUNCATE) { |
| 4412 | Add1 = Add1.getOperand(i: 0); |
| 4413 | SubVT = Add1.getValueType(); |
| 4414 | } |
| 4415 | if (Add0.getValueType() != SubVT) { |
| 4416 | Add0 = CurDAG->getZExtOrTrunc(Op: Add0, DL, VT: SubVT); |
| 4417 | insertDAGNode(DAG&: *CurDAG, Pos: OrigShiftAmt, N: Add0); |
| 4418 | } |
| 4419 | |
| 4420 | X = CurDAG->getNode(Opcode: ISD::ADD, DL, VT: SubVT, N1: Add1, N2: Add0); |
| 4421 | insertDAGNode(DAG&: *CurDAG, Pos: OrigShiftAmt, N: X); |
| 4422 | } else |
| 4423 | return false; |
| 4424 | // Insert a negate op. |
| 4425 | // TODO: This isn't guaranteed to replace the sub if there is a logic cone |
| 4426 | // that uses it that's not a shift. |
| 4427 | SDValue Zero = CurDAG->getConstant(Val: 0, DL, VT: SubVT); |
| 4428 | SDValue Neg = CurDAG->getNode(Opcode: ISD::SUB, DL, VT: SubVT, N1: Zero, N2: X); |
| 4429 | NewShiftAmt = Neg; |
| 4430 | |
| 4431 | // Insert these operands into a valid topological order so they can |
| 4432 | // get selected independently. |
| 4433 | insertDAGNode(DAG&: *CurDAG, Pos: OrigShiftAmt, N: Zero); |
| 4434 | insertDAGNode(DAG&: *CurDAG, Pos: OrigShiftAmt, N: Neg); |
| 4435 | } else |
| 4436 | return false; |
| 4437 | } else |
| 4438 | return false; |
| 4439 | |
| 4440 | if (NewShiftAmt.getValueType() != MVT::i8) { |
| 4441 | // Need to truncate the shift amount. |
| 4442 | NewShiftAmt = CurDAG->getNode(Opcode: ISD::TRUNCATE, DL, VT: MVT::i8, Operand: NewShiftAmt); |
| 4443 | // Add to a correct topological ordering. |
| 4444 | insertDAGNode(DAG&: *CurDAG, Pos: OrigShiftAmt, N: NewShiftAmt); |
| 4445 | } |
| 4446 | |
| 4447 | // Insert a new mask to keep the shift amount legal. This should be removed |
| 4448 | // by isel patterns. |
| 4449 | NewShiftAmt = CurDAG->getNode(Opcode: ISD::AND, DL, VT: MVT::i8, N1: NewShiftAmt, |
| 4450 | N2: CurDAG->getConstant(Val: Size - 1, DL, VT: MVT::i8)); |
| 4451 | // Place in a correct topological ordering. |
| 4452 | insertDAGNode(DAG&: *CurDAG, Pos: OrigShiftAmt, N: NewShiftAmt); |
| 4453 | |
| 4454 | SDNode *UpdatedNode = CurDAG->UpdateNodeOperands(N, Op1: N->getOperand(Num: 0), |
| 4455 | Op2: NewShiftAmt); |
| 4456 | if (UpdatedNode != N) { |
| 4457 | // If we found an existing node, we should replace ourselves with that node |
| 4458 | // and wait for it to be selected after its other users. |
| 4459 | ReplaceNode(F: N, T: UpdatedNode); |
| 4460 | return true; |
| 4461 | } |
| 4462 | |
| 4463 | // If the original shift amount is now dead, delete it so that we don't run |
| 4464 | // it through isel. |
| 4465 | if (OrigShiftAmt.getNode()->use_empty()) |
| 4466 | CurDAG->RemoveDeadNode(N: OrigShiftAmt.getNode()); |
| 4467 | |
| 4468 | // Now that we've optimized the shift amount, defer to normal isel to get |
| 4469 | // load folding and legacy vs BMI2 selection without repeating it here. |
| 4470 | SelectCode(N); |
| 4471 | return true; |
| 4472 | } |
| 4473 | |
| 4474 | bool X86DAGToDAGISel::tryShrinkShlLogicImm(SDNode *N) { |
| 4475 | MVT NVT = N->getSimpleValueType(ResNo: 0); |
| 4476 | unsigned Opcode = N->getOpcode(); |
| 4477 | SDLoc dl(N); |
| 4478 | |
| 4479 | // For operations of the form (x << C1) op C2, check if we can use a smaller |
| 4480 | // encoding for C2 by transforming it into (x op (C2>>C1)) << C1. |
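| | // e.g. (x << 8) | 0x1100 becomes (x | 0x11) << 8; the OR immediate shrinks
| | // from 32 bits to a sign-extended 8 bits, which is legal here because the
| | // low 8 bits of the original constant are zero.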
| 4481 | SDValue Shift = N->getOperand(Num: 0); |
| 4482 | SDValue N1 = N->getOperand(Num: 1); |
| 4483 | |
| 4484 | auto *Cst = dyn_cast<ConstantSDNode>(Val&: N1); |
| 4485 | if (!Cst) |
| 4486 | return false; |
| 4487 | |
| 4488 | int64_t Val = Cst->getSExtValue(); |
| 4489 | |
| 4490 | // If we have an any_extend feeding the AND, look through it to see if there |
| 4491 | // is a shift behind it. But only if the AND doesn't use the extended bits. |
| 4492 | // FIXME: Generalize this to other ANY_EXTEND than i32 to i64? |
| 4493 | bool FoundAnyExtend = false; |
| 4494 | if (Shift.getOpcode() == ISD::ANY_EXTEND && Shift.hasOneUse() && |
| 4495 | Shift.getOperand(i: 0).getSimpleValueType() == MVT::i32 && |
| 4496 | isUInt<32>(x: Val)) { |
| 4497 | FoundAnyExtend = true; |
| 4498 | Shift = Shift.getOperand(i: 0); |
| 4499 | } |
| 4500 | |
| 4501 | if (Shift.getOpcode() != ISD::SHL || !Shift.hasOneUse()) |
| 4502 | return false; |
| 4503 | |
| 4504 | // i8 is unshrinkable, i16 should be promoted to i32. |
| 4505 | if (NVT != MVT::i32 && NVT != MVT::i64) |
| 4506 | return false; |
| 4507 | |
| 4508 | auto *ShlCst = dyn_cast<ConstantSDNode>(Val: Shift.getOperand(i: 1)); |
| 4509 | if (!ShlCst) |
| 4510 | return false; |
| 4511 | |
| 4512 | uint64_t ShAmt = ShlCst->getZExtValue(); |
| 4513 | |
| 4514 | // Make sure that we don't change the operation by removing bits. |
| 4515 | // This only matters for OR and XOR, AND is unaffected. |
| 4516 | uint64_t RemovedBitsMask = (1ULL << ShAmt) - 1; |
| 4517 | if (Opcode != ISD::AND && (Val & RemovedBitsMask) != 0) |
| 4518 | return false; |
| 4519 | |
| 4520 | // Check the minimum bitwidth for the new constant. |
| 4521 | // TODO: Using 16 and 8 bit operations is also possible for or32 & xor32. |
| 4522 | auto CanShrinkImmediate = [&](int64_t &ShiftedVal) { |
| 4523 | if (Opcode == ISD::AND) { |
| 4524 | // AND32ri is the same as AND64ri32 with zext imm. |
| 4525 | // Try this before sign extended immediates below. |
| 4526 | ShiftedVal = (uint64_t)Val >> ShAmt; |
| 4527 | if (NVT == MVT::i64 && !isUInt<32>(x: Val) && isUInt<32>(x: ShiftedVal)) |
| 4528 | return true; |
| 4529 | // Also swap order when the AND can become MOVZX. |
| 4530 | if (ShiftedVal == UINT8_MAX || ShiftedVal == UINT16_MAX) |
| 4531 | return true; |
| 4532 | } |
| 4533 | ShiftedVal = Val >> ShAmt; |
| 4534 | if ((!isInt<8>(x: Val) && isInt<8>(x: ShiftedVal)) || |
| 4535 | (!isInt<32>(x: Val) && isInt<32>(x: ShiftedVal))) |
| 4536 | return true; |
| 4537 | if (Opcode != ISD::AND) { |
| 4538 | // MOV32ri+OR64r/XOR64r is cheaper than MOV64ri64+OR64rr/XOR64rr |
| 4539 | ShiftedVal = (uint64_t)Val >> ShAmt; |
| 4540 | if (NVT == MVT::i64 && !isUInt<32>(x: Val) && isUInt<32>(x: ShiftedVal)) |
| 4541 | return true; |
| 4542 | } |
| 4543 | return false; |
| 4544 | }; |
| 4545 | |
| 4546 | int64_t ShiftedVal; |
| 4547 | if (!CanShrinkImmediate(ShiftedVal)) |
| 4548 | return false; |
| 4549 | |
| 4550 | // Ok, we can reorder to get a smaller immediate. |
| 4551 | |
| 4552 | // But, it's possible the original immediate allowed an AND to become MOVZX.
| 4553 | // We check this late so that the MaskedValueIsZero call happens as late as
| 4554 | // possible.
| 4555 | if (Opcode == ISD::AND) { |
| 4556 | // Find the smallest zext this could possibly be. |
| 4557 | unsigned ZExtWidth = Cst->getAPIntValue().getActiveBits(); |
| 4558 | ZExtWidth = llvm::bit_ceil(Value: std::max(a: ZExtWidth, b: 8U)); |
| 4559 | |
| 4560 | // Figure out which bits need to be zero to achieve that mask. |
| 4561 | APInt NeededMask = APInt::getLowBitsSet(numBits: NVT.getSizeInBits(), |
| 4562 | loBitsSet: ZExtWidth); |
| 4563 | NeededMask &= ~Cst->getAPIntValue(); |
| 4564 | |
| 4565 | if (CurDAG->MaskedValueIsZero(Op: N->getOperand(Num: 0), Mask: NeededMask)) |
| 4566 | return false; |
| 4567 | } |
| 4568 | |
| 4569 | SDValue X = Shift.getOperand(i: 0); |
| 4570 | if (FoundAnyExtend) { |
| 4571 | SDValue NewX = CurDAG->getNode(Opcode: ISD::ANY_EXTEND, DL: dl, VT: NVT, Operand: X); |
| 4572 | insertDAGNode(DAG&: *CurDAG, Pos: SDValue(N, 0), N: NewX); |
| 4573 | X = NewX; |
| 4574 | } |
| 4575 | |
| 4576 | SDValue NewCst = CurDAG->getSignedConstant(Val: ShiftedVal, DL: dl, VT: NVT); |
| 4577 | insertDAGNode(DAG&: *CurDAG, Pos: SDValue(N, 0), N: NewCst); |
| 4578 | SDValue NewBinOp = CurDAG->getNode(Opcode, DL: dl, VT: NVT, N1: X, N2: NewCst); |
| 4579 | insertDAGNode(DAG&: *CurDAG, Pos: SDValue(N, 0), N: NewBinOp); |
| 4580 | SDValue NewSHL = CurDAG->getNode(Opcode: ISD::SHL, DL: dl, VT: NVT, N1: NewBinOp, |
| 4581 | N2: Shift.getOperand(i: 1)); |
| 4582 | ReplaceNode(F: N, T: NewSHL.getNode()); |
| 4583 | SelectCode(N: NewSHL.getNode()); |
| 4584 | return true; |
| 4585 | } |
| 4586 | |
| 4587 | bool X86DAGToDAGISel::matchVPTERNLOG(SDNode *Root, SDNode *ParentA, |
| 4588 | SDNode *ParentB, SDNode *ParentC, |
| 4589 | SDValue A, SDValue B, SDValue C, |
| 4590 | uint8_t Imm) { |
| 4591 | assert(A.isOperandOf(ParentA) && B.isOperandOf(ParentB) && |
| 4592 | C.isOperandOf(ParentC) && "Incorrect parent node" ); |
| 4593 | |
| 4594 | auto tryFoldLoadOrBCast = |
| 4595 | [this](SDNode *Root, SDNode *P, SDValue &L, SDValue &Base, SDValue &Scale, |
| 4596 | SDValue &Index, SDValue &Disp, SDValue &Segment) { |
| 4597 | if (tryFoldLoad(Root, P, N: L, Base, Scale, Index, Disp, Segment)) |
| 4598 | return true; |
| 4599 | |
| 4600 | // Not a load, check for broadcast which may be behind a bitcast. |
| 4601 | if (L.getOpcode() == ISD::BITCAST && L.hasOneUse()) { |
| 4602 | P = L.getNode(); |
| 4603 | L = L.getOperand(i: 0); |
| 4604 | } |
| 4605 | |
| 4606 | if (L.getOpcode() != X86ISD::VBROADCAST_LOAD) |
| 4607 | return false; |
| 4608 | |
| 4609 | // Only 32 and 64 bit broadcasts are supported. |
| 4610 | auto *MemIntr = cast<MemIntrinsicSDNode>(Val&: L); |
| 4611 | unsigned Size = MemIntr->getMemoryVT().getSizeInBits(); |
| 4612 | if (Size != 32 && Size != 64) |
| 4613 | return false; |
| 4614 | |
| 4615 | return tryFoldBroadcast(Root, P, N: L, Base, Scale, Index, Disp, Segment); |
| 4616 | }; |
| 4617 | |
| 4618 | bool FoldedLoad = false; |
| 4619 | SDValue Tmp0, Tmp1, Tmp2, Tmp3, Tmp4; |
| 4620 | if (tryFoldLoadOrBCast(Root, ParentC, C, Tmp0, Tmp1, Tmp2, Tmp3, Tmp4)) { |
| 4621 | FoldedLoad = true; |
| 4622 | } else if (tryFoldLoadOrBCast(Root, ParentA, A, Tmp0, Tmp1, Tmp2, Tmp3, |
| 4623 | Tmp4)) { |
| 4624 | FoldedLoad = true; |
| 4625 | std::swap(a&: A, b&: C); |
| 4626 | // Swap bits 1/4 and 3/6. |
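| | // (Bit i of Imm is the result for input row i = (A << 2) | (B << 1) | C;
| | // exchanging A and C swaps rows 1<->4 and 3<->6, while rows 0, 2, 5 and 7,
| | // i.e. the 0xa5 mask below, are unaffected.)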
| 4627 | uint8_t OldImm = Imm; |
| 4628 | Imm = OldImm & 0xa5; |
| 4629 | if (OldImm & 0x02) Imm |= 0x10; |
| 4630 | if (OldImm & 0x10) Imm |= 0x02; |
| 4631 | if (OldImm & 0x08) Imm |= 0x40; |
| 4632 | if (OldImm & 0x40) Imm |= 0x08; |
| 4633 | } else if (tryFoldLoadOrBCast(Root, ParentB, B, Tmp0, Tmp1, Tmp2, Tmp3, |
| 4634 | Tmp4)) { |
| 4635 | FoldedLoad = true; |
| 4636 | std::swap(a&: B, b&: C); |
| 4637 | // Swap bits 1/2 and 5/6. |
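| | // (Exchanging B and C swaps rows 1<->2 and 5<->6; rows 0, 3, 4 and 7 form
| | // the preserved 0x99 mask below.)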
| 4638 | uint8_t OldImm = Imm; |
| 4639 | Imm = OldImm & 0x99; |
| 4640 | if (OldImm & 0x02) Imm |= 0x04; |
| 4641 | if (OldImm & 0x04) Imm |= 0x02; |
| 4642 | if (OldImm & 0x20) Imm |= 0x40; |
| 4643 | if (OldImm & 0x40) Imm |= 0x20; |
| 4644 | } |
| 4645 | |
| 4646 | SDLoc DL(Root); |
| 4647 | |
| 4648 | SDValue TImm = CurDAG->getTargetConstant(Val: Imm, DL, VT: MVT::i8); |
| 4649 | |
| 4650 | MVT NVT = Root->getSimpleValueType(ResNo: 0); |
| 4651 | |
| 4652 | MachineSDNode *MNode; |
| 4653 | if (FoldedLoad) { |
| 4654 | SDVTList VTs = CurDAG->getVTList(VT1: NVT, VT2: MVT::Other); |
| 4655 | |
| 4656 | unsigned Opc; |
| 4657 | if (C.getOpcode() == X86ISD::VBROADCAST_LOAD) { |
| 4658 | auto *MemIntr = cast<MemIntrinsicSDNode>(Val&: C); |
| 4659 | unsigned EltSize = MemIntr->getMemoryVT().getSizeInBits(); |
| 4660 | assert((EltSize == 32 || EltSize == 64) && "Unexpected broadcast size!" ); |
| 4661 | |
| 4662 | bool UseD = EltSize == 32; |
| 4663 | if (NVT.is128BitVector()) |
| 4664 | Opc = UseD ? X86::VPTERNLOGDZ128rmbi : X86::VPTERNLOGQZ128rmbi; |
| 4665 | else if (NVT.is256BitVector()) |
| 4666 | Opc = UseD ? X86::VPTERNLOGDZ256rmbi : X86::VPTERNLOGQZ256rmbi; |
| 4667 | else if (NVT.is512BitVector()) |
| 4668 | Opc = UseD ? X86::VPTERNLOGDZrmbi : X86::VPTERNLOGQZrmbi; |
| 4669 | else |
| 4670 | llvm_unreachable("Unexpected vector size!" ); |
| 4671 | } else { |
| 4672 | bool UseD = NVT.getVectorElementType() == MVT::i32; |
| 4673 | if (NVT.is128BitVector()) |
| 4674 | Opc = UseD ? X86::VPTERNLOGDZ128rmi : X86::VPTERNLOGQZ128rmi; |
| 4675 | else if (NVT.is256BitVector()) |
| 4676 | Opc = UseD ? X86::VPTERNLOGDZ256rmi : X86::VPTERNLOGQZ256rmi; |
| 4677 | else if (NVT.is512BitVector()) |
| 4678 | Opc = UseD ? X86::VPTERNLOGDZrmi : X86::VPTERNLOGQZrmi; |
| 4679 | else |
| 4680 | llvm_unreachable("Unexpected vector size!" ); |
| 4681 | } |
| 4682 | |
| 4683 | SDValue Ops[] = {A, B, Tmp0, Tmp1, Tmp2, Tmp3, Tmp4, TImm, C.getOperand(i: 0)}; |
| 4684 | MNode = CurDAG->getMachineNode(Opcode: Opc, dl: DL, VTs, Ops); |
| 4685 | |
| 4686 | // Update the chain. |
| 4687 | ReplaceUses(F: C.getValue(R: 1), T: SDValue(MNode, 1)); |
| 4688 | // Record the mem-refs |
| 4689 | CurDAG->setNodeMemRefs(N: MNode, NewMemRefs: {cast<MemSDNode>(Val&: C)->getMemOperand()}); |
| 4690 | } else { |
| 4691 | bool UseD = NVT.getVectorElementType() == MVT::i32; |
| 4692 | unsigned Opc; |
| 4693 | if (NVT.is128BitVector()) |
| 4694 | Opc = UseD ? X86::VPTERNLOGDZ128rri : X86::VPTERNLOGQZ128rri; |
| 4695 | else if (NVT.is256BitVector()) |
| 4696 | Opc = UseD ? X86::VPTERNLOGDZ256rri : X86::VPTERNLOGQZ256rri; |
| 4697 | else if (NVT.is512BitVector()) |
| 4698 | Opc = UseD ? X86::VPTERNLOGDZrri : X86::VPTERNLOGQZrri; |
| 4699 | else |
| 4700 | llvm_unreachable("Unexpected vector size!" ); |
| 4701 | |
| 4702 | MNode = CurDAG->getMachineNode(Opcode: Opc, dl: DL, VT: NVT, Ops: {A, B, C, TImm}); |
| 4703 | } |
| 4704 | |
| 4705 | ReplaceUses(F: SDValue(Root, 0), T: SDValue(MNode, 0)); |
| 4706 | CurDAG->RemoveDeadNode(N: Root); |
| 4707 | return true; |
| 4708 | } |
| 4709 | |
| 4710 | // Try to match two logic ops to a VPTERNLOG. |
| 4711 | // FIXME: Handle more complex patterns that use an operand more than once? |
| 4712 | bool X86DAGToDAGISel::tryVPTERNLOG(SDNode *N) { |
| 4713 | MVT NVT = N->getSimpleValueType(ResNo: 0); |
| 4714 | |
| 4715 | // Make sure we support VPTERNLOG. |
| 4716 | if (!NVT.isVector() || !Subtarget->hasAVX512() || |
| 4717 | NVT.getVectorElementType() == MVT::i1) |
| 4718 | return false; |
| 4719 | |
| 4720 | // We need VLX for 128/256-bit. |
| 4721 | if (!(Subtarget->hasVLX() || NVT.is512BitVector())) |
| 4722 | return false; |
| 4723 | |
| 4724 | SDValue N0 = N->getOperand(Num: 0); |
| 4725 | SDValue N1 = N->getOperand(Num: 1); |
| 4726 | |
| 4727 | auto getFoldableLogicOp = [](SDValue Op) { |
| 4728 | // Peek through single use bitcast. |
| 4729 | if (Op.getOpcode() == ISD::BITCAST && Op.hasOneUse()) |
| 4730 | Op = Op.getOperand(i: 0); |
| 4731 | |
| 4732 | if (!Op.hasOneUse()) |
| 4733 | return SDValue(); |
| 4734 | |
| 4735 | unsigned Opc = Op.getOpcode(); |
| 4736 | if (Opc == ISD::AND || Opc == ISD::OR || Opc == ISD::XOR || |
| 4737 | Opc == X86ISD::ANDNP) |
| 4738 | return Op; |
| 4739 | |
| 4740 | return SDValue(); |
| 4741 | }; |
| 4742 | |
| 4743 | SDValue A, FoldableOp; |
| 4744 | if ((FoldableOp = getFoldableLogicOp(N1))) { |
| 4745 | A = N0; |
| 4746 | } else if ((FoldableOp = getFoldableLogicOp(N0))) { |
| 4747 | A = N1; |
| 4748 | } else |
| 4749 | return false; |
| 4750 | |
| 4751 | SDValue B = FoldableOp.getOperand(i: 0); |
| 4752 | SDValue C = FoldableOp.getOperand(i: 1); |
| 4753 | SDNode *ParentA = N; |
| 4754 | SDNode *ParentB = FoldableOp.getNode(); |
| 4755 | SDNode *ParentC = FoldableOp.getNode(); |
| 4756 | |
| 4757 | // We can build the appropriate control immediate by performing the logic |
| 4758 | // operation we're matching using these constants for A, B, and C. |
| 4759 | uint8_t TernlogMagicA = 0xf0; |
| 4760 | uint8_t TernlogMagicB = 0xcc; |
| 4761 | uint8_t TernlogMagicC = 0xaa; |
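| | // e.g. for (B & C), ANDing the magics gives 0xcc & 0xaa = 0x88, which is
| | // precisely the ternlog truth table of "B AND C".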
| 4762 | |
| 4763 | // Some of the inputs may be inverted, peek through them and invert the |
| 4764 | // magic values accordingly. |
| 4765 | // TODO: There may be a bitcast before the xor that we should peek through. |
| 4766 | auto PeekThroughNot = [](SDValue &Op, SDNode *&Parent, uint8_t &Magic) { |
| 4767 | if (Op.getOpcode() == ISD::XOR && Op.hasOneUse() && |
| 4768 | ISD::isBuildVectorAllOnes(N: Op.getOperand(i: 1).getNode())) { |
| 4769 | Magic = ~Magic; |
| 4770 | Parent = Op.getNode(); |
| 4771 | Op = Op.getOperand(i: 0); |
| 4772 | } |
| 4773 | }; |
| 4774 | |
| 4775 | PeekThroughNot(A, ParentA, TernlogMagicA); |
| 4776 | PeekThroughNot(B, ParentB, TernlogMagicB); |
| 4777 | PeekThroughNot(C, ParentC, TernlogMagicC); |
| 4778 | |
| 4779 | uint8_t Imm; |
| 4780 | switch (FoldableOp.getOpcode()) { |
| 4781 | default: llvm_unreachable("Unexpected opcode!" ); |
| 4782 | case ISD::AND: Imm = TernlogMagicB & TernlogMagicC; break; |
| 4783 | case ISD::OR: Imm = TernlogMagicB | TernlogMagicC; break; |
| 4784 | case ISD::XOR: Imm = TernlogMagicB ^ TernlogMagicC; break; |
| 4785 | case X86ISD::ANDNP: Imm = ~(TernlogMagicB) & TernlogMagicC; break; |
| 4786 | } |
| 4787 | |
| 4788 | switch (N->getOpcode()) { |
| 4789 | default: llvm_unreachable("Unexpected opcode!" ); |
| 4790 | case X86ISD::ANDNP: |
| 4791 | if (A == N0) |
| 4792 | Imm &= ~TernlogMagicA; |
| 4793 | else |
| 4794 | Imm = ~(Imm) & TernlogMagicA; |
| 4795 | break; |
| 4796 | case ISD::AND: Imm &= TernlogMagicA; break; |
| 4797 | case ISD::OR: Imm |= TernlogMagicA; break; |
| 4798 | case ISD::XOR: Imm ^= TernlogMagicA; break; |
| 4799 | } |
| 4800 | |
| 4801 | return matchVPTERNLOG(Root: N, ParentA, ParentB, ParentC, A, B, C, Imm); |
| 4802 | } |
| 4803 | |
| 4804 | /// If the high bits of an 'and' operand are known zero, try setting the |
| 4805 | /// high bits of an 'and' constant operand to produce a smaller encoding by |
| 4806 | /// creating a small, sign-extended negative immediate rather than a large |
| 4807 | /// positive one. This reverses a transform in SimplifyDemandedBits that |
| 4808 | /// shrinks mask constants by clearing bits. There is also a possibility that |
| 4809 | /// the 'and' mask can be made -1, so the 'and' itself is unnecessary. In that |
| 4810 | /// case, just replace the 'and'. Return 'true' if the node is replaced. |
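| | /// For example (illustrative values): if the top 4 bits of an i32 value x
| | /// are known zero, `x & 0x0FFFFFF0` can use the mask 0xFFFFFFF0 (-16)
| | /// instead, which encodes as a sign-extended 8-bit immediate rather than a
| | /// 32-bit one.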
| 4811 | bool X86DAGToDAGISel::shrinkAndImmediate(SDNode *And) { |
| 4812 | // i8 is unshrinkable, i16 should be promoted to i32, and vector ops don't |
| 4813 | // have immediate operands. |
| 4814 | MVT VT = And->getSimpleValueType(ResNo: 0); |
| 4815 | if (VT != MVT::i32 && VT != MVT::i64) |
| 4816 | return false; |
| 4817 | |
| 4818 | auto *And1C = dyn_cast<ConstantSDNode>(Val: And->getOperand(Num: 1)); |
| 4819 | if (!And1C) |
| 4820 | return false; |
| 4821 | |
| 4822 | // Bail out if the mask constant is already negative. It can't shrink any more.
| 4823 | // If the upper 32 bits of a 64 bit mask are all zeros, we have special isel |
| 4824 | // patterns to use a 32-bit and instead of a 64-bit and by relying on the |
| 4825 | // implicit zeroing of 32 bit ops. So we should check if the lower 32 bits |
| 4826 | // are negative too. |
| 4827 | APInt MaskVal = And1C->getAPIntValue(); |
| 4828 | unsigned MaskLZ = MaskVal.countl_zero(); |
| 4829 | if (!MaskLZ || (VT == MVT::i64 && MaskLZ == 32)) |
| 4830 | return false; |
| 4831 | |
| 4832 | // Don't extend into the upper 32 bits of a 64 bit mask. |
| 4833 | if (VT == MVT::i64 && MaskLZ >= 32) { |
| 4834 | MaskLZ -= 32; |
| 4835 | MaskVal = MaskVal.trunc(width: 32); |
| 4836 | } |
| 4837 | |
| 4838 | SDValue And0 = And->getOperand(Num: 0); |
| 4839 | APInt HighZeros = APInt::getHighBitsSet(numBits: MaskVal.getBitWidth(), hiBitsSet: MaskLZ); |
| 4840 | APInt NegMaskVal = MaskVal | HighZeros; |
| 4841 | |
| 4842 | // If a negative constant would not allow a smaller encoding, there's no need |
| 4843 | // to continue. Only change the constant when we know it's a win. |
| 4844 | unsigned MinWidth = NegMaskVal.getSignificantBits(); |
| 4845 | if (MinWidth > 32 || (MinWidth > 8 && MaskVal.getSignificantBits() <= 32)) |
| 4846 | return false; |
| 4847 | |
| 4848 | // Extend masks if we truncated above. |
| 4849 | if (VT == MVT::i64 && MaskVal.getBitWidth() < 64) { |
| 4850 | NegMaskVal = NegMaskVal.zext(width: 64); |
| 4851 | HighZeros = HighZeros.zext(width: 64); |
| 4852 | } |
| 4853 | |
| 4854 | // The variable operand must be all zeros in the top bits to allow using the |
| 4855 | // new, negative constant as the mask. |
| 4856 | // TODO: Handle constant folding? |
| 4857 | KnownBits Known0 = CurDAG->computeKnownBits(Op: And0); |
| 4858 | if (Known0.isConstant() || !HighZeros.isSubsetOf(RHS: Known0.Zero)) |
| 4859 | return false; |
| 4860 | |
| 4861 | // Check if the mask is -1. In that case, this is an unnecessary instruction |
| 4862 | // that escaped earlier analysis. |
| 4863 | if (NegMaskVal.isAllOnes()) { |
| 4864 | ReplaceNode(F: And, T: And0.getNode()); |
| 4865 | return true; |
| 4866 | } |
| 4867 | |
| 4868 | // A negative mask allows a smaller encoding. Create a new 'and' node. |
| 4869 | SDValue NewMask = CurDAG->getConstant(Val: NegMaskVal, DL: SDLoc(And), VT); |
| 4870 | insertDAGNode(DAG&: *CurDAG, Pos: SDValue(And, 0), N: NewMask); |
| 4871 | SDValue NewAnd = CurDAG->getNode(Opcode: ISD::AND, DL: SDLoc(And), VT, N1: And0, N2: NewMask); |
| 4872 | ReplaceNode(F: And, T: NewAnd.getNode()); |
| 4873 | SelectCode(N: NewAnd.getNode()); |
| 4874 | return true; |
| 4875 | } |
| 4876 | |
| 4877 | static unsigned getVPTESTMOpc(MVT TestVT, bool IsTestN, bool FoldedLoad, |
| 4878 | bool FoldedBCast, bool Masked) { |
| 4879 | #define VPTESTM_CASE(VT, SUFFIX) \ |
| 4880 | case MVT::VT: \ |
| 4881 | if (Masked) \ |
| 4882 | return IsTestN ? X86::VPTESTNM##SUFFIX##k: X86::VPTESTM##SUFFIX##k; \ |
| 4883 | return IsTestN ? X86::VPTESTNM##SUFFIX : X86::VPTESTM##SUFFIX; |
| 4884 | |
| 4885 | |
| 4886 | #define VPTESTM_BROADCAST_CASES(SUFFIX) \ |
| 4887 | default: llvm_unreachable("Unexpected VT!"); \ |
| 4888 | VPTESTM_CASE(v4i32, DZ128##SUFFIX) \ |
| 4889 | VPTESTM_CASE(v2i64, QZ128##SUFFIX) \ |
| 4890 | VPTESTM_CASE(v8i32, DZ256##SUFFIX) \ |
| 4891 | VPTESTM_CASE(v4i64, QZ256##SUFFIX) \ |
| 4892 | VPTESTM_CASE(v16i32, DZ##SUFFIX) \ |
| 4893 | VPTESTM_CASE(v8i64, QZ##SUFFIX) |
| 4894 | |
| 4895 | #define VPTESTM_FULL_CASES(SUFFIX) \ |
| 4896 | VPTESTM_BROADCAST_CASES(SUFFIX) \ |
| 4897 | VPTESTM_CASE(v16i8, BZ128##SUFFIX) \ |
| 4898 | VPTESTM_CASE(v8i16, WZ128##SUFFIX) \ |
| 4899 | VPTESTM_CASE(v32i8, BZ256##SUFFIX) \ |
| 4900 | VPTESTM_CASE(v16i16, WZ256##SUFFIX) \ |
| 4901 | VPTESTM_CASE(v64i8, BZ##SUFFIX) \ |
| 4902 | VPTESTM_CASE(v32i16, WZ##SUFFIX) |
| 4903 | |
| 4904 | if (FoldedBCast) { |
| 4905 | switch (TestVT.SimpleTy) { |
| 4906 | VPTESTM_BROADCAST_CASES(rmb) |
| 4907 | } |
| 4908 | } |
| 4909 | |
| 4910 | if (FoldedLoad) { |
| 4911 | switch (TestVT.SimpleTy) { |
| 4912 | VPTESTM_FULL_CASES(rm) |
| 4913 | } |
| 4914 | } |
| 4915 | |
| 4916 | switch (TestVT.SimpleTy) { |
| 4917 | VPTESTM_FULL_CASES(rr) |
| 4918 | } |
| 4919 | |
| 4920 | #undef VPTESTM_FULL_CASES |
| 4921 | #undef VPTESTM_BROADCAST_CASES |
| 4922 | #undef VPTESTM_CASE |
| 4923 | } |
| 4924 | |
| 4925 | // Try to create VPTESTM instruction. If InMask is not null, it will be used |
| 4926 | // to form a masked operation. |
| 4927 | bool X86DAGToDAGISel::tryVPTESTM(SDNode *Root, SDValue Setcc, |
| 4928 | SDValue InMask) { |
| 4929 | assert(Subtarget->hasAVX512() && "Expected AVX512!" ); |
| 4930 | assert(Setcc.getSimpleValueType().getVectorElementType() == MVT::i1 && |
| 4931 | "Unexpected VT!" ); |
| 4932 | |
| 4933 | // Look for equal and not equal compares. |
| 4934 | ISD::CondCode CC = cast<CondCodeSDNode>(Val: Setcc.getOperand(i: 2))->get(); |
| 4935 | if (CC != ISD::SETEQ && CC != ISD::SETNE) |
| 4936 | return false; |
| 4937 | |
| 4938 | SDValue SetccOp0 = Setcc.getOperand(i: 0); |
| 4939 | SDValue SetccOp1 = Setcc.getOperand(i: 1); |
| 4940 | |
| 4941 | // Canonicalize the all zero vector to the RHS. |
| 4942 | if (ISD::isBuildVectorAllZeros(N: SetccOp0.getNode())) |
| 4943 | std::swap(a&: SetccOp0, b&: SetccOp1); |
| 4944 | |
| 4945 | // See if we're comparing against zero. |
| 4946 | if (!ISD::isBuildVectorAllZeros(N: SetccOp1.getNode())) |
| 4947 | return false; |
| 4948 | |
| 4949 | SDValue N0 = SetccOp0; |
| 4950 | |
| 4951 | MVT CmpVT = N0.getSimpleValueType(); |
| 4952 | MVT CmpSVT = CmpVT.getVectorElementType(); |
| 4953 | |
| 4954 | // Start with both operands the same. We'll try to refine this. |
| 4955 | SDValue Src0 = N0; |
| 4956 | SDValue Src1 = N0; |
| 4957 | |
| 4958 | { |
| 4959 | // Look through single use bitcasts. |
| 4960 | SDValue N0Temp = N0; |
| 4961 | if (N0Temp.getOpcode() == ISD::BITCAST && N0Temp.hasOneUse()) |
| 4962 | N0Temp = N0.getOperand(i: 0); |
| 4963 | |
| 4964 | // Look for single use AND. |
| 4965 | if (N0Temp.getOpcode() == ISD::AND && N0Temp.hasOneUse()) { |
| 4966 | Src0 = N0Temp.getOperand(i: 0); |
| 4967 | Src1 = N0Temp.getOperand(i: 1); |
| 4968 | } |
| 4969 | } |
| 4970 | |
| 4971 | // Without VLX we need to widen the operation. |
| 4972 | bool Widen = !Subtarget->hasVLX() && !CmpVT.is512BitVector(); |
| 4973 | |
| 4974 | auto tryFoldLoadOrBCast = [&](SDNode *Root, SDNode *P, SDValue &L, |
| 4975 | SDValue &Base, SDValue &Scale, SDValue &Index, |
| 4976 | SDValue &Disp, SDValue &Segment) { |
| 4977 | // If we need to widen, we can't fold the load. |
| 4978 | if (!Widen) |
| 4979 | if (tryFoldLoad(Root, P, N: L, Base, Scale, Index, Disp, Segment)) |
| 4980 | return true; |
| 4981 | |
| 4982 | // If we didn't fold a load, try to match a broadcast. There is no widening
| 4983 | // limitation for this, but only 32 and 64 bit element types are supported.
| 4984 | if (CmpSVT != MVT::i32 && CmpSVT != MVT::i64) |
| 4985 | return false; |
| 4986 | |
| 4987 | // Look through single use bitcasts. |
| 4988 | if (L.getOpcode() == ISD::BITCAST && L.hasOneUse()) { |
| 4989 | P = L.getNode(); |
| 4990 | L = L.getOperand(i: 0); |
| 4991 | } |
| 4992 | |
| 4993 | if (L.getOpcode() != X86ISD::VBROADCAST_LOAD) |
| 4994 | return false; |
| 4995 | |
| 4996 | auto *MemIntr = cast<MemIntrinsicSDNode>(Val&: L); |
| 4997 | if (MemIntr->getMemoryVT().getSizeInBits() != CmpSVT.getSizeInBits()) |
| 4998 | return false; |
| 4999 | |
| 5000 | return tryFoldBroadcast(Root, P, N: L, Base, Scale, Index, Disp, Segment); |
| 5001 | }; |
| 5002 | |
| 5003 | // We can only fold loads if the sources are unique. |
| 5004 | bool CanFoldLoads = Src0 != Src1; |
| 5005 | |
| 5006 | bool FoldedLoad = false; |
| 5007 | SDValue Tmp0, Tmp1, Tmp2, Tmp3, Tmp4; |
| 5008 | if (CanFoldLoads) { |
| 5009 | FoldedLoad = tryFoldLoadOrBCast(Root, N0.getNode(), Src1, Tmp0, Tmp1, Tmp2, |
| 5010 | Tmp3, Tmp4); |
| 5011 | if (!FoldedLoad) { |
| 5012 | // And is commutative. |
| 5013 | FoldedLoad = tryFoldLoadOrBCast(Root, N0.getNode(), Src0, Tmp0, Tmp1, |
| 5014 | Tmp2, Tmp3, Tmp4); |
| 5015 | if (FoldedLoad) |
| 5016 | std::swap(a&: Src0, b&: Src1); |
| 5017 | } |
| 5018 | } |
| 5019 | |
| 5020 | bool FoldedBCast = FoldedLoad && Src1.getOpcode() == X86ISD::VBROADCAST_LOAD; |
| 5021 | |
| 5022 | bool IsMasked = InMask.getNode() != nullptr; |
| 5023 | |
| 5024 | SDLoc dl(Root); |
| 5025 | |
| 5026 | MVT ResVT = Setcc.getSimpleValueType(); |
| 5027 | MVT MaskVT = ResVT; |
| 5028 | if (Widen) { |
| 5029 | // Widen the inputs using insert_subreg or copy_to_regclass. |
| 5030 | unsigned Scale = CmpVT.is128BitVector() ? 4 : 2; |
| 5031 | unsigned SubReg = CmpVT.is128BitVector() ? X86::sub_xmm : X86::sub_ymm; |
| 5032 | unsigned NumElts = CmpVT.getVectorNumElements() * Scale; |
| 5033 | CmpVT = MVT::getVectorVT(VT: CmpSVT, NumElements: NumElts); |
| 5034 | MaskVT = MVT::getVectorVT(VT: MVT::i1, NumElements: NumElts); |
| 5035 | SDValue ImplDef = SDValue(CurDAG->getMachineNode(Opcode: X86::IMPLICIT_DEF, dl, |
| 5036 | VT: CmpVT), 0); |
| 5037 | Src0 = CurDAG->getTargetInsertSubreg(SRIdx: SubReg, DL: dl, VT: CmpVT, Operand: ImplDef, Subreg: Src0); |
| 5038 | |
| 5039 | if (!FoldedBCast) |
| 5040 | Src1 = CurDAG->getTargetInsertSubreg(SRIdx: SubReg, DL: dl, VT: CmpVT, Operand: ImplDef, Subreg: Src1); |
| 5041 | |
| 5042 | if (IsMasked) { |
| 5043 | // Widen the mask. |
| 5044 | unsigned RegClass = TLI->getRegClassFor(VT: MaskVT)->getID(); |
| 5045 | SDValue RC = CurDAG->getTargetConstant(Val: RegClass, DL: dl, VT: MVT::i32); |
| 5046 | InMask = SDValue(CurDAG->getMachineNode(Opcode: TargetOpcode::COPY_TO_REGCLASS, |
| 5047 | dl, VT: MaskVT, Op1: InMask, Op2: RC), 0); |
| 5048 | } |
| 5049 | } |
| 5050 | |
| 5051 | bool IsTestN = CC == ISD::SETEQ; |
| 5052 | unsigned Opc = getVPTESTMOpc(TestVT: CmpVT, IsTestN, FoldedLoad, FoldedBCast, |
| 5053 | Masked: IsMasked); |
| 5054 | |
| 5055 | MachineSDNode *CNode; |
| 5056 | if (FoldedLoad) { |
| 5057 | SDVTList VTs = CurDAG->getVTList(VT1: MaskVT, VT2: MVT::Other); |
| 5058 | |
| 5059 | if (IsMasked) { |
| 5060 | SDValue Ops[] = { InMask, Src0, Tmp0, Tmp1, Tmp2, Tmp3, Tmp4, |
| 5061 | Src1.getOperand(i: 0) }; |
| 5062 | CNode = CurDAG->getMachineNode(Opcode: Opc, dl, VTs, Ops); |
| 5063 | } else { |
| 5064 | SDValue Ops[] = { Src0, Tmp0, Tmp1, Tmp2, Tmp3, Tmp4, |
| 5065 | Src1.getOperand(i: 0) }; |
| 5066 | CNode = CurDAG->getMachineNode(Opcode: Opc, dl, VTs, Ops); |
| 5067 | } |
| 5068 | |
| 5069 | // Update the chain. |
| 5070 | ReplaceUses(F: Src1.getValue(R: 1), T: SDValue(CNode, 1)); |
| 5071 | // Record the mem-refs |
| 5072 | CurDAG->setNodeMemRefs(N: CNode, NewMemRefs: {cast<MemSDNode>(Val&: Src1)->getMemOperand()}); |
| 5073 | } else { |
| 5074 | if (IsMasked) |
| 5075 | CNode = CurDAG->getMachineNode(Opcode: Opc, dl, VT: MaskVT, Op1: InMask, Op2: Src0, Op3: Src1); |
| 5076 | else |
| 5077 | CNode = CurDAG->getMachineNode(Opcode: Opc, dl, VT: MaskVT, Op1: Src0, Op2: Src1); |
| 5078 | } |
| 5079 | |
| 5080 | // If we widened, we need to shrink the mask VT. |
| 5081 | if (Widen) { |
| 5082 | unsigned RegClass = TLI->getRegClassFor(VT: ResVT)->getID(); |
| 5083 | SDValue RC = CurDAG->getTargetConstant(Val: RegClass, DL: dl, VT: MVT::i32); |
| 5084 | CNode = CurDAG->getMachineNode(Opcode: TargetOpcode::COPY_TO_REGCLASS, |
| 5085 | dl, VT: ResVT, Op1: SDValue(CNode, 0), Op2: RC); |
| 5086 | } |
| 5087 | |
| 5088 | ReplaceUses(F: SDValue(Root, 0), T: SDValue(CNode, 0)); |
| 5089 | CurDAG->RemoveDeadNode(N: Root); |
| 5090 | return true; |
| 5091 | } |
| 5092 | |
| 5093 | // Try to match the bitselect pattern (or (and A, B), (andn A, C)). Turn it |
| 5094 | // into vpternlog. |
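| | // e.g. evaluating (A & B) | (~A & C) over the ternlog magic constants gives
| | // (0xf0 & 0xcc) | (0x0f & 0xaa) = 0xc0 | 0x0a = 0xCA, the immediate used below.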
| 5095 | bool X86DAGToDAGISel::tryMatchBitSelect(SDNode *N) { |
| 5096 | assert(N->getOpcode() == ISD::OR && "Unexpected opcode!" ); |
| 5097 | |
| 5098 | MVT NVT = N->getSimpleValueType(ResNo: 0); |
| 5099 | |
| 5100 | // Make sure we support VPTERNLOG. |
| 5101 | if (!NVT.isVector() || !Subtarget->hasAVX512()) |
| 5102 | return false; |
| 5103 | |
| 5104 | // We need VLX for 128/256-bit. |
| 5105 | if (!(Subtarget->hasVLX() || NVT.is512BitVector())) |
| 5106 | return false; |
| 5107 | |
| 5108 | SDValue N0 = N->getOperand(Num: 0); |
| 5109 | SDValue N1 = N->getOperand(Num: 1); |
| 5110 | |
| 5111 | // Canonicalize AND to LHS. |
| 5112 | if (N1.getOpcode() == ISD::AND) |
| 5113 | std::swap(a&: N0, b&: N1); |
| 5114 | |
| 5115 | if (N0.getOpcode() != ISD::AND || |
| 5116 | N1.getOpcode() != X86ISD::ANDNP || |
| 5117 | !N0.hasOneUse() || !N1.hasOneUse()) |
| 5118 | return false; |
| 5119 | |
// ANDN is not commutable; X86ISD::ANDNP computes (~Op0) & Op1, so use it
// to pin down A and C.
| 5121 | SDValue A = N1.getOperand(i: 0); |
| 5122 | SDValue C = N1.getOperand(i: 1); |
| 5123 | |
// AND is commutable; if one operand matches A, the other operand is B.
// Otherwise this isn't a match.
| 5126 | SDValue B; |
| 5127 | if (N0.getOperand(i: 0) == A) |
| 5128 | B = N0.getOperand(i: 1); |
| 5129 | else if (N0.getOperand(i: 1) == A) |
| 5130 | B = N0.getOperand(i: 0); |
| 5131 | else |
| 5132 | return false; |
| 5133 | |
| 5134 | SDLoc dl(N); |
| 5135 | SDValue Imm = CurDAG->getTargetConstant(Val: 0xCA, DL: dl, VT: MVT::i8); |
| 5136 | SDValue Ternlog = CurDAG->getNode(Opcode: X86ISD::VPTERNLOG, DL: dl, VT: NVT, N1: A, N2: B, N3: C, N4: Imm); |
| 5137 | ReplaceNode(F: N, T: Ternlog.getNode()); |
| 5138 | |
| 5139 | return matchVPTERNLOG(Root: Ternlog.getNode(), ParentA: Ternlog.getNode(), ParentB: Ternlog.getNode(), |
| 5140 | ParentC: Ternlog.getNode(), A, B, C, Imm: 0xCA); |
| 5141 | } |
| 5142 | |
| 5143 | void X86DAGToDAGISel::Select(SDNode *Node) { |
| 5144 | MVT NVT = Node->getSimpleValueType(ResNo: 0); |
| 5145 | unsigned Opcode = Node->getOpcode(); |
| 5146 | SDLoc dl(Node); |
| 5147 | |
| 5148 | if (Node->isMachineOpcode()) { |
| 5149 | LLVM_DEBUG(dbgs() << "== " ; Node->dump(CurDAG); dbgs() << '\n'); |
| 5150 | Node->setNodeId(-1); |
| 5151 | return; // Already selected. |
| 5152 | } |
| 5153 | |
| 5154 | switch (Opcode) { |
| 5155 | default: break; |
| 5156 | case ISD::INTRINSIC_W_CHAIN: { |
| 5157 | unsigned IntNo = Node->getConstantOperandVal(Num: 1); |
| 5158 | switch (IntNo) { |
| 5159 | default: break; |
| 5160 | case Intrinsic::x86_encodekey128: |
| 5161 | case Intrinsic::x86_encodekey256: { |
| 5162 | if (!Subtarget->hasKL()) |
| 5163 | break; |
| 5164 | |
| 5165 | unsigned Opcode; |
| 5166 | switch (IntNo) { |
| 5167 | default: llvm_unreachable("Impossible intrinsic" ); |
| 5168 | case Intrinsic::x86_encodekey128: |
| 5169 | Opcode = X86::ENCODEKEY128; |
| 5170 | break; |
| 5171 | case Intrinsic::x86_encodekey256: |
| 5172 | Opcode = X86::ENCODEKEY256; |
| 5173 | break; |
| 5174 | } |
| 5175 | |
| 5176 | SDValue Chain = Node->getOperand(Num: 0); |
| 5177 | Chain = CurDAG->getCopyToReg(Chain, dl, Reg: X86::XMM0, N: Node->getOperand(Num: 3), |
| 5178 | Glue: SDValue()); |
| 5179 | if (Opcode == X86::ENCODEKEY256) |
| 5180 | Chain = CurDAG->getCopyToReg(Chain, dl, Reg: X86::XMM1, N: Node->getOperand(Num: 4), |
| 5181 | Glue: Chain.getValue(R: 1)); |
| 5182 | |
| 5183 | MachineSDNode *Res = CurDAG->getMachineNode( |
| 5184 | Opcode, dl, VTs: Node->getVTList(), |
| 5185 | Ops: {Node->getOperand(Num: 2), Chain, Chain.getValue(R: 1)}); |
| 5186 | ReplaceNode(F: Node, T: Res); |
| 5187 | return; |
| 5188 | } |
| 5189 | case Intrinsic::x86_tileloaddrs64_internal: |
| 5190 | case Intrinsic::x86_tileloaddrst164_internal: |
| 5191 | if (!Subtarget->hasAMXMOVRS()) |
| 5192 | break; |
| 5193 | [[fallthrough]]; |
| 5194 | case Intrinsic::x86_tileloadd64_internal: |
| 5195 | case Intrinsic::x86_tileloaddt164_internal: { |
| 5196 | if (!Subtarget->hasAMXTILE()) |
| 5197 | break; |
| 5198 | auto *MFI = |
| 5199 | CurDAG->getMachineFunction().getInfo<X86MachineFunctionInfo>(); |
| 5200 | MFI->setAMXProgModel(AMXProgModelEnum::ManagedRA); |
| 5201 | unsigned Opc; |
| 5202 | switch (IntNo) { |
| 5203 | default: |
| 5204 | llvm_unreachable("Unexpected intrinsic!" ); |
| 5205 | case Intrinsic::x86_tileloaddrs64_internal: |
| 5206 | Opc = X86::PTILELOADDRSV; |
| 5207 | break; |
| 5208 | case Intrinsic::x86_tileloaddrst164_internal: |
| 5209 | Opc = X86::PTILELOADDRST1V; |
| 5210 | break; |
| 5211 | case Intrinsic::x86_tileloadd64_internal: |
| 5212 | Opc = X86::PTILELOADDV; |
| 5213 | break; |
| 5214 | case Intrinsic::x86_tileloaddt164_internal: |
| 5215 | Opc = X86::PTILELOADDT1V; |
| 5216 | break; |
| 5217 | } |
| 5218 | // _tile_loadd_internal(row, col, buf, STRIDE) |
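// Operands 2 and 3 carry row/col; operand 4 (buf) becomes the base and
// operand 5 (STRIDE) is encoded as the index register with scale 1, so
// the memory operand's effective address is buf + STRIDE.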
| 5219 | SDValue Base = Node->getOperand(Num: 4); |
| 5220 | SDValue Scale = getI8Imm(Imm: 1, DL: dl); |
| 5221 | SDValue Index = Node->getOperand(Num: 5); |
| 5222 | SDValue Disp = CurDAG->getTargetConstant(Val: 0, DL: dl, VT: MVT::i32); |
| 5223 | SDValue Segment = CurDAG->getRegister(Reg: 0, VT: MVT::i16); |
| 5224 | SDValue Chain = Node->getOperand(Num: 0); |
| 5225 | MachineSDNode *CNode; |
| 5226 | SDValue Ops[] = {Node->getOperand(Num: 2), |
| 5227 | Node->getOperand(Num: 3), |
| 5228 | Base, |
| 5229 | Scale, |
| 5230 | Index, |
| 5231 | Disp, |
| 5232 | Segment, |
| 5233 | Chain}; |
| 5234 | CNode = CurDAG->getMachineNode(Opcode: Opc, dl, ResultTys: {MVT::x86amx, MVT::Other}, Ops); |
| 5235 | ReplaceNode(F: Node, T: CNode); |
| 5236 | return; |
| 5237 | } |
| 5238 | } |
| 5239 | break; |
| 5240 | } |
| 5241 | case ISD::INTRINSIC_VOID: { |
| 5242 | unsigned IntNo = Node->getConstantOperandVal(Num: 1); |
| 5243 | switch (IntNo) { |
| 5244 | default: break; |
| 5245 | case Intrinsic::x86_sse3_monitor: |
| 5246 | case Intrinsic::x86_monitorx: |
| 5247 | case Intrinsic::x86_clzero: { |
| 5248 | bool Use64BitPtr = Node->getOperand(Num: 2).getValueType() == MVT::i64; |
| 5249 | |
| 5250 | unsigned Opc = 0; |
| 5251 | switch (IntNo) { |
| 5252 | default: llvm_unreachable("Unexpected intrinsic!" ); |
| 5253 | case Intrinsic::x86_sse3_monitor: |
| 5254 | if (!Subtarget->hasSSE3()) |
| 5255 | break; |
| 5256 | Opc = Use64BitPtr ? X86::MONITOR64rrr : X86::MONITOR32rrr; |
| 5257 | break; |
| 5258 | case Intrinsic::x86_monitorx: |
| 5259 | if (!Subtarget->hasMWAITX()) |
| 5260 | break; |
| 5261 | Opc = Use64BitPtr ? X86::MONITORX64rrr : X86::MONITORX32rrr; |
| 5262 | break; |
| 5263 | case Intrinsic::x86_clzero: |
| 5264 | if (!Subtarget->hasCLZERO()) |
| 5265 | break; |
| 5266 | Opc = Use64BitPtr ? X86::CLZERO64r : X86::CLZERO32r; |
| 5267 | break; |
| 5268 | } |
| 5269 | |
| 5270 | if (Opc) { |
| 5271 | unsigned PtrReg = Use64BitPtr ? X86::RAX : X86::EAX; |
| 5272 | SDValue Chain = CurDAG->getCopyToReg(Chain: Node->getOperand(Num: 0), dl, Reg: PtrReg, |
| 5273 | N: Node->getOperand(Num: 2), Glue: SDValue()); |
| 5274 | SDValue InGlue = Chain.getValue(R: 1); |
| 5275 | |
| 5276 | if (IntNo == Intrinsic::x86_sse3_monitor || |
| 5277 | IntNo == Intrinsic::x86_monitorx) { |
| 5278 | // Copy the other two operands to ECX and EDX. |
| 5279 | Chain = CurDAG->getCopyToReg(Chain, dl, Reg: X86::ECX, N: Node->getOperand(Num: 3), |
| 5280 | Glue: InGlue); |
| 5281 | InGlue = Chain.getValue(R: 1); |
| 5282 | Chain = CurDAG->getCopyToReg(Chain, dl, Reg: X86::EDX, N: Node->getOperand(Num: 4), |
| 5283 | Glue: InGlue); |
| 5284 | InGlue = Chain.getValue(R: 1); |
| 5285 | } |
| 5286 | |
| 5287 | MachineSDNode *CNode = CurDAG->getMachineNode(Opcode: Opc, dl, VT: MVT::Other, |
| 5288 | Ops: { Chain, InGlue}); |
| 5289 | ReplaceNode(F: Node, T: CNode); |
| 5290 | return; |
| 5291 | } |
| 5292 | |
| 5293 | break; |
| 5294 | } |
| 5295 | case Intrinsic::x86_tilestored64_internal: { |
| 5296 | auto *MFI = |
| 5297 | CurDAG->getMachineFunction().getInfo<X86MachineFunctionInfo>(); |
| 5298 | MFI->setAMXProgModel(AMXProgModelEnum::ManagedRA); |
| 5299 | unsigned Opc = X86::PTILESTOREDV; |
| 5300 | // _tile_stored_internal(row, col, buf, STRIDE, c) |
| 5301 | SDValue Base = Node->getOperand(Num: 4); |
| 5302 | SDValue Scale = getI8Imm(Imm: 1, DL: dl); |
| 5303 | SDValue Index = Node->getOperand(Num: 5); |
| 5304 | SDValue Disp = CurDAG->getTargetConstant(Val: 0, DL: dl, VT: MVT::i32); |
| 5305 | SDValue Segment = CurDAG->getRegister(Reg: 0, VT: MVT::i16); |
| 5306 | SDValue Chain = Node->getOperand(Num: 0); |
| 5307 | MachineSDNode *CNode; |
| 5308 | SDValue Ops[] = {Node->getOperand(Num: 2), |
| 5309 | Node->getOperand(Num: 3), |
| 5310 | Base, |
| 5311 | Scale, |
| 5312 | Index, |
| 5313 | Disp, |
| 5314 | Segment, |
| 5315 | Node->getOperand(Num: 6), |
| 5316 | Chain}; |
| 5317 | CNode = CurDAG->getMachineNode(Opcode: Opc, dl, VT: MVT::Other, Ops); |
| 5318 | ReplaceNode(F: Node, T: CNode); |
| 5319 | return; |
| 5320 | } |
| 5321 | case Intrinsic::x86_tileloaddrs64: |
| 5322 | case Intrinsic::x86_tileloaddrst164: |
| 5323 | if (!Subtarget->hasAMXMOVRS()) |
| 5324 | break; |
| 5325 | [[fallthrough]]; |
| 5326 | case Intrinsic::x86_tileloadd64: |
| 5327 | case Intrinsic::x86_tileloaddt164: |
| 5328 | case Intrinsic::x86_tilestored64: { |
| 5329 | if (!Subtarget->hasAMXTILE()) |
| 5330 | break; |
| 5331 | auto *MFI = |
| 5332 | CurDAG->getMachineFunction().getInfo<X86MachineFunctionInfo>(); |
| 5333 | MFI->setAMXProgModel(AMXProgModelEnum::DirectReg); |
| 5334 | unsigned Opc; |
| 5335 | switch (IntNo) { |
| 5336 | default: llvm_unreachable("Unexpected intrinsic!" ); |
| 5337 | case Intrinsic::x86_tileloadd64: Opc = X86::PTILELOADD; break; |
| 5338 | case Intrinsic::x86_tileloaddrs64: |
| 5339 | Opc = X86::PTILELOADDRS; |
| 5340 | break; |
| 5341 | case Intrinsic::x86_tileloaddt164: Opc = X86::PTILELOADDT1; break; |
| 5342 | case Intrinsic::x86_tileloaddrst164: |
| 5343 | Opc = X86::PTILELOADDRST1; |
| 5344 | break; |
| 5345 | case Intrinsic::x86_tilestored64: Opc = X86::PTILESTORED; break; |
| 5346 | } |
| 5347 | // FIXME: Match displacement and scale. |
| 5348 | unsigned TIndex = Node->getConstantOperandVal(Num: 2); |
| 5349 | SDValue TReg = getI8Imm(Imm: TIndex, DL: dl); |
| 5350 | SDValue Base = Node->getOperand(Num: 3); |
| 5351 | SDValue Scale = getI8Imm(Imm: 1, DL: dl); |
| 5352 | SDValue Index = Node->getOperand(Num: 4); |
| 5353 | SDValue Disp = CurDAG->getTargetConstant(Val: 0, DL: dl, VT: MVT::i32); |
| 5354 | SDValue Segment = CurDAG->getRegister(Reg: 0, VT: MVT::i16); |
| 5355 | SDValue Chain = Node->getOperand(Num: 0); |
| 5356 | MachineSDNode *CNode; |
| 5357 | if (Opc == X86::PTILESTORED) { |
| 5358 | SDValue Ops[] = { Base, Scale, Index, Disp, Segment, TReg, Chain }; |
| 5359 | CNode = CurDAG->getMachineNode(Opcode: Opc, dl, VT: MVT::Other, Ops); |
| 5360 | } else { |
| 5361 | SDValue Ops[] = { TReg, Base, Scale, Index, Disp, Segment, Chain }; |
| 5362 | CNode = CurDAG->getMachineNode(Opcode: Opc, dl, VT: MVT::Other, Ops); |
| 5363 | } |
| 5364 | ReplaceNode(F: Node, T: CNode); |
| 5365 | return; |
| 5366 | } |
| 5367 | case Intrinsic::x86_t2rpntlvwz0rs: |
| 5368 | case Intrinsic::x86_t2rpntlvwz0rst1: |
| 5369 | case Intrinsic::x86_t2rpntlvwz1rs: |
| 5370 | case Intrinsic::x86_t2rpntlvwz1rst1: |
| 5371 | if (!Subtarget->hasAMXMOVRS()) |
| 5372 | break; |
| 5373 | [[fallthrough]]; |
| 5374 | case Intrinsic::x86_t2rpntlvwz0: |
| 5375 | case Intrinsic::x86_t2rpntlvwz0t1: |
| 5376 | case Intrinsic::x86_t2rpntlvwz1: |
| 5377 | case Intrinsic::x86_t2rpntlvwz1t1: { |
| 5378 | if (!Subtarget->hasAMXTRANSPOSE()) |
| 5379 | break; |
| 5380 | auto *MFI = |
| 5381 | CurDAG->getMachineFunction().getInfo<X86MachineFunctionInfo>(); |
| 5382 | MFI->setAMXProgModel(AMXProgModelEnum::DirectReg); |
| 5383 | unsigned Opc; |
| 5384 | switch (IntNo) { |
| 5385 | default: |
| 5386 | llvm_unreachable("Unexpected intrinsic!" ); |
| 5387 | case Intrinsic::x86_t2rpntlvwz0: |
| 5388 | Opc = X86::PT2RPNTLVWZ0; |
| 5389 | break; |
| 5390 | case Intrinsic::x86_t2rpntlvwz0t1: |
| 5391 | Opc = X86::PT2RPNTLVWZ0T1; |
| 5392 | break; |
| 5393 | case Intrinsic::x86_t2rpntlvwz1: |
| 5394 | Opc = X86::PT2RPNTLVWZ1; |
| 5395 | break; |
| 5396 | case Intrinsic::x86_t2rpntlvwz1t1: |
| 5397 | Opc = X86::PT2RPNTLVWZ1T1; |
| 5398 | break; |
| 5399 | case Intrinsic::x86_t2rpntlvwz0rs: |
| 5400 | Opc = X86::PT2RPNTLVWZ0RS; |
| 5401 | break; |
| 5402 | case Intrinsic::x86_t2rpntlvwz0rst1: |
| 5403 | Opc = X86::PT2RPNTLVWZ0RST1; |
| 5404 | break; |
| 5405 | case Intrinsic::x86_t2rpntlvwz1rs: |
| 5406 | Opc = X86::PT2RPNTLVWZ1RS; |
| 5407 | break; |
| 5408 | case Intrinsic::x86_t2rpntlvwz1rst1: |
| 5409 | Opc = X86::PT2RPNTLVWZ1RST1; |
| 5410 | break; |
| 5411 | } |
| 5412 | // FIXME: Match displacement and scale. |
| 5413 | unsigned TIndex = Node->getConstantOperandVal(Num: 2); |
| 5414 | SDValue TReg = getI8Imm(Imm: TIndex, DL: dl); |
| 5415 | SDValue Base = Node->getOperand(Num: 3); |
| 5416 | SDValue Scale = getI8Imm(Imm: 1, DL: dl); |
| 5417 | SDValue Index = Node->getOperand(Num: 4); |
| 5418 | SDValue Disp = CurDAG->getTargetConstant(Val: 0, DL: dl, VT: MVT::i32); |
| 5419 | SDValue Segment = CurDAG->getRegister(Reg: 0, VT: MVT::i16); |
| 5420 | SDValue Chain = Node->getOperand(Num: 0); |
| 5421 | SDValue Ops[] = {TReg, Base, Scale, Index, Disp, Segment, Chain}; |
| 5422 | MachineSDNode *CNode = CurDAG->getMachineNode(Opcode: Opc, dl, VT: MVT::Other, Ops); |
| 5423 | ReplaceNode(F: Node, T: CNode); |
| 5424 | return; |
| 5425 | } |
| 5426 | } |
| 5427 | break; |
| 5428 | } |
| 5429 | case ISD::BRIND: |
| 5430 | case X86ISD::NT_BRIND: { |
| 5431 | if (Subtarget->isTargetNaCl()) |
// NaCl has its own pass where "jmp %r32" instances are converted to
// "jmp %r64". We leave the instruction alone.
| 5434 | break; |
| 5435 | if (Subtarget->isTarget64BitILP32()) { |
| 5436 | // Converts a 32-bit register to a 64-bit, zero-extended version of |
| 5437 | // it. This is needed because x86-64 can do many things, but jmp %r32 |
| 5438 | // ain't one of them. |
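// A sketch of the resulting selection, assuming the target value lives
// in %eax: the zero-extension typically selects to a plain 32-bit move
// (often coalesced away), and the branch becomes "jmpq *%rax".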
| 5439 | SDValue Target = Node->getOperand(Num: 1); |
| 5440 | assert(Target.getValueType() == MVT::i32 && "Unexpected VT!" ); |
| 5441 | SDValue ZextTarget = CurDAG->getZExtOrTrunc(Op: Target, DL: dl, VT: MVT::i64); |
| 5442 | SDValue Brind = CurDAG->getNode(Opcode, DL: dl, VT: MVT::Other, |
| 5443 | N1: Node->getOperand(Num: 0), N2: ZextTarget); |
| 5444 | ReplaceNode(F: Node, T: Brind.getNode()); |
| 5445 | SelectCode(N: ZextTarget.getNode()); |
| 5446 | SelectCode(N: Brind.getNode()); |
| 5447 | return; |
| 5448 | } |
| 5449 | break; |
| 5450 | } |
| 5451 | case X86ISD::GlobalBaseReg: |
| 5452 | ReplaceNode(F: Node, T: getGlobalBaseReg()); |
| 5453 | return; |
| 5454 | |
| 5455 | case ISD::BITCAST: |
| 5456 | // Just drop all 128/256/512-bit bitcasts. |
| 5457 | if (NVT.is512BitVector() || NVT.is256BitVector() || NVT.is128BitVector() || |
| 5458 | NVT == MVT::f128) { |
| 5459 | ReplaceUses(F: SDValue(Node, 0), T: Node->getOperand(Num: 0)); |
| 5460 | CurDAG->RemoveDeadNode(N: Node); |
| 5461 | return; |
| 5462 | } |
| 5463 | break; |
| 5464 | |
| 5465 | case ISD::SRL: |
| 5466 | if (matchBitExtract(Node)) |
| 5467 | return; |
| 5468 | [[fallthrough]]; |
| 5469 | case ISD::SRA: |
| 5470 | case ISD::SHL: |
| 5471 | if (tryShiftAmountMod(N: Node)) |
| 5472 | return; |
| 5473 | break; |
| 5474 | |
| 5475 | case X86ISD::VPTERNLOG: { |
| 5476 | uint8_t Imm = Node->getConstantOperandVal(Num: 3); |
| 5477 | if (matchVPTERNLOG(Root: Node, ParentA: Node, ParentB: Node, ParentC: Node, A: Node->getOperand(Num: 0), |
| 5478 | B: Node->getOperand(Num: 1), C: Node->getOperand(Num: 2), Imm)) |
| 5479 | return; |
| 5480 | break; |
| 5481 | } |
| 5482 | |
| 5483 | case X86ISD::ANDNP: |
| 5484 | if (tryVPTERNLOG(N: Node)) |
| 5485 | return; |
| 5486 | break; |
| 5487 | |
| 5488 | case ISD::AND: |
| 5489 | if (NVT.isVector() && NVT.getVectorElementType() == MVT::i1) { |
| 5490 | // Try to form a masked VPTESTM. Operands can be in either order. |
| 5491 | SDValue N0 = Node->getOperand(Num: 0); |
| 5492 | SDValue N1 = Node->getOperand(Num: 1); |
| 5493 | if (N0.getOpcode() == ISD::SETCC && N0.hasOneUse() && |
| 5494 | tryVPTESTM(Root: Node, Setcc: N0, InMask: N1)) |
| 5495 | return; |
| 5496 | if (N1.getOpcode() == ISD::SETCC && N1.hasOneUse() && |
| 5497 | tryVPTESTM(Root: Node, Setcc: N1, InMask: N0)) |
| 5498 | return; |
| 5499 | } |
| 5500 | |
| 5501 | if (MachineSDNode *NewNode = matchBEXTRFromAndImm(Node)) { |
| 5502 | ReplaceUses(F: SDValue(Node, 0), T: SDValue(NewNode, 0)); |
| 5503 | CurDAG->RemoveDeadNode(N: Node); |
| 5504 | return; |
| 5505 | } |
| 5506 | if (matchBitExtract(Node)) |
| 5507 | return; |
| 5508 | if (AndImmShrink && shrinkAndImmediate(And: Node)) |
| 5509 | return; |
| 5510 | |
| 5511 | [[fallthrough]]; |
| 5512 | case ISD::OR: |
| 5513 | case ISD::XOR: |
| 5514 | if (tryShrinkShlLogicImm(N: Node)) |
| 5515 | return; |
| 5516 | if (Opcode == ISD::OR && tryMatchBitSelect(N: Node)) |
| 5517 | return; |
| 5518 | if (tryVPTERNLOG(N: Node)) |
| 5519 | return; |
| 5520 | |
| 5521 | [[fallthrough]]; |
| 5522 | case ISD::ADD: |
| 5523 | if (Opcode == ISD::ADD && matchBitExtract(Node)) |
| 5524 | return; |
| 5525 | [[fallthrough]]; |
| 5526 | case ISD::SUB: { |
| 5527 | // Try to avoid folding immediates with multiple uses for optsize. |
| 5528 | // This code tries to select to register form directly to avoid going |
| 5529 | // through the isel table which might fold the immediate. We can't change |
// the add/sub/and/or/xor-with-immediate patterns in the tablegen files to
// check the immediate's use count without making those patterns
// unavailable to the fast-isel table.
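// A hypothetical illustration: if the same 32-bit constant feeds two
// adds, materializing it once (a 5-byte movl $imm32, %ecx) and using two
// 2-byte ADDrr instructions is smaller than encoding the imm32 into two
// 6-byte ADDri instructions.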
| 5533 | if (!CurDAG->shouldOptForSize()) |
| 5534 | break; |
| 5535 | |
| 5536 | // Only handle i8/i16/i32/i64. |
| 5537 | if (NVT != MVT::i8 && NVT != MVT::i16 && NVT != MVT::i32 && NVT != MVT::i64) |
| 5538 | break; |
| 5539 | |
| 5540 | SDValue N0 = Node->getOperand(Num: 0); |
| 5541 | SDValue N1 = Node->getOperand(Num: 1); |
| 5542 | |
| 5543 | auto *Cst = dyn_cast<ConstantSDNode>(Val&: N1); |
| 5544 | if (!Cst) |
| 5545 | break; |
| 5546 | |
| 5547 | int64_t Val = Cst->getSExtValue(); |
| 5548 | |
// Make sure it's an immediate that is considered foldable.
| 5550 | // FIXME: Handle unsigned 32 bit immediates for 64-bit AND. |
| 5551 | if (!isInt<8>(x: Val) && !isInt<32>(x: Val)) |
| 5552 | break; |
| 5553 | |
| 5554 | // If this can match to INC/DEC, let it go. |
| 5555 | if (Opcode == ISD::ADD && (Val == 1 || Val == -1)) |
| 5556 | break; |
| 5557 | |
| 5558 | // Check if we should avoid folding this immediate. |
| 5559 | if (!shouldAvoidImmediateInstFormsForSize(N: N1.getNode())) |
| 5560 | break; |
| 5561 | |
| 5562 | // We should not fold the immediate. So we need a register form instead. |
| 5563 | unsigned ROpc, MOpc; |
| 5564 | switch (NVT.SimpleTy) { |
| 5565 | default: llvm_unreachable("Unexpected VT!" ); |
| 5566 | case MVT::i8: |
| 5567 | switch (Opcode) { |
| 5568 | default: llvm_unreachable("Unexpected opcode!" ); |
| 5569 | case ISD::ADD: |
| 5570 | ROpc = GET_ND_IF_ENABLED(X86::ADD8rr); |
| 5571 | MOpc = GET_ND_IF_ENABLED(X86::ADD8rm); |
| 5572 | break; |
| 5573 | case ISD::SUB: |
| 5574 | ROpc = GET_ND_IF_ENABLED(X86::SUB8rr); |
| 5575 | MOpc = GET_ND_IF_ENABLED(X86::SUB8rm); |
| 5576 | break; |
| 5577 | case ISD::AND: |
| 5578 | ROpc = GET_ND_IF_ENABLED(X86::AND8rr); |
| 5579 | MOpc = GET_ND_IF_ENABLED(X86::AND8rm); |
| 5580 | break; |
| 5581 | case ISD::OR: |
| 5582 | ROpc = GET_ND_IF_ENABLED(X86::OR8rr); |
| 5583 | MOpc = GET_ND_IF_ENABLED(X86::OR8rm); |
| 5584 | break; |
| 5585 | case ISD::XOR: |
| 5586 | ROpc = GET_ND_IF_ENABLED(X86::XOR8rr); |
| 5587 | MOpc = GET_ND_IF_ENABLED(X86::XOR8rm); |
| 5588 | break; |
| 5589 | } |
| 5590 | break; |
| 5591 | case MVT::i16: |
| 5592 | switch (Opcode) { |
| 5593 | default: llvm_unreachable("Unexpected opcode!" ); |
| 5594 | case ISD::ADD: |
| 5595 | ROpc = GET_ND_IF_ENABLED(X86::ADD16rr); |
| 5596 | MOpc = GET_ND_IF_ENABLED(X86::ADD16rm); |
| 5597 | break; |
| 5598 | case ISD::SUB: |
| 5599 | ROpc = GET_ND_IF_ENABLED(X86::SUB16rr); |
| 5600 | MOpc = GET_ND_IF_ENABLED(X86::SUB16rm); |
| 5601 | break; |
| 5602 | case ISD::AND: |
| 5603 | ROpc = GET_ND_IF_ENABLED(X86::AND16rr); |
| 5604 | MOpc = GET_ND_IF_ENABLED(X86::AND16rm); |
| 5605 | break; |
| 5606 | case ISD::OR: |
| 5607 | ROpc = GET_ND_IF_ENABLED(X86::OR16rr); |
| 5608 | MOpc = GET_ND_IF_ENABLED(X86::OR16rm); |
| 5609 | break; |
| 5610 | case ISD::XOR: |
| 5611 | ROpc = GET_ND_IF_ENABLED(X86::XOR16rr); |
| 5612 | MOpc = GET_ND_IF_ENABLED(X86::XOR16rm); |
| 5613 | break; |
| 5614 | } |
| 5615 | break; |
| 5616 | case MVT::i32: |
| 5617 | switch (Opcode) { |
| 5618 | default: llvm_unreachable("Unexpected opcode!" ); |
| 5619 | case ISD::ADD: |
| 5620 | ROpc = GET_ND_IF_ENABLED(X86::ADD32rr); |
| 5621 | MOpc = GET_ND_IF_ENABLED(X86::ADD32rm); |
| 5622 | break; |
| 5623 | case ISD::SUB: |
| 5624 | ROpc = GET_ND_IF_ENABLED(X86::SUB32rr); |
| 5625 | MOpc = GET_ND_IF_ENABLED(X86::SUB32rm); |
| 5626 | break; |
| 5627 | case ISD::AND: |
| 5628 | ROpc = GET_ND_IF_ENABLED(X86::AND32rr); |
| 5629 | MOpc = GET_ND_IF_ENABLED(X86::AND32rm); |
| 5630 | break; |
| 5631 | case ISD::OR: |
| 5632 | ROpc = GET_ND_IF_ENABLED(X86::OR32rr); |
| 5633 | MOpc = GET_ND_IF_ENABLED(X86::OR32rm); |
| 5634 | break; |
| 5635 | case ISD::XOR: |
| 5636 | ROpc = GET_ND_IF_ENABLED(X86::XOR32rr); |
| 5637 | MOpc = GET_ND_IF_ENABLED(X86::XOR32rm); |
| 5638 | break; |
| 5639 | } |
| 5640 | break; |
| 5641 | case MVT::i64: |
| 5642 | switch (Opcode) { |
| 5643 | default: llvm_unreachable("Unexpected opcode!" ); |
| 5644 | case ISD::ADD: |
| 5645 | ROpc = GET_ND_IF_ENABLED(X86::ADD64rr); |
| 5646 | MOpc = GET_ND_IF_ENABLED(X86::ADD64rm); |
| 5647 | break; |
| 5648 | case ISD::SUB: |
| 5649 | ROpc = GET_ND_IF_ENABLED(X86::SUB64rr); |
| 5650 | MOpc = GET_ND_IF_ENABLED(X86::SUB64rm); |
| 5651 | break; |
| 5652 | case ISD::AND: |
| 5653 | ROpc = GET_ND_IF_ENABLED(X86::AND64rr); |
| 5654 | MOpc = GET_ND_IF_ENABLED(X86::AND64rm); |
| 5655 | break; |
| 5656 | case ISD::OR: |
| 5657 | ROpc = GET_ND_IF_ENABLED(X86::OR64rr); |
| 5658 | MOpc = GET_ND_IF_ENABLED(X86::OR64rm); |
| 5659 | break; |
| 5660 | case ISD::XOR: |
| 5661 | ROpc = GET_ND_IF_ENABLED(X86::XOR64rr); |
| 5662 | MOpc = GET_ND_IF_ENABLED(X86::XOR64rm); |
| 5663 | break; |
| 5664 | } |
| 5665 | break; |
| 5666 | } |
| 5667 | |
// OK, this is an AND/OR/XOR/ADD/SUB with a constant.

// If this is not a subtract, we can still try to fold a load.
| 5671 | if (Opcode != ISD::SUB) { |
| 5672 | SDValue Tmp0, Tmp1, Tmp2, Tmp3, Tmp4; |
| 5673 | if (tryFoldLoad(P: Node, N: N0, Base&: Tmp0, Scale&: Tmp1, Index&: Tmp2, Disp&: Tmp3, Segment&: Tmp4)) { |
| 5674 | SDValue Ops[] = { N1, Tmp0, Tmp1, Tmp2, Tmp3, Tmp4, N0.getOperand(i: 0) }; |
| 5675 | SDVTList VTs = CurDAG->getVTList(VT1: NVT, VT2: MVT::i32, VT3: MVT::Other); |
| 5676 | MachineSDNode *CNode = CurDAG->getMachineNode(Opcode: MOpc, dl, VTs, Ops); |
| 5677 | // Update the chain. |
| 5678 | ReplaceUses(F: N0.getValue(R: 1), T: SDValue(CNode, 2)); |
| 5679 | // Record the mem-refs |
| 5680 | CurDAG->setNodeMemRefs(N: CNode, NewMemRefs: {cast<LoadSDNode>(Val&: N0)->getMemOperand()}); |
| 5681 | ReplaceUses(F: SDValue(Node, 0), T: SDValue(CNode, 0)); |
| 5682 | CurDAG->RemoveDeadNode(N: Node); |
| 5683 | return; |
| 5684 | } |
| 5685 | } |
| 5686 | |
| 5687 | CurDAG->SelectNodeTo(N: Node, MachineOpc: ROpc, VT1: NVT, VT2: MVT::i32, Op1: N0, Op2: N1); |
| 5688 | return; |
| 5689 | } |
| 5690 | |
| 5691 | case X86ISD::SMUL: |
| 5692 | // i16/i32/i64 are handled with isel patterns. |
| 5693 | if (NVT != MVT::i8) |
| 5694 | break; |
| 5695 | [[fallthrough]]; |
| 5696 | case X86ISD::UMUL: { |
| 5697 | SDValue N0 = Node->getOperand(Num: 0); |
| 5698 | SDValue N1 = Node->getOperand(Num: 1); |
| 5699 | |
| 5700 | unsigned LoReg, ROpc, MOpc; |
| 5701 | switch (NVT.SimpleTy) { |
| 5702 | default: llvm_unreachable("Unsupported VT!" ); |
| 5703 | case MVT::i8: |
| 5704 | LoReg = X86::AL; |
| 5705 | ROpc = Opcode == X86ISD::SMUL ? X86::IMUL8r : X86::MUL8r; |
| 5706 | MOpc = Opcode == X86ISD::SMUL ? X86::IMUL8m : X86::MUL8m; |
| 5707 | break; |
| 5708 | case MVT::i16: |
| 5709 | LoReg = X86::AX; |
| 5710 | ROpc = X86::MUL16r; |
| 5711 | MOpc = X86::MUL16m; |
| 5712 | break; |
| 5713 | case MVT::i32: |
| 5714 | LoReg = X86::EAX; |
| 5715 | ROpc = X86::MUL32r; |
| 5716 | MOpc = X86::MUL32m; |
| 5717 | break; |
| 5718 | case MVT::i64: |
| 5719 | LoReg = X86::RAX; |
| 5720 | ROpc = X86::MUL64r; |
| 5721 | MOpc = X86::MUL64m; |
| 5722 | break; |
| 5723 | } |
| 5724 | |
| 5725 | SDValue Tmp0, Tmp1, Tmp2, Tmp3, Tmp4; |
| 5726 | bool FoldedLoad = tryFoldLoad(P: Node, N: N1, Base&: Tmp0, Scale&: Tmp1, Index&: Tmp2, Disp&: Tmp3, Segment&: Tmp4); |
| 5727 | // Multiply is commutative. |
| 5728 | if (!FoldedLoad) { |
| 5729 | FoldedLoad = tryFoldLoad(P: Node, N: N0, Base&: Tmp0, Scale&: Tmp1, Index&: Tmp2, Disp&: Tmp3, Segment&: Tmp4); |
| 5730 | if (FoldedLoad) |
| 5731 | std::swap(a&: N0, b&: N1); |
| 5732 | } |
| 5733 | |
| 5734 | SDValue InGlue = CurDAG->getCopyToReg(Chain: CurDAG->getEntryNode(), dl, Reg: LoReg, |
| 5735 | N: N0, Glue: SDValue()).getValue(R: 1); |
| 5736 | |
| 5737 | MachineSDNode *CNode; |
| 5738 | if (FoldedLoad) { |
| 5739 | // i16/i32/i64 use an instruction that produces a low and high result even |
| 5740 | // though only the low result is used. |
| 5741 | SDVTList VTs; |
| 5742 | if (NVT == MVT::i8) |
| 5743 | VTs = CurDAG->getVTList(VT1: NVT, VT2: MVT::i32, VT3: MVT::Other); |
| 5744 | else |
| 5745 | VTs = CurDAG->getVTList(VT1: NVT, VT2: NVT, VT3: MVT::i32, VT4: MVT::Other); |
| 5746 | |
| 5747 | SDValue Ops[] = { Tmp0, Tmp1, Tmp2, Tmp3, Tmp4, N1.getOperand(i: 0), |
| 5748 | InGlue }; |
| 5749 | CNode = CurDAG->getMachineNode(Opcode: MOpc, dl, VTs, Ops); |
| 5750 | |
| 5751 | // Update the chain. |
| 5752 | ReplaceUses(F: N1.getValue(R: 1), T: SDValue(CNode, NVT == MVT::i8 ? 2 : 3)); |
| 5753 | // Record the mem-refs |
| 5754 | CurDAG->setNodeMemRefs(N: CNode, NewMemRefs: {cast<LoadSDNode>(Val&: N1)->getMemOperand()}); |
| 5755 | } else { |
| 5756 | // i16/i32/i64 use an instruction that produces a low and high result even |
| 5757 | // though only the low result is used. |
| 5758 | SDVTList VTs; |
| 5759 | if (NVT == MVT::i8) |
| 5760 | VTs = CurDAG->getVTList(VT1: NVT, VT2: MVT::i32); |
| 5761 | else |
| 5762 | VTs = CurDAG->getVTList(VT1: NVT, VT2: NVT, VT3: MVT::i32); |
| 5763 | |
| 5764 | CNode = CurDAG->getMachineNode(Opcode: ROpc, dl, VTs, Ops: {N1, InGlue}); |
| 5765 | } |
| 5766 | |
| 5767 | ReplaceUses(F: SDValue(Node, 0), T: SDValue(CNode, 0)); |
| 5768 | ReplaceUses(F: SDValue(Node, 1), T: SDValue(CNode, NVT == MVT::i8 ? 1 : 2)); |
| 5769 | CurDAG->RemoveDeadNode(N: Node); |
| 5770 | return; |
| 5771 | } |
| 5772 | |
| 5773 | case ISD::SMUL_LOHI: |
| 5774 | case ISD::UMUL_LOHI: { |
| 5775 | SDValue N0 = Node->getOperand(Num: 0); |
| 5776 | SDValue N1 = Node->getOperand(Num: 1); |
| 5777 | |
| 5778 | unsigned Opc, MOpc; |
| 5779 | unsigned LoReg, HiReg; |
| 5780 | bool IsSigned = Opcode == ISD::SMUL_LOHI; |
| 5781 | bool UseMULX = !IsSigned && Subtarget->hasBMI2(); |
| 5782 | bool UseMULXHi = UseMULX && SDValue(Node, 0).use_empty(); |
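// MULX (BMI2) implicitly reads one factor from EDX/RDX, writes the high
// and low halves to arbitrary destination registers, and leaves EFLAGS
// untouched; that is why LoReg becomes EDX/RDX below. The MULX*Hrr/Hrm
// forms are pseudos used when only the high half is needed.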
| 5783 | switch (NVT.SimpleTy) { |
| 5784 | default: llvm_unreachable("Unsupported VT!" ); |
| 5785 | case MVT::i32: |
| 5786 | Opc = UseMULXHi ? X86::MULX32Hrr |
| 5787 | : UseMULX ? GET_EGPR_IF_ENABLED(X86::MULX32rr) |
| 5788 | : IsSigned ? X86::IMUL32r |
| 5789 | : X86::MUL32r; |
| 5790 | MOpc = UseMULXHi ? X86::MULX32Hrm |
| 5791 | : UseMULX ? GET_EGPR_IF_ENABLED(X86::MULX32rm) |
| 5792 | : IsSigned ? X86::IMUL32m |
| 5793 | : X86::MUL32m; |
| 5794 | LoReg = UseMULX ? X86::EDX : X86::EAX; |
| 5795 | HiReg = X86::EDX; |
| 5796 | break; |
| 5797 | case MVT::i64: |
| 5798 | Opc = UseMULXHi ? X86::MULX64Hrr |
| 5799 | : UseMULX ? GET_EGPR_IF_ENABLED(X86::MULX64rr) |
| 5800 | : IsSigned ? X86::IMUL64r |
| 5801 | : X86::MUL64r; |
| 5802 | MOpc = UseMULXHi ? X86::MULX64Hrm |
| 5803 | : UseMULX ? GET_EGPR_IF_ENABLED(X86::MULX64rm) |
| 5804 | : IsSigned ? X86::IMUL64m |
| 5805 | : X86::MUL64m; |
| 5806 | LoReg = UseMULX ? X86::RDX : X86::RAX; |
| 5807 | HiReg = X86::RDX; |
| 5808 | break; |
| 5809 | } |
| 5810 | |
| 5811 | SDValue Tmp0, Tmp1, Tmp2, Tmp3, Tmp4; |
| 5812 | bool foldedLoad = tryFoldLoad(P: Node, N: N1, Base&: Tmp0, Scale&: Tmp1, Index&: Tmp2, Disp&: Tmp3, Segment&: Tmp4); |
| 5813 | // Multiply is commutative. |
| 5814 | if (!foldedLoad) { |
| 5815 | foldedLoad = tryFoldLoad(P: Node, N: N0, Base&: Tmp0, Scale&: Tmp1, Index&: Tmp2, Disp&: Tmp3, Segment&: Tmp4); |
| 5816 | if (foldedLoad) |
| 5817 | std::swap(a&: N0, b&: N1); |
| 5818 | } |
| 5819 | |
| 5820 | SDValue InGlue = CurDAG->getCopyToReg(Chain: CurDAG->getEntryNode(), dl, Reg: LoReg, |
| 5821 | N: N0, Glue: SDValue()).getValue(R: 1); |
| 5822 | SDValue ResHi, ResLo; |
| 5823 | if (foldedLoad) { |
| 5824 | SDValue Chain; |
| 5825 | MachineSDNode *CNode = nullptr; |
| 5826 | SDValue Ops[] = { Tmp0, Tmp1, Tmp2, Tmp3, Tmp4, N1.getOperand(i: 0), |
| 5827 | InGlue }; |
| 5828 | if (UseMULXHi) { |
| 5829 | SDVTList VTs = CurDAG->getVTList(VT1: NVT, VT2: MVT::Other); |
| 5830 | CNode = CurDAG->getMachineNode(Opcode: MOpc, dl, VTs, Ops); |
| 5831 | ResHi = SDValue(CNode, 0); |
| 5832 | Chain = SDValue(CNode, 1); |
| 5833 | } else if (UseMULX) { |
| 5834 | SDVTList VTs = CurDAG->getVTList(VT1: NVT, VT2: NVT, VT3: MVT::Other); |
| 5835 | CNode = CurDAG->getMachineNode(Opcode: MOpc, dl, VTs, Ops); |
| 5836 | ResHi = SDValue(CNode, 0); |
| 5837 | ResLo = SDValue(CNode, 1); |
| 5838 | Chain = SDValue(CNode, 2); |
| 5839 | } else { |
| 5840 | SDVTList VTs = CurDAG->getVTList(VT1: MVT::Other, VT2: MVT::Glue); |
| 5841 | CNode = CurDAG->getMachineNode(Opcode: MOpc, dl, VTs, Ops); |
| 5842 | Chain = SDValue(CNode, 0); |
| 5843 | InGlue = SDValue(CNode, 1); |
| 5844 | } |
| 5845 | |
| 5846 | // Update the chain. |
| 5847 | ReplaceUses(F: N1.getValue(R: 1), T: Chain); |
| 5848 | // Record the mem-refs |
| 5849 | CurDAG->setNodeMemRefs(N: CNode, NewMemRefs: {cast<LoadSDNode>(Val&: N1)->getMemOperand()}); |
| 5850 | } else { |
| 5851 | SDValue Ops[] = { N1, InGlue }; |
| 5852 | if (UseMULXHi) { |
| 5853 | SDVTList VTs = CurDAG->getVTList(VT: NVT); |
| 5854 | SDNode *CNode = CurDAG->getMachineNode(Opcode: Opc, dl, VTs, Ops); |
| 5855 | ResHi = SDValue(CNode, 0); |
| 5856 | } else if (UseMULX) { |
| 5857 | SDVTList VTs = CurDAG->getVTList(VT1: NVT, VT2: NVT); |
| 5858 | SDNode *CNode = CurDAG->getMachineNode(Opcode: Opc, dl, VTs, Ops); |
| 5859 | ResHi = SDValue(CNode, 0); |
| 5860 | ResLo = SDValue(CNode, 1); |
| 5861 | } else { |
| 5862 | SDVTList VTs = CurDAG->getVTList(VT: MVT::Glue); |
| 5863 | SDNode *CNode = CurDAG->getMachineNode(Opcode: Opc, dl, VTs, Ops); |
| 5864 | InGlue = SDValue(CNode, 0); |
| 5865 | } |
| 5866 | } |
| 5867 | |
| 5868 | // Copy the low half of the result, if it is needed. |
| 5869 | if (!SDValue(Node, 0).use_empty()) { |
| 5870 | if (!ResLo) { |
| 5871 | assert(LoReg && "Register for low half is not defined!" ); |
| 5872 | ResLo = CurDAG->getCopyFromReg(Chain: CurDAG->getEntryNode(), dl, Reg: LoReg, |
| 5873 | VT: NVT, Glue: InGlue); |
| 5874 | InGlue = ResLo.getValue(R: 2); |
| 5875 | } |
| 5876 | ReplaceUses(F: SDValue(Node, 0), T: ResLo); |
| 5877 | LLVM_DEBUG(dbgs() << "=> " ; ResLo.getNode()->dump(CurDAG); |
| 5878 | dbgs() << '\n'); |
| 5879 | } |
| 5880 | // Copy the high half of the result, if it is needed. |
| 5881 | if (!SDValue(Node, 1).use_empty()) { |
| 5882 | if (!ResHi) { |
| 5883 | assert(HiReg && "Register for high half is not defined!" ); |
| 5884 | ResHi = CurDAG->getCopyFromReg(Chain: CurDAG->getEntryNode(), dl, Reg: HiReg, |
| 5885 | VT: NVT, Glue: InGlue); |
| 5886 | InGlue = ResHi.getValue(R: 2); |
| 5887 | } |
| 5888 | ReplaceUses(F: SDValue(Node, 1), T: ResHi); |
| 5889 | LLVM_DEBUG(dbgs() << "=> " ; ResHi.getNode()->dump(CurDAG); |
| 5890 | dbgs() << '\n'); |
| 5891 | } |
| 5892 | |
| 5893 | CurDAG->RemoveDeadNode(N: Node); |
| 5894 | return; |
| 5895 | } |
| 5896 | |
| 5897 | case ISD::SDIVREM: |
| 5898 | case ISD::UDIVREM: { |
| 5899 | SDValue N0 = Node->getOperand(Num: 0); |
| 5900 | SDValue N1 = Node->getOperand(Num: 1); |
| 5901 | |
| 5902 | unsigned ROpc, MOpc; |
| 5903 | bool isSigned = Opcode == ISD::SDIVREM; |
| 5904 | if (!isSigned) { |
| 5905 | switch (NVT.SimpleTy) { |
| 5906 | default: llvm_unreachable("Unsupported VT!" ); |
| 5907 | case MVT::i8: ROpc = X86::DIV8r; MOpc = X86::DIV8m; break; |
| 5908 | case MVT::i16: ROpc = X86::DIV16r; MOpc = X86::DIV16m; break; |
| 5909 | case MVT::i32: ROpc = X86::DIV32r; MOpc = X86::DIV32m; break; |
| 5910 | case MVT::i64: ROpc = X86::DIV64r; MOpc = X86::DIV64m; break; |
| 5911 | } |
| 5912 | } else { |
| 5913 | switch (NVT.SimpleTy) { |
| 5914 | default: llvm_unreachable("Unsupported VT!" ); |
| 5915 | case MVT::i8: ROpc = X86::IDIV8r; MOpc = X86::IDIV8m; break; |
| 5916 | case MVT::i16: ROpc = X86::IDIV16r; MOpc = X86::IDIV16m; break; |
| 5917 | case MVT::i32: ROpc = X86::IDIV32r; MOpc = X86::IDIV32m; break; |
| 5918 | case MVT::i64: ROpc = X86::IDIV64r; MOpc = X86::IDIV64m; break; |
| 5919 | } |
| 5920 | } |
| 5921 | |
| 5922 | unsigned LoReg, HiReg, ClrReg; |
| 5923 | unsigned SExtOpcode; |
| 5924 | switch (NVT.SimpleTy) { |
| 5925 | default: llvm_unreachable("Unsupported VT!" ); |
| 5926 | case MVT::i8: |
| 5927 | LoReg = X86::AL; ClrReg = HiReg = X86::AH; |
| 5928 | SExtOpcode = 0; // Not used. |
| 5929 | break; |
| 5930 | case MVT::i16: |
| 5931 | LoReg = X86::AX; HiReg = X86::DX; |
| 5932 | ClrReg = X86::DX; |
| 5933 | SExtOpcode = X86::CWD; |
| 5934 | break; |
| 5935 | case MVT::i32: |
| 5936 | LoReg = X86::EAX; ClrReg = HiReg = X86::EDX; |
| 5937 | SExtOpcode = X86::CDQ; |
| 5938 | break; |
| 5939 | case MVT::i64: |
| 5940 | LoReg = X86::RAX; ClrReg = HiReg = X86::RDX; |
| 5941 | SExtOpcode = X86::CQO; |
| 5942 | break; |
| 5943 | } |
| 5944 | |
| 5945 | SDValue Tmp0, Tmp1, Tmp2, Tmp3, Tmp4; |
| 5946 | bool foldedLoad = tryFoldLoad(P: Node, N: N1, Base&: Tmp0, Scale&: Tmp1, Index&: Tmp2, Disp&: Tmp3, Segment&: Tmp4); |
| 5947 | bool signBitIsZero = CurDAG->SignBitIsZero(Op: N0); |
| 5948 | |
| 5949 | SDValue InGlue; |
| 5950 | if (NVT == MVT::i8) { |
// Special case for div8: just use a move with zero extension to AX to
// clear the upper 8 bits (AH).
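// For example: "movzbl %cl, %eax; divb %bl" divides AX by BL, leaving
// the quotient in AL and the remainder in AH.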
| 5953 | SDValue Tmp0, Tmp1, Tmp2, Tmp3, Tmp4, Chain; |
| 5954 | MachineSDNode *Move; |
| 5955 | if (tryFoldLoad(P: Node, N: N0, Base&: Tmp0, Scale&: Tmp1, Index&: Tmp2, Disp&: Tmp3, Segment&: Tmp4)) { |
| 5956 | SDValue Ops[] = { Tmp0, Tmp1, Tmp2, Tmp3, Tmp4, N0.getOperand(i: 0) }; |
| 5957 | unsigned Opc = (isSigned && !signBitIsZero) ? X86::MOVSX16rm8 |
| 5958 | : X86::MOVZX16rm8; |
| 5959 | Move = CurDAG->getMachineNode(Opcode: Opc, dl, VT1: MVT::i16, VT2: MVT::Other, Ops); |
| 5960 | Chain = SDValue(Move, 1); |
| 5961 | ReplaceUses(F: N0.getValue(R: 1), T: Chain); |
| 5962 | // Record the mem-refs |
| 5963 | CurDAG->setNodeMemRefs(N: Move, NewMemRefs: {cast<LoadSDNode>(Val&: N0)->getMemOperand()}); |
| 5964 | } else { |
| 5965 | unsigned Opc = (isSigned && !signBitIsZero) ? X86::MOVSX16rr8 |
| 5966 | : X86::MOVZX16rr8; |
| 5967 | Move = CurDAG->getMachineNode(Opcode: Opc, dl, VT: MVT::i16, Op1: N0); |
| 5968 | Chain = CurDAG->getEntryNode(); |
| 5969 | } |
| 5970 | Chain = CurDAG->getCopyToReg(Chain, dl, Reg: X86::AX, N: SDValue(Move, 0), |
| 5971 | Glue: SDValue()); |
| 5972 | InGlue = Chain.getValue(R: 1); |
| 5973 | } else { |
| 5974 | InGlue = |
| 5975 | CurDAG->getCopyToReg(Chain: CurDAG->getEntryNode(), dl, |
| 5976 | Reg: LoReg, N: N0, Glue: SDValue()).getValue(R: 1); |
| 5977 | if (isSigned && !signBitIsZero) { |
| 5978 | // Sign extend the low part into the high part. |
| 5979 | InGlue = |
| 5980 | SDValue(CurDAG->getMachineNode(Opcode: SExtOpcode, dl, VT: MVT::Glue, Op1: InGlue),0); |
| 5981 | } else { |
| 5982 | // Zero out the high part, effectively zero extending the input. |
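// MOV32r0 is a pseudo that expands to "xor reg, reg"; because 32-bit
// writes implicitly clear the upper 32 bits, SUBREG_TO_REG below can
// model the i64 zero-extension without an extra instruction.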
| 5983 | SDVTList VTs = CurDAG->getVTList(VT1: MVT::i32, VT2: MVT::i32); |
| 5984 | SDValue ClrNode = |
| 5985 | SDValue(CurDAG->getMachineNode(Opcode: X86::MOV32r0, dl, VTs, Ops: {}), 0); |
| 5986 | switch (NVT.SimpleTy) { |
| 5987 | case MVT::i16: |
| 5988 | ClrNode = |
| 5989 | SDValue(CurDAG->getMachineNode( |
| 5990 | Opcode: TargetOpcode::EXTRACT_SUBREG, dl, VT: MVT::i16, Op1: ClrNode, |
| 5991 | Op2: CurDAG->getTargetConstant(Val: X86::sub_16bit, DL: dl, |
| 5992 | VT: MVT::i32)), |
| 5993 | 0); |
| 5994 | break; |
| 5995 | case MVT::i32: |
| 5996 | break; |
| 5997 | case MVT::i64: |
| 5998 | ClrNode = |
| 5999 | SDValue(CurDAG->getMachineNode( |
| 6000 | Opcode: TargetOpcode::SUBREG_TO_REG, dl, VT: MVT::i64, |
| 6001 | Op1: CurDAG->getTargetConstant(Val: 0, DL: dl, VT: MVT::i64), Op2: ClrNode, |
| 6002 | Op3: CurDAG->getTargetConstant(Val: X86::sub_32bit, DL: dl, |
| 6003 | VT: MVT::i32)), |
| 6004 | 0); |
| 6005 | break; |
| 6006 | default: |
| 6007 | llvm_unreachable("Unexpected division source" ); |
| 6008 | } |
| 6009 | |
| 6010 | InGlue = CurDAG->getCopyToReg(Chain: CurDAG->getEntryNode(), dl, Reg: ClrReg, |
| 6011 | N: ClrNode, Glue: InGlue).getValue(R: 1); |
| 6012 | } |
| 6013 | } |
| 6014 | |
| 6015 | if (foldedLoad) { |
| 6016 | SDValue Ops[] = { Tmp0, Tmp1, Tmp2, Tmp3, Tmp4, N1.getOperand(i: 0), |
| 6017 | InGlue }; |
| 6018 | MachineSDNode *CNode = |
| 6019 | CurDAG->getMachineNode(Opcode: MOpc, dl, VT1: MVT::Other, VT2: MVT::Glue, Ops); |
| 6020 | InGlue = SDValue(CNode, 1); |
| 6021 | // Update the chain. |
| 6022 | ReplaceUses(F: N1.getValue(R: 1), T: SDValue(CNode, 0)); |
| 6023 | // Record the mem-refs |
| 6024 | CurDAG->setNodeMemRefs(N: CNode, NewMemRefs: {cast<LoadSDNode>(Val&: N1)->getMemOperand()}); |
| 6025 | } else { |
| 6026 | InGlue = |
| 6027 | SDValue(CurDAG->getMachineNode(Opcode: ROpc, dl, VT: MVT::Glue, Op1: N1, Op2: InGlue), 0); |
| 6028 | } |
| 6029 | |
| 6030 | // Prevent use of AH in a REX instruction by explicitly copying it to |
| 6031 | // an ABCD_L register. |
| 6032 | // |
| 6033 | // The current assumption of the register allocator is that isel |
| 6034 | // won't generate explicit references to the GR8_ABCD_H registers. If |
| 6035 | // the allocator and/or the backend get enhanced to be more robust in |
| 6036 | // that regard, this can be, and should be, removed. |
| 6037 | if (HiReg == X86::AH && !SDValue(Node, 1).use_empty()) { |
| 6038 | SDValue AHCopy = CurDAG->getRegister(Reg: X86::AH, VT: MVT::i8); |
| 6039 | unsigned AHExtOpcode = |
| 6040 | isSigned ? X86::MOVSX32rr8_NOREX : X86::MOVZX32rr8_NOREX; |
| 6041 | |
| 6042 | SDNode *RNode = CurDAG->getMachineNode(Opcode: AHExtOpcode, dl, VT1: MVT::i32, |
| 6043 | VT2: MVT::Glue, Op1: AHCopy, Op2: InGlue); |
| 6044 | SDValue Result(RNode, 0); |
| 6045 | InGlue = SDValue(RNode, 1); |
| 6046 | |
| 6047 | Result = |
| 6048 | CurDAG->getTargetExtractSubreg(SRIdx: X86::sub_8bit, DL: dl, VT: MVT::i8, Operand: Result); |
| 6049 | |
| 6050 | ReplaceUses(F: SDValue(Node, 1), T: Result); |
| 6051 | LLVM_DEBUG(dbgs() << "=> " ; Result.getNode()->dump(CurDAG); |
| 6052 | dbgs() << '\n'); |
| 6053 | } |
| 6054 | // Copy the division (low) result, if it is needed. |
| 6055 | if (!SDValue(Node, 0).use_empty()) { |
| 6056 | SDValue Result = CurDAG->getCopyFromReg(Chain: CurDAG->getEntryNode(), dl, |
| 6057 | Reg: LoReg, VT: NVT, Glue: InGlue); |
| 6058 | InGlue = Result.getValue(R: 2); |
| 6059 | ReplaceUses(F: SDValue(Node, 0), T: Result); |
| 6060 | LLVM_DEBUG(dbgs() << "=> " ; Result.getNode()->dump(CurDAG); |
| 6061 | dbgs() << '\n'); |
| 6062 | } |
| 6063 | // Copy the remainder (high) result, if it is needed. |
| 6064 | if (!SDValue(Node, 1).use_empty()) { |
| 6065 | SDValue Result = CurDAG->getCopyFromReg(Chain: CurDAG->getEntryNode(), dl, |
| 6066 | Reg: HiReg, VT: NVT, Glue: InGlue); |
| 6067 | InGlue = Result.getValue(R: 2); |
| 6068 | ReplaceUses(F: SDValue(Node, 1), T: Result); |
| 6069 | LLVM_DEBUG(dbgs() << "=> " ; Result.getNode()->dump(CurDAG); |
| 6070 | dbgs() << '\n'); |
| 6071 | } |
| 6072 | CurDAG->RemoveDeadNode(N: Node); |
| 6073 | return; |
| 6074 | } |
| 6075 | |
| 6076 | case X86ISD::FCMP: |
| 6077 | case X86ISD::STRICT_FCMP: |
| 6078 | case X86ISD::STRICT_FCMPS: { |
| 6079 | bool IsStrictCmp = Node->getOpcode() == X86ISD::STRICT_FCMP || |
| 6080 | Node->getOpcode() == X86ISD::STRICT_FCMPS; |
| 6081 | SDValue N0 = Node->getOperand(Num: IsStrictCmp ? 1 : 0); |
| 6082 | SDValue N1 = Node->getOperand(Num: IsStrictCmp ? 2 : 1); |
| 6083 | |
| 6084 | // Save the original VT of the compare. |
| 6085 | MVT CmpVT = N0.getSimpleValueType(); |
| 6086 | |
| 6087 | // Floating point needs special handling if we don't have FCOMI. |
| 6088 | if (Subtarget->canUseCMOV()) |
| 6089 | break; |
| 6090 | |
| 6091 | bool IsSignaling = Node->getOpcode() == X86ISD::STRICT_FCMPS; |
| 6092 | |
| 6093 | unsigned Opc; |
| 6094 | switch (CmpVT.SimpleTy) { |
| 6095 | default: llvm_unreachable("Unexpected type!" ); |
| 6096 | case MVT::f32: |
| 6097 | Opc = IsSignaling ? X86::COM_Fpr32 : X86::UCOM_Fpr32; |
| 6098 | break; |
| 6099 | case MVT::f64: |
| 6100 | Opc = IsSignaling ? X86::COM_Fpr64 : X86::UCOM_Fpr64; |
| 6101 | break; |
| 6102 | case MVT::f80: |
| 6103 | Opc = IsSignaling ? X86::COM_Fpr80 : X86::UCOM_Fpr80; |
| 6104 | break; |
| 6105 | } |
| 6106 | |
| 6107 | SDValue Chain = |
| 6108 | IsStrictCmp ? Node->getOperand(Num: 0) : CurDAG->getEntryNode(); |
| 6109 | SDValue Glue; |
| 6110 | if (IsStrictCmp) { |
| 6111 | SDVTList VTs = CurDAG->getVTList(VT1: MVT::Other, VT2: MVT::Glue); |
| 6112 | Chain = SDValue(CurDAG->getMachineNode(Opcode: Opc, dl, VTs, Ops: {N0, N1, Chain}), 0); |
| 6113 | Glue = Chain.getValue(R: 1); |
| 6114 | } else { |
| 6115 | Glue = SDValue(CurDAG->getMachineNode(Opcode: Opc, dl, VT: MVT::Glue, Op1: N0, Op2: N1), 0); |
| 6116 | } |
| 6117 | |
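// Without FCOMI the classic x87 sequence is used: the compare sets the
// condition codes in the FPU status word, FNSTSW copies them to AX, and
// SAHF transfers AH into EFLAGS (C0 -> CF, C2 -> PF, C3 -> ZF).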
| 6118 | // Move FPSW to AX. |
| 6119 | SDValue FNSTSW = |
| 6120 | SDValue(CurDAG->getMachineNode(Opcode: X86::FNSTSW16r, dl, VT: MVT::i16, Op1: Glue), 0); |
| 6121 | |
| 6122 | // Extract upper 8-bits of AX. |
SDValue Extract =
| 6124 | CurDAG->getTargetExtractSubreg(SRIdx: X86::sub_8bit_hi, DL: dl, VT: MVT::i8, Operand: FNSTSW); |
| 6125 | |
| 6126 | // Move AH into flags. |
| 6127 | // Some 64-bit targets lack SAHF support, but they do support FCOMI. |
| 6128 | assert(Subtarget->canUseLAHFSAHF() && |
| 6129 | "Target doesn't support SAHF or FCOMI?" ); |
| 6130 | SDValue AH = CurDAG->getCopyToReg(Chain, dl, Reg: X86::AH, N: Extract, Glue: SDValue()); |
| 6131 | Chain = AH; |
| 6132 | SDValue SAHF = SDValue( |
| 6133 | CurDAG->getMachineNode(Opcode: X86::SAHF, dl, VT: MVT::i32, Op1: AH.getValue(R: 1)), 0); |
| 6134 | |
| 6135 | if (IsStrictCmp) |
| 6136 | ReplaceUses(F: SDValue(Node, 1), T: Chain); |
| 6137 | |
| 6138 | ReplaceUses(F: SDValue(Node, 0), T: SAHF); |
| 6139 | CurDAG->RemoveDeadNode(N: Node); |
| 6140 | return; |
| 6141 | } |
| 6142 | |
| 6143 | case X86ISD::CMP: { |
| 6144 | SDValue N0 = Node->getOperand(Num: 0); |
| 6145 | SDValue N1 = Node->getOperand(Num: 1); |
| 6146 | |
| 6147 | // Optimizations for TEST compares. |
| 6148 | if (!isNullConstant(V: N1)) |
| 6149 | break; |
| 6150 | |
| 6151 | // Save the original VT of the compare. |
| 6152 | MVT CmpVT = N0.getSimpleValueType(); |
| 6153 | |
// If we are comparing (and (shr X, C), Mask) with 0, emit a BEXTR followed
| 6155 | // by a test instruction. The test should be removed later by |
| 6156 | // analyzeCompare if we are using only the zero flag. |
| 6157 | // TODO: Should we check the users and use the BEXTR flags directly? |
| 6158 | if (N0.getOpcode() == ISD::AND && N0.hasOneUse()) { |
| 6159 | if (MachineSDNode *NewNode = matchBEXTRFromAndImm(Node: N0.getNode())) { |
| 6160 | unsigned TestOpc = CmpVT == MVT::i64 ? X86::TEST64rr |
| 6161 | : X86::TEST32rr; |
| 6162 | SDValue BEXTR = SDValue(NewNode, 0); |
| 6163 | NewNode = CurDAG->getMachineNode(Opcode: TestOpc, dl, VT: MVT::i32, Op1: BEXTR, Op2: BEXTR); |
| 6164 | ReplaceUses(F: SDValue(Node, 0), T: SDValue(NewNode, 0)); |
| 6165 | CurDAG->RemoveDeadNode(N: Node); |
| 6166 | return; |
| 6167 | } |
| 6168 | } |
| 6169 | |
| 6170 | // We can peek through truncates, but we need to be careful below. |
| 6171 | if (N0.getOpcode() == ISD::TRUNCATE && N0.hasOneUse()) |
| 6172 | N0 = N0.getOperand(i: 0); |
| 6173 | |
| 6174 | // Look for (X86cmp (and $op, $imm), 0) and see if we can convert it to |
| 6175 | // use a smaller encoding. |
| 6176 | // Look past the truncate if CMP is the only use of it. |
| 6177 | if (N0.getOpcode() == ISD::AND && N0.getNode()->hasOneUse() && |
| 6178 | N0.getValueType() != MVT::i8) { |
| 6179 | auto *MaskC = dyn_cast<ConstantSDNode>(Val: N0.getOperand(i: 1)); |
| 6180 | if (!MaskC) |
| 6181 | break; |
| 6182 | |
| 6183 | // We may have looked through a truncate so mask off any bits that |
| 6184 | // shouldn't be part of the compare. |
| 6185 | uint64_t Mask = MaskC->getZExtValue(); |
| 6186 | Mask &= maskTrailingOnes<uint64_t>(N: CmpVT.getScalarSizeInBits()); |
| 6187 | |
| 6188 | // Check if we can replace AND+IMM{32,64} with a shift. This is possible |
| 6189 | // for masks like 0xFF000000 or 0x00FFFFFF and if we care only about the |
| 6190 | // zero flag. |
| 6191 | if (CmpVT == MVT::i64 && !isInt<8>(x: Mask) && isShiftedMask_64(Value: Mask) && |
| 6192 | onlyUsesZeroFlag(Flags: SDValue(Node, 0))) { |
| 6193 | unsigned ShiftOpcode = ISD::DELETED_NODE; |
| 6194 | unsigned ShiftAmt; |
| 6195 | unsigned SubRegIdx; |
| 6196 | MVT SubRegVT; |
| 6197 | unsigned TestOpcode; |
| 6198 | unsigned LeadingZeros = llvm::countl_zero(Val: Mask); |
| 6199 | unsigned TrailingZeros = llvm::countr_zero(Val: Mask); |
| 6200 | |
| 6201 | // With leading/trailing zeros, the transform is profitable if we can |
| 6202 | // eliminate a movabsq or shrink a 32-bit immediate to 8-bit without |
| 6203 | // incurring any extra register moves. |
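// For example, "testq $0xFF00000000, %rax" would need a movabsq for the
// immediate; it can instead become "shrq $32, %rax" followed by
// "testb %al, %al" (the PopCount == 8 case below).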
| 6204 | bool SavesBytes = !isInt<32>(x: Mask) || N0.getOperand(i: 0).hasOneUse(); |
| 6205 | if (LeadingZeros == 0 && SavesBytes) { |
| 6206 | // If the mask covers the most significant bit, then we can replace |
| 6207 | // TEST+AND with a SHR and check eflags. |
| 6208 | // This emits a redundant TEST which is subsequently eliminated. |
| 6209 | ShiftOpcode = GET_ND_IF_ENABLED(X86::SHR64ri); |
| 6210 | ShiftAmt = TrailingZeros; |
| 6211 | SubRegIdx = 0; |
| 6212 | TestOpcode = X86::TEST64rr; |
| 6213 | } else if (TrailingZeros == 0 && SavesBytes) { |
| 6214 | // If the mask covers the least significant bit, then we can replace |
| 6215 | // TEST+AND with a SHL and check eflags. |
| 6216 | // This emits a redundant TEST which is subsequently eliminated. |
| 6217 | ShiftOpcode = GET_ND_IF_ENABLED(X86::SHL64ri); |
| 6218 | ShiftAmt = LeadingZeros; |
| 6219 | SubRegIdx = 0; |
| 6220 | TestOpcode = X86::TEST64rr; |
| 6221 | } else if (MaskC->hasOneUse() && !isInt<32>(x: Mask)) { |
| 6222 | // If the shifted mask extends into the high half and is 8/16/32 bits |
| 6223 | // wide, then replace it with a SHR and a TEST8rr/TEST16rr/TEST32rr. |
| 6224 | unsigned PopCount = 64 - LeadingZeros - TrailingZeros; |
| 6225 | if (PopCount == 8) { |
| 6226 | ShiftOpcode = GET_ND_IF_ENABLED(X86::SHR64ri); |
| 6227 | ShiftAmt = TrailingZeros; |
| 6228 | SubRegIdx = X86::sub_8bit; |
| 6229 | SubRegVT = MVT::i8; |
| 6230 | TestOpcode = X86::TEST8rr; |
| 6231 | } else if (PopCount == 16) { |
| 6232 | ShiftOpcode = GET_ND_IF_ENABLED(X86::SHR64ri); |
| 6233 | ShiftAmt = TrailingZeros; |
| 6234 | SubRegIdx = X86::sub_16bit; |
| 6235 | SubRegVT = MVT::i16; |
| 6236 | TestOpcode = X86::TEST16rr; |
| 6237 | } else if (PopCount == 32) { |
| 6238 | ShiftOpcode = GET_ND_IF_ENABLED(X86::SHR64ri); |
| 6239 | ShiftAmt = TrailingZeros; |
| 6240 | SubRegIdx = X86::sub_32bit; |
| 6241 | SubRegVT = MVT::i32; |
| 6242 | TestOpcode = X86::TEST32rr; |
| 6243 | } |
| 6244 | } |
| 6245 | if (ShiftOpcode != ISD::DELETED_NODE) { |
| 6246 | SDValue ShiftC = CurDAG->getTargetConstant(Val: ShiftAmt, DL: dl, VT: MVT::i64); |
| 6247 | SDValue Shift = SDValue( |
| 6248 | CurDAG->getMachineNode(Opcode: ShiftOpcode, dl, VT1: MVT::i64, VT2: MVT::i32, |
| 6249 | Op1: N0.getOperand(i: 0), Op2: ShiftC), |
| 6250 | 0); |
| 6251 | if (SubRegIdx != 0) { |
| 6252 | Shift = |
| 6253 | CurDAG->getTargetExtractSubreg(SRIdx: SubRegIdx, DL: dl, VT: SubRegVT, Operand: Shift); |
| 6254 | } |
| 6255 | MachineSDNode *Test = |
| 6256 | CurDAG->getMachineNode(Opcode: TestOpcode, dl, VT: MVT::i32, Op1: Shift, Op2: Shift); |
| 6257 | ReplaceNode(F: Node, T: Test); |
| 6258 | return; |
| 6259 | } |
| 6260 | } |
| 6261 | |
| 6262 | MVT VT; |
| 6263 | int SubRegOp; |
| 6264 | unsigned ROpc, MOpc; |
| 6265 | |
| 6266 | // For each of these checks we need to be careful if the sign flag is |
// being used. It is only safe to use the sign flag in two conditions:
| 6268 | // either the sign bit in the shrunken mask is zero or the final test |
| 6269 | // size is equal to the original compare size. |
| 6270 | |
| 6271 | if (isUInt<8>(x: Mask) && |
| 6272 | (!(Mask & 0x80) || CmpVT == MVT::i8 || |
| 6273 | hasNoSignFlagUses(Flags: SDValue(Node, 0)))) { |
| 6274 | // For example, convert "testl %eax, $8" to "testb %al, $8" |
| 6275 | VT = MVT::i8; |
| 6276 | SubRegOp = X86::sub_8bit; |
| 6277 | ROpc = X86::TEST8ri; |
| 6278 | MOpc = X86::TEST8mi; |
| 6279 | } else if (OptForMinSize && isUInt<16>(x: Mask) && |
| 6280 | (!(Mask & 0x8000) || CmpVT == MVT::i16 || |
| 6281 | hasNoSignFlagUses(Flags: SDValue(Node, 0)))) { |
| 6282 | // For example, "testl %eax, $32776" to "testw %ax, $32776". |
| 6283 | // NOTE: We only want to form TESTW instructions if optimizing for |
| 6284 | // min size. Otherwise we only save one byte and possibly get a length |
| 6285 | // changing prefix penalty in the decoders. |
| 6286 | VT = MVT::i16; |
| 6287 | SubRegOp = X86::sub_16bit; |
| 6288 | ROpc = X86::TEST16ri; |
| 6289 | MOpc = X86::TEST16mi; |
| 6290 | } else if (isUInt<32>(x: Mask) && N0.getValueType() != MVT::i16 && |
| 6291 | ((!(Mask & 0x80000000) && |
| 6292 | // Without minsize 16-bit Cmps can get here so we need to |
| 6293 | // be sure we calculate the correct sign flag if needed. |
| 6294 | (CmpVT != MVT::i16 || !(Mask & 0x8000))) || |
| 6295 | CmpVT == MVT::i32 || |
| 6296 | hasNoSignFlagUses(Flags: SDValue(Node, 0)))) { |
| 6297 | // For example, "testq %rax, $268468232" to "testl %eax, $268468232". |
| 6298 | // NOTE: We only want to run that transform if N0 is 32 or 64 bits. |
// Otherwise, we find ourselves in a position where we have to do
| 6300 | // promotion. If previous passes did not promote the and, we assume |
| 6301 | // they had a good reason not to and do not promote here. |
| 6302 | VT = MVT::i32; |
| 6303 | SubRegOp = X86::sub_32bit; |
| 6304 | ROpc = X86::TEST32ri; |
| 6305 | MOpc = X86::TEST32mi; |
| 6306 | } else { |
| 6307 | // No eligible transformation was found. |
| 6308 | break; |
| 6309 | } |
| 6310 | |
| 6311 | SDValue Imm = CurDAG->getTargetConstant(Val: Mask, DL: dl, VT); |
| 6312 | SDValue Reg = N0.getOperand(i: 0); |
| 6313 | |
| 6314 | // Emit a testl or testw. |
| 6315 | MachineSDNode *NewNode; |
| 6316 | SDValue Tmp0, Tmp1, Tmp2, Tmp3, Tmp4; |
| 6317 | if (tryFoldLoad(Root: Node, P: N0.getNode(), N: Reg, Base&: Tmp0, Scale&: Tmp1, Index&: Tmp2, Disp&: Tmp3, Segment&: Tmp4)) { |
| 6318 | if (auto *LoadN = dyn_cast<LoadSDNode>(Val: N0.getOperand(i: 0).getNode())) { |
| 6319 | if (!LoadN->isSimple()) { |
| 6320 | unsigned NumVolBits = LoadN->getValueType(ResNo: 0).getSizeInBits(); |
| 6321 | if ((MOpc == X86::TEST8mi && NumVolBits != 8) || |
| 6322 | (MOpc == X86::TEST16mi && NumVolBits != 16) || |
| 6323 | (MOpc == X86::TEST32mi && NumVolBits != 32)) |
| 6324 | break; |
| 6325 | } |
| 6326 | } |
| 6327 | SDValue Ops[] = { Tmp0, Tmp1, Tmp2, Tmp3, Tmp4, Imm, |
| 6328 | Reg.getOperand(i: 0) }; |
| 6329 | NewNode = CurDAG->getMachineNode(Opcode: MOpc, dl, VT1: MVT::i32, VT2: MVT::Other, Ops); |
| 6330 | // Update the chain. |
| 6331 | ReplaceUses(F: Reg.getValue(R: 1), T: SDValue(NewNode, 1)); |
| 6332 | // Record the mem-refs |
| 6333 | CurDAG->setNodeMemRefs(N: NewNode, |
| 6334 | NewMemRefs: {cast<LoadSDNode>(Val&: Reg)->getMemOperand()}); |
| 6335 | } else { |
| 6336 | // Extract the subregister if necessary. |
| 6337 | if (N0.getValueType() != VT) |
| 6338 | Reg = CurDAG->getTargetExtractSubreg(SRIdx: SubRegOp, DL: dl, VT, Operand: Reg); |
| 6339 | |
| 6340 | NewNode = CurDAG->getMachineNode(Opcode: ROpc, dl, VT: MVT::i32, Op1: Reg, Op2: Imm); |
| 6341 | } |
| 6342 | // Replace CMP with TEST. |
| 6343 | ReplaceNode(F: Node, T: NewNode); |
| 6344 | return; |
| 6345 | } |
| 6346 | break; |
| 6347 | } |
| 6348 | case X86ISD::PCMPISTR: { |
| 6349 | if (!Subtarget->hasSSE42()) |
| 6350 | break; |
| 6351 | |
| 6352 | bool NeedIndex = !SDValue(Node, 0).use_empty(); |
| 6353 | bool NeedMask = !SDValue(Node, 1).use_empty(); |
| 6354 | // We can't fold a load if we are going to make two instructions. |
| 6355 | bool MayFoldLoad = !NeedIndex || !NeedMask; |
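// Folding the same load into both the PCMPISTRI and PCMPISTRM forms
// would perform the memory access twice, so the load is only folded
// when a single instruction suffices.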
| 6356 | |
| 6357 | MachineSDNode *CNode; |
| 6358 | if (NeedMask) { |
| 6359 | unsigned ROpc = |
| 6360 | Subtarget->hasAVX() ? X86::VPCMPISTRMrri : X86::PCMPISTRMrri; |
| 6361 | unsigned MOpc = |
| 6362 | Subtarget->hasAVX() ? X86::VPCMPISTRMrmi : X86::PCMPISTRMrmi; |
| 6363 | CNode = emitPCMPISTR(ROpc, MOpc, MayFoldLoad, dl, VT: MVT::v16i8, Node); |
| 6364 | ReplaceUses(F: SDValue(Node, 1), T: SDValue(CNode, 0)); |
| 6365 | } |
| 6366 | if (NeedIndex || !NeedMask) { |
| 6367 | unsigned ROpc = |
| 6368 | Subtarget->hasAVX() ? X86::VPCMPISTRIrri : X86::PCMPISTRIrri; |
| 6369 | unsigned MOpc = |
| 6370 | Subtarget->hasAVX() ? X86::VPCMPISTRIrmi : X86::PCMPISTRIrmi; |
| 6371 | CNode = emitPCMPISTR(ROpc, MOpc, MayFoldLoad, dl, VT: MVT::i32, Node); |
| 6372 | ReplaceUses(F: SDValue(Node, 0), T: SDValue(CNode, 0)); |
| 6373 | } |
| 6374 | |
| 6375 | // Connect the flag usage to the last instruction created. |
| 6376 | ReplaceUses(F: SDValue(Node, 2), T: SDValue(CNode, 1)); |
| 6377 | CurDAG->RemoveDeadNode(N: Node); |
| 6378 | return; |
| 6379 | } |
| 6380 | case X86ISD::PCMPESTR: { |
| 6381 | if (!Subtarget->hasSSE42()) |
| 6382 | break; |
| 6383 | |
| 6384 | // Copy the two implicit register inputs. |
| 6385 | SDValue InGlue = CurDAG->getCopyToReg(Chain: CurDAG->getEntryNode(), dl, Reg: X86::EAX, |
| 6386 | N: Node->getOperand(Num: 1), |
| 6387 | Glue: SDValue()).getValue(R: 1); |
| 6388 | InGlue = CurDAG->getCopyToReg(Chain: CurDAG->getEntryNode(), dl, Reg: X86::EDX, |
| 6389 | N: Node->getOperand(Num: 3), Glue: InGlue).getValue(R: 1); |
| 6390 | |
| 6391 | bool NeedIndex = !SDValue(Node, 0).use_empty(); |
| 6392 | bool NeedMask = !SDValue(Node, 1).use_empty(); |
| 6393 | // We can't fold a load if we are going to make two instructions. |
| 6394 | bool MayFoldLoad = !NeedIndex || !NeedMask; |
| 6395 | |
| 6396 | MachineSDNode *CNode; |
| 6397 | if (NeedMask) { |
| 6398 | unsigned ROpc = |
| 6399 | Subtarget->hasAVX() ? X86::VPCMPESTRMrri : X86::PCMPESTRMrri; |
| 6400 | unsigned MOpc = |
| 6401 | Subtarget->hasAVX() ? X86::VPCMPESTRMrmi : X86::PCMPESTRMrmi; |
| 6402 | CNode = |
| 6403 | emitPCMPESTR(ROpc, MOpc, MayFoldLoad, dl, VT: MVT::v16i8, Node, InGlue); |
| 6404 | ReplaceUses(F: SDValue(Node, 1), T: SDValue(CNode, 0)); |
| 6405 | } |
| 6406 | if (NeedIndex || !NeedMask) { |
| 6407 | unsigned ROpc = |
| 6408 | Subtarget->hasAVX() ? X86::VPCMPESTRIrri : X86::PCMPESTRIrri; |
| 6409 | unsigned MOpc = |
| 6410 | Subtarget->hasAVX() ? X86::VPCMPESTRIrmi : X86::PCMPESTRIrmi; |
| 6411 | CNode = emitPCMPESTR(ROpc, MOpc, MayFoldLoad, dl, VT: MVT::i32, Node, InGlue); |
| 6412 | ReplaceUses(F: SDValue(Node, 0), T: SDValue(CNode, 0)); |
| 6413 | } |
| 6414 | // Connect the flag usage to the last instruction created. |
| 6415 | ReplaceUses(F: SDValue(Node, 2), T: SDValue(CNode, 1)); |
| 6416 | CurDAG->RemoveDeadNode(N: Node); |
| 6417 | return; |
| 6418 | } |
| 6419 | |
| 6420 | case ISD::SETCC: { |
| 6421 | if (NVT.isVector() && tryVPTESTM(Root: Node, Setcc: SDValue(Node, 0), InMask: SDValue())) |
| 6422 | return; |
| 6423 | |
| 6424 | break; |
| 6425 | } |
| 6426 | |
| 6427 | case ISD::STORE: |
| 6428 | if (foldLoadStoreIntoMemOperand(Node)) |
| 6429 | return; |
| 6430 | break; |
| 6431 | |
| 6432 | case X86ISD::SETCC_CARRY: { |
| 6433 | MVT VT = Node->getSimpleValueType(ResNo: 0); |
| 6434 | SDValue Result; |
| 6435 | if (Subtarget->hasSBBDepBreaking()) { |
| 6436 | // We have to do this manually because tblgen will put the eflags copy in |
| 6437 | // the wrong place if we use an extract_subreg in the pattern. |
| 6438 | // Copy flags to the EFLAGS register and glue it to next node. |
| 6439 | SDValue EFLAGS = |
| 6440 | CurDAG->getCopyToReg(Chain: CurDAG->getEntryNode(), dl, Reg: X86::EFLAGS, |
| 6441 | N: Node->getOperand(Num: 1), Glue: SDValue()); |
| 6442 | |
| 6443 | // Create a 64-bit instruction if the result is 64-bits otherwise use the |
| 6444 | // 32-bit version. |
| 6445 | unsigned Opc = VT == MVT::i64 ? X86::SETB_C64r : X86::SETB_C32r; |
| 6446 | MVT SetVT = VT == MVT::i64 ? MVT::i64 : MVT::i32; |
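// SETB_C32r/SETB_C64r are pseudos that lower to "sbb reg, reg", turning
// the carry flag into 0 or all-ones in a single instruction.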
| 6447 | Result = SDValue( |
| 6448 | CurDAG->getMachineNode(Opcode: Opc, dl, VT: SetVT, Op1: EFLAGS, Op2: EFLAGS.getValue(R: 1)), |
| 6449 | 0); |
| 6450 | } else { |
| 6451 | // The target does not recognize sbb with the same reg operand as a |
| 6452 | // no-source idiom, so we explicitly zero the input values. |
| 6453 | Result = getSBBZero(N: Node); |
| 6454 | } |
| 6455 | |
| 6456 | // For less than 32-bits we need to extract from the 32-bit node. |
| 6457 | if (VT == MVT::i8 || VT == MVT::i16) { |
| 6458 | int SubIndex = VT == MVT::i16 ? X86::sub_16bit : X86::sub_8bit; |
| 6459 | Result = CurDAG->getTargetExtractSubreg(SRIdx: SubIndex, DL: dl, VT, Operand: Result); |
| 6460 | } |
| 6461 | |
| 6462 | ReplaceUses(F: SDValue(Node, 0), T: Result); |
| 6463 | CurDAG->RemoveDeadNode(N: Node); |
| 6464 | return; |
| 6465 | } |
  case X86ISD::SBB: {
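    // An SBB whose value inputs are both zero computes 0 - 0 - CF, i.e. 0 or
    // -1 depending on the incoming carry (e.g. with CF=1, 'sbb eax, eax'
    // yields 0xFFFFFFFF), so it can reuse the getSBBZero lowering from above.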
    if (isNullConstant(Node->getOperand(0)) &&
        isNullConstant(Node->getOperand(1))) {
      SDValue Result = getSBBZero(Node);

      // Replace the flag use.
      ReplaceUses(SDValue(Node, 1), Result.getValue(1));

      // Replace the result use.
      if (!SDValue(Node, 0).use_empty()) {
        // For results narrower than 32 bits we need to extract from the
        // 32-bit node.
        MVT VT = Node->getSimpleValueType(0);
        if (VT == MVT::i8 || VT == MVT::i16) {
          int SubIndex = VT == MVT::i16 ? X86::sub_16bit : X86::sub_8bit;
          Result = CurDAG->getTargetExtractSubreg(SubIndex, dl, VT, Result);
        }
        ReplaceUses(SDValue(Node, 0), Result);
      }

      CurDAG->RemoveDeadNode(Node);
      return;
    }
    break;
  }
  case X86ISD::MGATHER: {
    auto *Mgt = cast<X86MaskedGatherSDNode>(Node);
    SDValue IndexOp = Mgt->getIndex();
    SDValue Mask = Mgt->getMask();
    MVT IndexVT = IndexOp.getSimpleValueType();
    MVT ValueVT = Node->getSimpleValueType(0);
    MVT MaskVT = Mask.getSimpleValueType();

    // This is just to prevent crashes if the nodes are malformed somehow.
    // Otherwise we only do loose type checking here, based on what a type
    // constraint would say, just like table-based isel.
    if (!ValueVT.isVector() || !MaskVT.isVector())
      break;

    unsigned NumElts = ValueVT.getVectorNumElements();
    MVT ValueSVT = ValueVT.getVectorElementType();

    bool IsFP = ValueSVT.isFloatingPoint();
    unsigned EltSize = ValueSVT.getSizeInBits();

    unsigned Opc = 0;
    bool AVX512Gather = MaskVT.getVectorElementType() == MVT::i1;
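    // An i1 mask element means the mask lives in a k-register, i.e. this is
    // an AVX-512 gather; AVX/AVX2 gathers instead take the mask in a vector
    // register matching the gathered value type (asserted below).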
    if (AVX512Gather) {
      if (IndexVT == MVT::v4i32 && NumElts == 4 && EltSize == 32)
        Opc = IsFP ? X86::VGATHERDPSZ128rm : X86::VPGATHERDDZ128rm;
      else if (IndexVT == MVT::v8i32 && NumElts == 8 && EltSize == 32)
        Opc = IsFP ? X86::VGATHERDPSZ256rm : X86::VPGATHERDDZ256rm;
      else if (IndexVT == MVT::v16i32 && NumElts == 16 && EltSize == 32)
        Opc = IsFP ? X86::VGATHERDPSZrm : X86::VPGATHERDDZrm;
      else if (IndexVT == MVT::v4i32 && NumElts == 2 && EltSize == 64)
        Opc = IsFP ? X86::VGATHERDPDZ128rm : X86::VPGATHERDQZ128rm;
      else if (IndexVT == MVT::v4i32 && NumElts == 4 && EltSize == 64)
        Opc = IsFP ? X86::VGATHERDPDZ256rm : X86::VPGATHERDQZ256rm;
      else if (IndexVT == MVT::v8i32 && NumElts == 8 && EltSize == 64)
        Opc = IsFP ? X86::VGATHERDPDZrm : X86::VPGATHERDQZrm;
      else if (IndexVT == MVT::v2i64 && NumElts == 4 && EltSize == 32)
        Opc = IsFP ? X86::VGATHERQPSZ128rm : X86::VPGATHERQDZ128rm;
      else if (IndexVT == MVT::v4i64 && NumElts == 4 && EltSize == 32)
        Opc = IsFP ? X86::VGATHERQPSZ256rm : X86::VPGATHERQDZ256rm;
      else if (IndexVT == MVT::v8i64 && NumElts == 8 && EltSize == 32)
        Opc = IsFP ? X86::VGATHERQPSZrm : X86::VPGATHERQDZrm;
      else if (IndexVT == MVT::v2i64 && NumElts == 2 && EltSize == 64)
        Opc = IsFP ? X86::VGATHERQPDZ128rm : X86::VPGATHERQQZ128rm;
      else if (IndexVT == MVT::v4i64 && NumElts == 4 && EltSize == 64)
        Opc = IsFP ? X86::VGATHERQPDZ256rm : X86::VPGATHERQQZ256rm;
      else if (IndexVT == MVT::v8i64 && NumElts == 8 && EltSize == 64)
        Opc = IsFP ? X86::VGATHERQPDZrm : X86::VPGATHERQQZrm;
    } else {
      assert(EVT(MaskVT) == EVT(ValueVT).changeVectorElementTypeToInteger() &&
             "Unexpected mask VT!");
      if (IndexVT == MVT::v4i32 && NumElts == 4 && EltSize == 32)
        Opc = IsFP ? X86::VGATHERDPSrm : X86::VPGATHERDDrm;
      else if (IndexVT == MVT::v8i32 && NumElts == 8 && EltSize == 32)
        Opc = IsFP ? X86::VGATHERDPSYrm : X86::VPGATHERDDYrm;
      else if (IndexVT == MVT::v4i32 && NumElts == 2 && EltSize == 64)
        Opc = IsFP ? X86::VGATHERDPDrm : X86::VPGATHERDQrm;
      else if (IndexVT == MVT::v4i32 && NumElts == 4 && EltSize == 64)
        Opc = IsFP ? X86::VGATHERDPDYrm : X86::VPGATHERDQYrm;
      else if (IndexVT == MVT::v2i64 && NumElts == 4 && EltSize == 32)
        Opc = IsFP ? X86::VGATHERQPSrm : X86::VPGATHERQDrm;
      else if (IndexVT == MVT::v4i64 && NumElts == 4 && EltSize == 32)
        Opc = IsFP ? X86::VGATHERQPSYrm : X86::VPGATHERQDYrm;
      else if (IndexVT == MVT::v2i64 && NumElts == 2 && EltSize == 64)
        Opc = IsFP ? X86::VGATHERQPDrm : X86::VPGATHERQQrm;
      else if (IndexVT == MVT::v4i64 && NumElts == 4 && EltSize == 64)
        Opc = IsFP ? X86::VGATHERQPDYrm : X86::VPGATHERQQYrm;
    }

    if (!Opc)
      break;

    SDValue Base, Scale, Index, Disp, Segment;
    if (!selectVectorAddr(Mgt, Mgt->getBasePtr(), IndexOp, Mgt->getScale(),
                          Base, Scale, Index, Disp, Segment))
      break;

    SDValue PassThru = Mgt->getPassThru();
    SDValue Chain = Mgt->getChain();
    // Gather instructions have a mask output not in the ISD node.
    SDVTList VTs = CurDAG->getVTList(ValueVT, MaskVT, MVT::Other);

    MachineSDNode *NewNode;
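    // The AVX-512 and AVX/AVX2 forms order their operands differently: the
    // AVX-512 instructions take the k-register mask before the memory
    // operands, while the AVX/AVX2 ones take the vector mask after them.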
    if (AVX512Gather) {
      SDValue Ops[] = {PassThru, Mask, Base, Scale,
                       Index, Disp, Segment, Chain};
      NewNode = CurDAG->getMachineNode(Opc, SDLoc(dl), VTs, Ops);
    } else {
      SDValue Ops[] = {PassThru, Base, Scale, Index,
                       Disp, Segment, Mask, Chain};
      NewNode = CurDAG->getMachineNode(Opc, SDLoc(dl), VTs, Ops);
    }
    CurDAG->setNodeMemRefs(NewNode, {Mgt->getMemOperand()});
    ReplaceUses(SDValue(Node, 0), SDValue(NewNode, 0));
    ReplaceUses(SDValue(Node, 1), SDValue(NewNode, 2));
    CurDAG->RemoveDeadNode(Node);
    return;
  }
  case X86ISD::MSCATTER: {
    auto *Sc = cast<X86MaskedScatterSDNode>(Node);
    SDValue Value = Sc->getValue();
    SDValue IndexOp = Sc->getIndex();
    MVT IndexVT = IndexOp.getSimpleValueType();
    MVT ValueVT = Value.getSimpleValueType();

    // This is just to prevent crashes if the nodes are malformed somehow.
    // Otherwise we only do loose type checking here, based on what a type
    // constraint would say, just like table-based isel.
    if (!ValueVT.isVector())
      break;

    unsigned NumElts = ValueVT.getVectorNumElements();
    MVT ValueSVT = ValueVT.getVectorElementType();

    bool IsFP = ValueSVT.isFloatingPoint();
    unsigned EltSize = ValueSVT.getSizeInBits();

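    // Scatters only exist in AVX-512 form, so unlike the gather case above
    // the opcode is chosen purely from the index width, element count and
    // element size.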
    unsigned Opc;
    if (IndexVT == MVT::v4i32 && NumElts == 4 && EltSize == 32)
      Opc = IsFP ? X86::VSCATTERDPSZ128mr : X86::VPSCATTERDDZ128mr;
    else if (IndexVT == MVT::v8i32 && NumElts == 8 && EltSize == 32)
      Opc = IsFP ? X86::VSCATTERDPSZ256mr : X86::VPSCATTERDDZ256mr;
    else if (IndexVT == MVT::v16i32 && NumElts == 16 && EltSize == 32)
      Opc = IsFP ? X86::VSCATTERDPSZmr : X86::VPSCATTERDDZmr;
    else if (IndexVT == MVT::v4i32 && NumElts == 2 && EltSize == 64)
      Opc = IsFP ? X86::VSCATTERDPDZ128mr : X86::VPSCATTERDQZ128mr;
    else if (IndexVT == MVT::v4i32 && NumElts == 4 && EltSize == 64)
      Opc = IsFP ? X86::VSCATTERDPDZ256mr : X86::VPSCATTERDQZ256mr;
    else if (IndexVT == MVT::v8i32 && NumElts == 8 && EltSize == 64)
      Opc = IsFP ? X86::VSCATTERDPDZmr : X86::VPSCATTERDQZmr;
    else if (IndexVT == MVT::v2i64 && NumElts == 4 && EltSize == 32)
      Opc = IsFP ? X86::VSCATTERQPSZ128mr : X86::VPSCATTERQDZ128mr;
    else if (IndexVT == MVT::v4i64 && NumElts == 4 && EltSize == 32)
      Opc = IsFP ? X86::VSCATTERQPSZ256mr : X86::VPSCATTERQDZ256mr;
    else if (IndexVT == MVT::v8i64 && NumElts == 8 && EltSize == 32)
      Opc = IsFP ? X86::VSCATTERQPSZmr : X86::VPSCATTERQDZmr;
    else if (IndexVT == MVT::v2i64 && NumElts == 2 && EltSize == 64)
      Opc = IsFP ? X86::VSCATTERQPDZ128mr : X86::VPSCATTERQQZ128mr;
    else if (IndexVT == MVT::v4i64 && NumElts == 4 && EltSize == 64)
      Opc = IsFP ? X86::VSCATTERQPDZ256mr : X86::VPSCATTERQQZ256mr;
    else if (IndexVT == MVT::v8i64 && NumElts == 8 && EltSize == 64)
      Opc = IsFP ? X86::VSCATTERQPDZmr : X86::VPSCATTERQQZmr;
    else
      break;

    SDValue Base, Scale, Index, Disp, Segment;
    if (!selectVectorAddr(Sc, Sc->getBasePtr(), IndexOp, Sc->getScale(),
                          Base, Scale, Index, Disp, Segment))
      break;

    SDValue Mask = Sc->getMask();
    SDValue Chain = Sc->getChain();
    // Scatter instructions have a mask output not in the ISD node.
    SDVTList VTs = CurDAG->getVTList(Mask.getValueType(), MVT::Other);
    SDValue Ops[] = {Base, Scale, Index, Disp, Segment, Mask, Value, Chain};

    MachineSDNode *NewNode = CurDAG->getMachineNode(Opc, SDLoc(dl), VTs, Ops);
    CurDAG->setNodeMemRefs(NewNode, {Sc->getMemOperand()});
    ReplaceUses(SDValue(Node, 0), SDValue(NewNode, 1));
    CurDAG->RemoveDeadNode(Node);
    return;
  }
  case ISD::PREALLOCATED_SETUP: {
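    // Look up the id that call lowering recorded for this "preallocated"
    // call site; operand 1 is a SrcValue wrapping the call site Value.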
    auto *MFI = CurDAG->getMachineFunction().getInfo<X86MachineFunctionInfo>();
    auto CallId = MFI->getPreallocatedIdForCallSite(
        cast<SrcValueSDNode>(Node->getOperand(1))->getValue());
    SDValue Chain = Node->getOperand(0);
    SDValue CallIdValue = CurDAG->getTargetConstant(CallId, dl, MVT::i32);
    MachineSDNode *New = CurDAG->getMachineNode(
        TargetOpcode::PREALLOCATED_SETUP, dl, MVT::Other, CallIdValue, Chain);
    ReplaceUses(SDValue(Node, 0), SDValue(New, 0)); // Chain
    CurDAG->RemoveDeadNode(Node);
    return;
  }
  case ISD::PREALLOCATED_ARG: {
    auto *MFI = CurDAG->getMachineFunction().getInfo<X86MachineFunctionInfo>();
    auto CallId = MFI->getPreallocatedIdForCallSite(
        cast<SrcValueSDNode>(Node->getOperand(1))->getValue());
    SDValue Chain = Node->getOperand(0);
    SDValue CallIdValue = CurDAG->getTargetConstant(CallId, dl, MVT::i32);
    SDValue ArgIndex = Node->getOperand(2);
    SDValue Ops[3];
    Ops[0] = CallIdValue;
    Ops[1] = ArgIndex;
    Ops[2] = Chain;
    MachineSDNode *New = CurDAG->getMachineNode(
        TargetOpcode::PREALLOCATED_ARG, dl,
        CurDAG->getVTList(TLI->getPointerTy(CurDAG->getDataLayout()),
                          MVT::Other),
        Ops);
    ReplaceUses(SDValue(Node, 0), SDValue(New, 0)); // Arg pointer
    ReplaceUses(SDValue(Node, 1), SDValue(New, 1)); // Chain
    CurDAG->RemoveDeadNode(Node);
    return;
  }
  case X86ISD::AESENCWIDE128KL:
  case X86ISD::AESDECWIDE128KL:
  case X86ISD::AESENCWIDE256KL:
  case X86ISD::AESDECWIDE256KL: {
    if (!Subtarget->hasWIDEKL())
      break;

    unsigned Opcode;
    switch (Node->getOpcode()) {
    default:
      llvm_unreachable("Unexpected opcode!");
    case X86ISD::AESENCWIDE128KL:
      Opcode = X86::AESENCWIDE128KL;
      break;
    case X86ISD::AESDECWIDE128KL:
      Opcode = X86::AESDECWIDE128KL;
      break;
    case X86ISD::AESENCWIDE256KL:
      Opcode = X86::AESENCWIDE256KL;
      break;
    case X86ISD::AESDECWIDE256KL:
      Opcode = X86::AESDECWIDE256KL;
      break;
    }

    SDValue Chain = Node->getOperand(0);
    SDValue Addr = Node->getOperand(1);

    SDValue Base, Scale, Index, Disp, Segment;
    if (!selectAddr(Node, Addr, Base, Scale, Index, Disp, Segment))
      break;

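    // The wide Key Locker instructions implicitly read and write their eight
    // data blocks in XMM0-XMM7, so copy the operands into those physical
    // registers, glueing the copies so they stay with the instruction.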
    Chain = CurDAG->getCopyToReg(Chain, dl, X86::XMM0, Node->getOperand(2),
                                 SDValue());
    Chain = CurDAG->getCopyToReg(Chain, dl, X86::XMM1, Node->getOperand(3),
                                 Chain.getValue(1));
    Chain = CurDAG->getCopyToReg(Chain, dl, X86::XMM2, Node->getOperand(4),
                                 Chain.getValue(1));
    Chain = CurDAG->getCopyToReg(Chain, dl, X86::XMM3, Node->getOperand(5),
                                 Chain.getValue(1));
    Chain = CurDAG->getCopyToReg(Chain, dl, X86::XMM4, Node->getOperand(6),
                                 Chain.getValue(1));
    Chain = CurDAG->getCopyToReg(Chain, dl, X86::XMM5, Node->getOperand(7),
                                 Chain.getValue(1));
    Chain = CurDAG->getCopyToReg(Chain, dl, X86::XMM6, Node->getOperand(8),
                                 Chain.getValue(1));
    Chain = CurDAG->getCopyToReg(Chain, dl, X86::XMM7, Node->getOperand(9),
                                 Chain.getValue(1));

    MachineSDNode *Res = CurDAG->getMachineNode(
        Opcode, dl, Node->getVTList(),
        {Base, Scale, Index, Disp, Segment, Chain, Chain.getValue(1)});
    CurDAG->setNodeMemRefs(Res, cast<MemSDNode>(Node)->getMemOperand());
    ReplaceNode(Node, Res);
    return;
  }
  case X86ISD::POP_FROM_X87_REG: {
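    // Replace the node with a plain CopyFromReg of the given x87 stack
    // register, forwarding the optional glue operand so the copy stays
    // ordered with the node that produced the register.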
    SDValue Chain = Node->getOperand(0);
    Register Reg = cast<RegisterSDNode>(Node->getOperand(1))->getReg();
    SDValue Glue;
    if (Node->getNumValues() == 3)
      Glue = Node->getOperand(2);
    SDValue Copy =
        CurDAG->getCopyFromReg(Chain, dl, Reg, Node->getValueType(0), Glue);
    ReplaceNode(Node, Copy.getNode());
    return;
  }
  }

  SelectCode(Node);
}

bool X86DAGToDAGISel::SelectInlineAsmMemoryOperand(
    const SDValue &Op, InlineAsm::ConstraintCode ConstraintID,
    std::vector<SDValue> &OutOps) {
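  // Per the SelectionDAGISel hook's convention, returning true reports a
  // failure to select the operand; returning false indicates success.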
  SDValue Op0, Op1, Op2, Op3, Op4;
  switch (ConstraintID) {
  default:
    llvm_unreachable("Unexpected asm memory constraint");
  case InlineAsm::ConstraintCode::o: // offsetable ??
  case InlineAsm::ConstraintCode::v: // not offsetable ??
  case InlineAsm::ConstraintCode::m: // memory
  case InlineAsm::ConstraintCode::X:
  case InlineAsm::ConstraintCode::p: // address
    if (!selectAddr(nullptr, Op, Op0, Op1, Op2, Op3, Op4))
      return true;
    break;
  }

  OutOps.push_back(Op0);
  OutOps.push_back(Op1);
  OutOps.push_back(Op2);
  OutOps.push_back(Op3);
  OutOps.push_back(Op4);
  return false;
}

X86ISelDAGToDAGPass::X86ISelDAGToDAGPass(X86TargetMachine &TM)
    : SelectionDAGISelPass(
          std::make_unique<X86DAGToDAGISel>(TM, TM.getOptLevel())) {}

/// This pass converts a legalized DAG into an X86-specific DAG,
/// ready for instruction scheduling.
FunctionPass *llvm::createX86ISelDag(X86TargetMachine &TM,
                                     CodeGenOptLevel OptLevel) {
  return new X86DAGToDAGISelLegacy(TM, OptLevel);
}