1//===-- PPCISelLowering.cpp - PPC DAG Lowering Implementation -------------===//
2//
3// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4// See https://llvm.org/LICENSE.txt for license information.
5// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6//
7//===----------------------------------------------------------------------===//
8//
9// This file implements the PPCISelLowering class.
10//
11//===----------------------------------------------------------------------===//
12
13#include "PPCISelLowering.h"
14#include "MCTargetDesc/PPCMCTargetDesc.h"
15#include "MCTargetDesc/PPCPredicates.h"
16#include "PPC.h"
17#include "PPCCallingConv.h"
18#include "PPCFrameLowering.h"
19#include "PPCInstrInfo.h"
20#include "PPCMachineFunctionInfo.h"
21#include "PPCPerfectShuffle.h"
22#include "PPCRegisterInfo.h"
23#include "PPCSelectionDAGInfo.h"
24#include "PPCSubtarget.h"
25#include "PPCTargetMachine.h"
26#include "llvm/ADT/APFloat.h"
27#include "llvm/ADT/APInt.h"
28#include "llvm/ADT/APSInt.h"
29#include "llvm/ADT/ArrayRef.h"
30#include "llvm/ADT/DenseMap.h"
31#include "llvm/ADT/STLExtras.h"
32#include "llvm/ADT/SmallPtrSet.h"
33#include "llvm/ADT/SmallVector.h"
34#include "llvm/ADT/Statistic.h"
35#include "llvm/ADT/StringRef.h"
36#include "llvm/CodeGen/CallingConvLower.h"
37#include "llvm/CodeGen/ISDOpcodes.h"
38#include "llvm/CodeGen/LivePhysRegs.h"
39#include "llvm/CodeGen/MachineBasicBlock.h"
40#include "llvm/CodeGen/MachineFrameInfo.h"
41#include "llvm/CodeGen/MachineFunction.h"
42#include "llvm/CodeGen/MachineInstr.h"
43#include "llvm/CodeGen/MachineInstrBuilder.h"
44#include "llvm/CodeGen/MachineJumpTableInfo.h"
45#include "llvm/CodeGen/MachineLoopInfo.h"
46#include "llvm/CodeGen/MachineMemOperand.h"
47#include "llvm/CodeGen/MachineModuleInfo.h"
48#include "llvm/CodeGen/MachineOperand.h"
49#include "llvm/CodeGen/MachineRegisterInfo.h"
50#include "llvm/CodeGen/SelectionDAG.h"
51#include "llvm/CodeGen/SelectionDAGNodes.h"
52#include "llvm/CodeGen/TargetInstrInfo.h"
53#include "llvm/CodeGen/TargetLowering.h"
54#include "llvm/CodeGen/TargetLoweringObjectFileImpl.h"
55#include "llvm/CodeGen/TargetRegisterInfo.h"
56#include "llvm/CodeGen/ValueTypes.h"
57#include "llvm/CodeGenTypes/MachineValueType.h"
58#include "llvm/IR/CallingConv.h"
59#include "llvm/IR/Constant.h"
60#include "llvm/IR/Constants.h"
61#include "llvm/IR/DataLayout.h"
62#include "llvm/IR/DebugLoc.h"
63#include "llvm/IR/DerivedTypes.h"
64#include "llvm/IR/Function.h"
65#include "llvm/IR/GlobalValue.h"
66#include "llvm/IR/IRBuilder.h"
67#include "llvm/IR/Instructions.h"
68#include "llvm/IR/Intrinsics.h"
69#include "llvm/IR/IntrinsicsPowerPC.h"
70#include "llvm/IR/Module.h"
71#include "llvm/IR/Type.h"
72#include "llvm/IR/Use.h"
73#include "llvm/IR/Value.h"
74#include "llvm/MC/MCContext.h"
75#include "llvm/MC/MCExpr.h"
76#include "llvm/MC/MCSectionXCOFF.h"
77#include "llvm/MC/MCSymbolXCOFF.h"
78#include "llvm/Support/AtomicOrdering.h"
79#include "llvm/Support/BranchProbability.h"
80#include "llvm/Support/Casting.h"
81#include "llvm/Support/CodeGen.h"
82#include "llvm/Support/CommandLine.h"
83#include "llvm/Support/Compiler.h"
84#include "llvm/Support/Debug.h"
85#include "llvm/Support/ErrorHandling.h"
86#include "llvm/Support/Format.h"
87#include "llvm/Support/KnownBits.h"
88#include "llvm/Support/MathExtras.h"
89#include "llvm/Support/raw_ostream.h"
90#include "llvm/Target/TargetMachine.h"
91#include "llvm/Target/TargetOptions.h"
92#include <algorithm>
93#include <cassert>
94#include <cstdint>
95#include <iterator>
96#include <list>
97#include <optional>
98#include <utility>
99#include <vector>
100
101using namespace llvm;
102
103#define DEBUG_TYPE "ppc-lowering"
104
// Hidden boolean command-line option named "disable-p10-store-forward".
// Per its description it disables the P10 store forward-friendly conversion.
// The explicit initial value is false.
105static cl::opt<bool> DisableP10StoreForward(
106 "disable-p10-store-forward",
107 cl::desc("disable P10 store forward-friendly conversion"), cl::Hidden,
108 cl::init(Val: false));
109
// Hidden boolean command-line option named "disable-ppc-preinc".
// Per its description it disables preincrement load/store generation on PPC.
// No explicit initial value (cl::init) is supplied here.
110static cl::opt<bool> DisablePPCPreinc("disable-ppc-preinc",
111cl::desc("disable preincrement load/store generation on PPC"), cl::Hidden);
112
// Hidden boolean command-line option named "disable-ppc-ilp-pref".
// Per its description it disables setting the node scheduling preference to
// ILP on PPC. No explicit initial value (cl::init) is supplied here.
113static cl::opt<bool> DisableILPPref("disable-ppc-ilp-pref",
114cl::desc("disable setting the node scheduling preference to ILP on PPC"), cl::Hidden);
115
// Hidden boolean command-line option named "disable-ppc-unaligned".
// Per its description it disables unaligned load/store generation on PPC.
// No explicit initial value (cl::init) is supplied here.
116static cl::opt<bool> DisablePPCUnaligned("disable-ppc-unaligned",
117cl::desc("disable unaligned load/store generation on PPC"), cl::Hidden);
118
// Hidden boolean command-line option named "disable-ppc-sco".
// Per its description it disables sibling call optimization ("SCO") on PPC.
// No explicit initial value (cl::init) is supplied here.
119static cl::opt<bool> DisableSCO("disable-ppc-sco",
120cl::desc("disable sibling call optimization on ppc"), cl::Hidden);
121
// Hidden boolean command-line option named
// "disable-ppc-innermost-loop-align32". Per its description, setting it means
// innermost loops are not always aligned to 32 bytes on PPC.
// No explicit initial value (cl::init) is supplied here.
122static cl::opt<bool> DisableInnermostLoopAlign32("disable-ppc-innermost-loop-align32",
123cl::desc("don't always align innermost loop to 32 bytes on ppc"), cl::Hidden);
124
// Hidden boolean command-line option named "ppc-use-absolute-jumptables".
// Per its description it requests absolute jump tables on PPC.
// No explicit initial value (cl::init) is supplied here.
125static cl::opt<bool> UseAbsoluteJumpTables("ppc-use-absolute-jumptables",
126cl::desc("use absolute jump tables on ppc"), cl::Hidden);
127
// Hidden boolean command-line option named "ppc-disable-perfect-shuffle".
// Per its description it disables vector permute decomposition. Note that the
// explicit initial value is true, i.e. the "disable" flag starts out set.
128static cl::opt<bool>
129 DisablePerfectShuffle("ppc-disable-perfect-shuffle",
130 cl::desc("disable vector permute decomposition"),
131 cl::init(Val: true), cl::Hidden);
132
// Hidden boolean command-line option named "disable-auto-paired-vec-st".
// Per its description it disables automatically generated 32-byte paired
// vector stores. The explicit initial value is true. Unlike the neighbouring
// options this one is not declared static.
// NOTE(review): presumably it is referenced from another file via an extern
// declaration -- confirm before changing its linkage.
133cl::opt<bool> DisableAutoPairedVecSt(
134 "disable-auto-paired-vec-st",
135 cl::desc("disable automatically generated 32byte paired vector stores"),
136 cl::init(Val: true), cl::Hidden);
137
// Hidden unsigned command-line option named "ppc-min-jump-table-entries".
// Per its description it sets the minimum number of entries needed to use a
// jump table on PPC. The explicit initial value is 64.
138static cl::opt<unsigned> PPCMinimumJumpTableEntries(
139 "ppc-min-jump-table-entries", cl::init(Val: 64), cl::Hidden,
140 cl::desc("Set minimum number of entries to use a jump table on PPC"));
141
// Hidden unsigned command-line option named "ppc-min-bit-test-cmps".
// Per its description it sets the minimum of the largest number of
// comparisons for which a bit test is used when lowering a switch on PPC.
// The explicit initial value is 3.
142static cl::opt<unsigned> PPCMinimumBitTestCmps(
143 "ppc-min-bit-test-cmps", cl::init(Val: 3), cl::Hidden,
144 cl::desc("Set minimum of largest number of comparisons to use bit test for "
145 "switch on PPC."));
146
// Hidden unsigned command-line option named "ppc-gather-alias-max-depth".
// Per its description it bounds the depth used when checking alias info in
// GatherAllAliases(). The explicit initial value is 18.
147static cl::opt<unsigned> PPCGatherAllAliasesMaxDepth(
148 "ppc-gather-alias-max-depth", cl::init(Val: 18), cl::Hidden,
149 cl::desc("max depth when checking alias info in GatherAllAliases()"));
150
// Hidden unsigned command-line option named
// "ppc-aix-shared-lib-tls-model-opt-limit". Per its description it is the
// inclusive limit on the number of TLS local-dynamic accesses in a function
// for which initial-exec is used instead. The explicit initial value is 1.
151static cl::opt<unsigned> PPCAIXTLSModelOptUseIEForLDLimit(
152 "ppc-aix-shared-lib-tls-model-opt-limit", cl::init(Val: 1), cl::Hidden,
153 cl::desc("Set inclusive limit count of TLS local-dynamic access(es) in a "
154 "function to use initial-exec"));
155
// Statistic counters declared with the STATISTIC macro (llvm/ADT/Statistic.h
// is included above). Each invocation names a counter and supplies the
// description string for it: tail calls, sibling calls, shuffles lowered to
// VPERM/XXPERM, and probed dynamic stack allocations.
156STATISTIC(NumTailCalls, "Number of tail calls");
157STATISTIC(NumSiblingCalls, "Number of sibling calls");
158STATISTIC(ShufflesHandledWithVPERM,
159 "Number of shuffles lowered to a VPERM or XXPERM");
160STATISTIC(NumDynamicAllocaProbed, "Number of dynamic stack allocation probed");
161
// Forward declaration of a file-local (static) predicate so it can be used
// before its definition. The parameters are unnamed here.
// NOTE(review): presumably (shuffle node, element width in bytes, step) --
// confirm against the definition.
162static bool isNByteElemShuffleMask(ShuffleVectorSDNode *, unsigned, int);
163
// Forward declaration of a file-local (static) helper that takes the DAG, a
// vector value and a debug location and returns an SDValue.
// NOTE(review): presumably it returns a widened form of Vec -- confirm against
// the definition.
164static SDValue widenVec(SelectionDAG &DAG, SDValue Vec, const SDLoc &dl);
165
// Forward declaration of a file-local (static) helper operating on operand
// OpIdx of MI within block BB, with IsByte as a width selector and TII as the
// PPC instruction info.
// NOTE(review): going by the name, it sign-extends the operand when its
// extension state is not known -- confirm against the definition.
166static void signExtendOperandIfUnknown(MachineInstr &MI, MachineBasicBlock *BB,
167 unsigned OpIdx, bool IsByte,
168 const PPCInstrInfo *TII);
169
170// A faster local-[exec|dynamic] TLS access sequence (enabled with the
171// -maix-small-local-[exec|dynamic]-tls option) can be produced for TLS
172// variables; consistent with the IBM XL compiler, we apply a max size of
173// slightly under 32KB.
// The value 32751 is 17 less than 32768 (32 * 1024); the unit is presumably
// bytes, matching the "32KB" wording above.
174constexpr uint64_t AIXSmallTlsPolicySizeLimit = 32751;
175
176// FIXME: Remove this once the ANDI glue bug has been fixed!
// Declaration only (extern); the option object itself is defined elsewhere.
// When it is set and CR bits are in use, the PPCTargetLowering constructor
// marks ISD::TRUNCATE to i1 as Custom.
177extern cl::opt<bool> ANDIGlueBug;
178
179PPCTargetLowering::PPCTargetLowering(const PPCTargetMachine &TM,
180 const PPCSubtarget &STI)
181 : TargetLowering(TM, STI), Subtarget(STI) {
182 // Initialize map that relates the PPC addressing modes to the computed flags
183 // of a load/store instruction. The map is used to determine the optimal
184 // addressing mode when selecting load and stores.
185 initializeAddrModeMap();
186 // On PPC32/64, arguments smaller than 4/8 bytes are extended, so all
187 // arguments are at least 4/8 bytes aligned.
188 bool isPPC64 = Subtarget.isPPC64();
189 setMinStackArgumentAlignment(isPPC64 ? Align(8) : Align(4));
190 const MVT RegVT = Subtarget.getScalarIntVT();
191
192 // Set up the register classes.
193 addRegisterClass(VT: MVT::i32, RC: &PPC::GPRCRegClass);
194 if (!useSoftFloat()) {
195 if (hasSPE()) {
196 addRegisterClass(VT: MVT::f32, RC: &PPC::GPRCRegClass);
197 // EFPU2 APU only supports f32
198 if (!Subtarget.hasEFPU2())
199 addRegisterClass(VT: MVT::f64, RC: &PPC::SPERCRegClass);
200 } else {
201 addRegisterClass(VT: MVT::f32, RC: &PPC::F4RCRegClass);
202 addRegisterClass(VT: MVT::f64, RC: &PPC::F8RCRegClass);
203 }
204 }
205
206 setOperationAction(Op: ISD::UADDO, VT: RegVT, Action: Custom);
207 setOperationAction(Op: ISD::USUBO, VT: RegVT, Action: Custom);
208
209 // PowerPC uses addo_carry,subo_carry to propagate carry.
210 setOperationAction(Op: ISD::UADDO_CARRY, VT: RegVT, Action: Custom);
211 setOperationAction(Op: ISD::USUBO_CARRY, VT: RegVT, Action: Custom);
212
213 // On P10, the default lowering generates better code using the
214 // setbc instruction.
215 if (!Subtarget.hasP10Vector()) {
216 setOperationAction(Op: ISD::SSUBO, VT: MVT::i32, Action: Custom);
217 setOperationAction(Op: ISD::SADDO, VT: MVT::i32, Action: Custom);
218 if (isPPC64) {
219 setOperationAction(Op: ISD::SSUBO, VT: MVT::i64, Action: Custom);
220 setOperationAction(Op: ISD::SADDO, VT: MVT::i64, Action: Custom);
221 }
222 }
223
224 // Match BITREVERSE to customized fast code sequence in the td file.
225 setOperationAction(Op: ISD::BITREVERSE, VT: MVT::i32, Action: Legal);
226 setOperationAction(Op: ISD::BITREVERSE, VT: MVT::i64, Action: Legal);
227
228 // Sub-word ATOMIC_CMP_SWAP need to ensure that the input is zero-extended.
229 setOperationAction(Op: ISD::ATOMIC_CMP_SWAP, VT: MVT::i32, Action: Custom);
230
231 // Custom lower inline assembly to check for special registers.
232 setOperationAction(Op: ISD::INLINEASM, VT: MVT::Other, Action: Custom);
233 setOperationAction(Op: ISD::INLINEASM_BR, VT: MVT::Other, Action: Custom);
234
235 // PowerPC has an i16 but no i8 (or i1) SEXTLOAD.
236 for (MVT VT : MVT::integer_valuetypes()) {
237 setLoadExtAction(ExtType: ISD::SEXTLOAD, ValVT: VT, MemVT: MVT::i1, Action: Promote);
238 setLoadExtAction(ExtType: ISD::SEXTLOAD, ValVT: VT, MemVT: MVT::i8, Action: Expand);
239 }
240
241 setTruncStoreAction(ValVT: MVT::f128, MemVT: MVT::f16, Action: Expand);
242 setOperationAction(Op: ISD::FP_TO_FP16, VT: MVT::f128, Action: Expand);
243
244 if (Subtarget.isISA3_0()) {
245 setLoadExtAction(ExtType: ISD::EXTLOAD, ValVT: MVT::f128, MemVT: MVT::f16, Action: Legal);
246 setLoadExtAction(ExtType: ISD::EXTLOAD, ValVT: MVT::f64, MemVT: MVT::f16, Action: Legal);
247 setLoadExtAction(ExtType: ISD::EXTLOAD, ValVT: MVT::f32, MemVT: MVT::f16, Action: Legal);
248 setTruncStoreAction(ValVT: MVT::f64, MemVT: MVT::f16, Action: Legal);
249 setTruncStoreAction(ValVT: MVT::f32, MemVT: MVT::f16, Action: Legal);
250 } else {
251 // No extending loads from f16 or HW conversions back and forth.
252 setLoadExtAction(ExtType: ISD::EXTLOAD, ValVT: MVT::f128, MemVT: MVT::f16, Action: Expand);
253 setOperationAction(Op: ISD::FP16_TO_FP, VT: MVT::f128, Action: Expand);
254 setLoadExtAction(ExtType: ISD::EXTLOAD, ValVT: MVT::f64, MemVT: MVT::f16, Action: Expand);
255 setOperationAction(Op: ISD::FP16_TO_FP, VT: MVT::f64, Action: Expand);
256 setOperationAction(Op: ISD::FP_TO_FP16, VT: MVT::f64, Action: Expand);
257 setLoadExtAction(ExtType: ISD::EXTLOAD, ValVT: MVT::f32, MemVT: MVT::f16, Action: Expand);
258 setOperationAction(Op: ISD::FP16_TO_FP, VT: MVT::f32, Action: Expand);
259 setOperationAction(Op: ISD::FP_TO_FP16, VT: MVT::f32, Action: Expand);
260 setTruncStoreAction(ValVT: MVT::f64, MemVT: MVT::f16, Action: Expand);
261 setTruncStoreAction(ValVT: MVT::f32, MemVT: MVT::f16, Action: Expand);
262 }
263
264 setTruncStoreAction(ValVT: MVT::f64, MemVT: MVT::f32, Action: Expand);
265
266 // PowerPC has pre-inc load and store's.
267 setIndexedLoadAction(IdxModes: ISD::PRE_INC, VT: MVT::i1, Action: Legal);
268 setIndexedLoadAction(IdxModes: ISD::PRE_INC, VT: MVT::i8, Action: Legal);
269 setIndexedLoadAction(IdxModes: ISD::PRE_INC, VT: MVT::i16, Action: Legal);
270 setIndexedLoadAction(IdxModes: ISD::PRE_INC, VT: MVT::i32, Action: Legal);
271 setIndexedLoadAction(IdxModes: ISD::PRE_INC, VT: MVT::i64, Action: Legal);
272 setIndexedStoreAction(IdxModes: ISD::PRE_INC, VT: MVT::i1, Action: Legal);
273 setIndexedStoreAction(IdxModes: ISD::PRE_INC, VT: MVT::i8, Action: Legal);
274 setIndexedStoreAction(IdxModes: ISD::PRE_INC, VT: MVT::i16, Action: Legal);
275 setIndexedStoreAction(IdxModes: ISD::PRE_INC, VT: MVT::i32, Action: Legal);
276 setIndexedStoreAction(IdxModes: ISD::PRE_INC, VT: MVT::i64, Action: Legal);
277 if (!Subtarget.hasSPE()) {
278 setIndexedLoadAction(IdxModes: ISD::PRE_INC, VT: MVT::f32, Action: Legal);
279 setIndexedLoadAction(IdxModes: ISD::PRE_INC, VT: MVT::f64, Action: Legal);
280 setIndexedStoreAction(IdxModes: ISD::PRE_INC, VT: MVT::f32, Action: Legal);
281 setIndexedStoreAction(IdxModes: ISD::PRE_INC, VT: MVT::f64, Action: Legal);
282 }
283
284 if (Subtarget.useCRBits()) {
285 setOperationAction(Op: ISD::SIGN_EXTEND_INREG, VT: MVT::i1, Action: Expand);
286
287 if (isPPC64 || Subtarget.hasFPCVT()) {
288 setOperationAction(Op: ISD::STRICT_SINT_TO_FP, VT: MVT::i1, Action: Promote);
289 AddPromotedToType(Opc: ISD::STRICT_SINT_TO_FP, OrigVT: MVT::i1, DestVT: RegVT);
290 setOperationAction(Op: ISD::STRICT_UINT_TO_FP, VT: MVT::i1, Action: Promote);
291 AddPromotedToType(Opc: ISD::STRICT_UINT_TO_FP, OrigVT: MVT::i1, DestVT: RegVT);
292
293 setOperationAction(Op: ISD::SINT_TO_FP, VT: MVT::i1, Action: Promote);
294 AddPromotedToType(Opc: ISD::SINT_TO_FP, OrigVT: MVT::i1, DestVT: RegVT);
295 setOperationAction(Op: ISD::UINT_TO_FP, VT: MVT::i1, Action: Promote);
296 AddPromotedToType(Opc: ISD::UINT_TO_FP, OrigVT: MVT::i1, DestVT: RegVT);
297
298 setOperationAction(Op: ISD::STRICT_FP_TO_SINT, VT: MVT::i1, Action: Promote);
299 AddPromotedToType(Opc: ISD::STRICT_FP_TO_SINT, OrigVT: MVT::i1, DestVT: RegVT);
300 setOperationAction(Op: ISD::STRICT_FP_TO_UINT, VT: MVT::i1, Action: Promote);
301 AddPromotedToType(Opc: ISD::STRICT_FP_TO_UINT, OrigVT: MVT::i1, DestVT: RegVT);
302
303 setOperationAction(Op: ISD::FP_TO_SINT, VT: MVT::i1, Action: Promote);
304 AddPromotedToType(Opc: ISD::FP_TO_SINT, OrigVT: MVT::i1, DestVT: RegVT);
305 setOperationAction(Op: ISD::FP_TO_UINT, VT: MVT::i1, Action: Promote);
306 AddPromotedToType(Opc: ISD::FP_TO_UINT, OrigVT: MVT::i1, DestVT: RegVT);
307 } else {
308 setOperationAction(Op: ISD::STRICT_SINT_TO_FP, VT: MVT::i1, Action: Custom);
309 setOperationAction(Op: ISD::STRICT_UINT_TO_FP, VT: MVT::i1, Action: Custom);
310 setOperationAction(Op: ISD::SINT_TO_FP, VT: MVT::i1, Action: Custom);
311 setOperationAction(Op: ISD::UINT_TO_FP, VT: MVT::i1, Action: Custom);
312 }
313
314 // PowerPC does not support direct load/store of condition registers.
315 setOperationAction(Op: ISD::LOAD, VT: MVT::i1, Action: Custom);
316 setOperationAction(Op: ISD::STORE, VT: MVT::i1, Action: Custom);
317
318 // FIXME: Remove this once the ANDI glue bug is fixed:
319 if (ANDIGlueBug)
320 setOperationAction(Op: ISD::TRUNCATE, VT: MVT::i1, Action: Custom);
321
322 for (MVT VT : MVT::integer_valuetypes()) {
323 setLoadExtAction(ExtType: ISD::SEXTLOAD, ValVT: VT, MemVT: MVT::i1, Action: Promote);
324 setLoadExtAction(ExtType: ISD::ZEXTLOAD, ValVT: VT, MemVT: MVT::i1, Action: Promote);
325 setTruncStoreAction(ValVT: VT, MemVT: MVT::i1, Action: Expand);
326 }
327
328 addRegisterClass(VT: MVT::i1, RC: &PPC::CRBITRCRegClass);
329 }
330
331 // Expand ppcf128 to i32 by hand for the benefit of llvm-gcc bootstrap on
332 // PPC (the libcall is not available).
333 setOperationAction(Op: ISD::FP_TO_SINT, VT: MVT::ppcf128, Action: Custom);
334 setOperationAction(Op: ISD::FP_TO_UINT, VT: MVT::ppcf128, Action: Custom);
335 setOperationAction(Op: ISD::STRICT_FP_TO_SINT, VT: MVT::ppcf128, Action: Custom);
336 setOperationAction(Op: ISD::STRICT_FP_TO_UINT, VT: MVT::ppcf128, Action: Custom);
337
338 // We do not currently implement these libm ops for PowerPC.
339 setOperationAction(Op: ISD::FFLOOR, VT: MVT::ppcf128, Action: Expand);
340 setOperationAction(Op: ISD::FCEIL, VT: MVT::ppcf128, Action: Expand);
341 setOperationAction(Op: ISD::FTRUNC, VT: MVT::ppcf128, Action: Expand);
342 setOperationAction(Op: ISD::FRINT, VT: MVT::ppcf128, Action: Expand);
343 setOperationAction(Op: ISD::FNEARBYINT, VT: MVT::ppcf128, Action: Expand);
344 setOperationAction(Op: ISD::FREM, VT: MVT::ppcf128, Action: LibCall);
345
346 // PowerPC has no SREM/UREM instructions unless we are on P9
347 // On P9 we may use a hardware instruction to compute the remainder.
348 // When the result of both the remainder and the division is required it is
349 // more efficient to compute the remainder from the result of the division
350 // rather than use the remainder instruction. The instructions are legalized
351 // directly because the DivRemPairsPass performs the transformation at the IR
352 // level.
353 if (Subtarget.isISA3_0()) {
354 setOperationAction(Op: ISD::SREM, VT: MVT::i32, Action: Legal);
355 setOperationAction(Op: ISD::UREM, VT: MVT::i32, Action: Legal);
356 setOperationAction(Op: ISD::SREM, VT: MVT::i64, Action: Legal);
357 setOperationAction(Op: ISD::UREM, VT: MVT::i64, Action: Legal);
358 } else {
359 setOperationAction(Op: ISD::SREM, VT: MVT::i32, Action: Expand);
360 setOperationAction(Op: ISD::UREM, VT: MVT::i32, Action: Expand);
361 setOperationAction(Op: ISD::SREM, VT: MVT::i64, Action: Expand);
362 setOperationAction(Op: ISD::UREM, VT: MVT::i64, Action: Expand);
363 }
364
365 // Don't use SMUL_LOHI/UMUL_LOHI or SDIVREM/UDIVREM to lower SREM/UREM.
366 setOperationAction(Op: ISD::UMUL_LOHI, VT: MVT::i32, Action: Expand);
367 setOperationAction(Op: ISD::SMUL_LOHI, VT: MVT::i32, Action: Expand);
368 setOperationAction(Op: ISD::UMUL_LOHI, VT: MVT::i64, Action: Expand);
369 setOperationAction(Op: ISD::SMUL_LOHI, VT: MVT::i64, Action: Expand);
370 setOperationAction(Op: ISD::UDIVREM, VT: MVT::i32, Action: Expand);
371 setOperationAction(Op: ISD::SDIVREM, VT: MVT::i32, Action: Expand);
372 setOperationAction(Op: ISD::UDIVREM, VT: MVT::i64, Action: Expand);
373 setOperationAction(Op: ISD::SDIVREM, VT: MVT::i64, Action: Expand);
374
375 // Handle constrained floating-point operations of scalar.
376 // TODO: Handle SPE specific operation.
377 setOperationAction(Op: ISD::STRICT_FADD, VT: MVT::f32, Action: Legal);
378 setOperationAction(Op: ISD::STRICT_FSUB, VT: MVT::f32, Action: Legal);
379 setOperationAction(Op: ISD::STRICT_FMUL, VT: MVT::f32, Action: Legal);
380 setOperationAction(Op: ISD::STRICT_FDIV, VT: MVT::f32, Action: Legal);
381 setOperationAction(Op: ISD::STRICT_FP_ROUND, VT: MVT::f32, Action: Legal);
382
383 setOperationAction(Op: ISD::STRICT_FADD, VT: MVT::f64, Action: Legal);
384 setOperationAction(Op: ISD::STRICT_FSUB, VT: MVT::f64, Action: Legal);
385 setOperationAction(Op: ISD::STRICT_FMUL, VT: MVT::f64, Action: Legal);
386 setOperationAction(Op: ISD::STRICT_FDIV, VT: MVT::f64, Action: Legal);
387
388 if (!Subtarget.hasSPE()) {
389 setOperationAction(Op: ISD::STRICT_FMA, VT: MVT::f32, Action: Legal);
390 setOperationAction(Op: ISD::STRICT_FMA, VT: MVT::f64, Action: Legal);
391 }
392
393 if (Subtarget.hasVSX()) {
394 setOperationAction(Op: ISD::STRICT_FRINT, VT: MVT::f32, Action: Legal);
395 setOperationAction(Op: ISD::STRICT_FRINT, VT: MVT::f64, Action: Legal);
396 }
397
398 if (Subtarget.hasFSQRT()) {
399 setOperationAction(Op: ISD::STRICT_FSQRT, VT: MVT::f32, Action: Legal);
400 setOperationAction(Op: ISD::STRICT_FSQRT, VT: MVT::f64, Action: Legal);
401 }
402
403 if (Subtarget.hasFPRND()) {
404 setOperationAction(Op: ISD::STRICT_FFLOOR, VT: MVT::f32, Action: Legal);
405 setOperationAction(Op: ISD::STRICT_FCEIL, VT: MVT::f32, Action: Legal);
406 setOperationAction(Op: ISD::STRICT_FTRUNC, VT: MVT::f32, Action: Legal);
407 setOperationAction(Op: ISD::STRICT_FROUND, VT: MVT::f32, Action: Legal);
408
409 setOperationAction(Op: ISD::STRICT_FFLOOR, VT: MVT::f64, Action: Legal);
410 setOperationAction(Op: ISD::STRICT_FCEIL, VT: MVT::f64, Action: Legal);
411 setOperationAction(Op: ISD::STRICT_FTRUNC, VT: MVT::f64, Action: Legal);
412 setOperationAction(Op: ISD::STRICT_FROUND, VT: MVT::f64, Action: Legal);
413 }
414
415 // We don't support sin/cos/sqrt/fmod/pow
416 setOperationAction(Op: ISD::FSIN , VT: MVT::f64, Action: Expand);
417 setOperationAction(Op: ISD::FCOS , VT: MVT::f64, Action: Expand);
418 setOperationAction(Op: ISD::FSINCOS, VT: MVT::f64, Action: Expand);
419 setOperationAction(Op: ISD::FREM, VT: MVT::f64, Action: LibCall);
420 setOperationAction(Op: ISD::FPOW , VT: MVT::f64, Action: Expand);
421 setOperationAction(Op: ISD::FSIN , VT: MVT::f32, Action: Expand);
422 setOperationAction(Op: ISD::FCOS , VT: MVT::f32, Action: Expand);
423 setOperationAction(Op: ISD::FSINCOS, VT: MVT::f32, Action: Expand);
424 setOperationAction(Op: ISD::FREM, VT: MVT::f32, Action: LibCall);
425 setOperationAction(Op: ISD::FPOW , VT: MVT::f32, Action: Expand);
426
427 // MASS transformation for LLVM intrinsics with replicating fast-math flag
428 // to be consistent to PPCGenScalarMASSEntries pass
429 if (TM.getOptLevel() == CodeGenOptLevel::Aggressive) {
430 setOperationAction(Op: ISD::FSIN , VT: MVT::f64, Action: Custom);
431 setOperationAction(Op: ISD::FCOS , VT: MVT::f64, Action: Custom);
432 setOperationAction(Op: ISD::FPOW , VT: MVT::f64, Action: Custom);
433 setOperationAction(Op: ISD::FLOG, VT: MVT::f64, Action: Custom);
434 setOperationAction(Op: ISD::FLOG10, VT: MVT::f64, Action: Custom);
435 setOperationAction(Op: ISD::FEXP, VT: MVT::f64, Action: Custom);
436 setOperationAction(Op: ISD::FSIN , VT: MVT::f32, Action: Custom);
437 setOperationAction(Op: ISD::FCOS , VT: MVT::f32, Action: Custom);
438 setOperationAction(Op: ISD::FPOW , VT: MVT::f32, Action: Custom);
439 setOperationAction(Op: ISD::FLOG, VT: MVT::f32, Action: Custom);
440 setOperationAction(Op: ISD::FLOG10, VT: MVT::f32, Action: Custom);
441 setOperationAction(Op: ISD::FEXP, VT: MVT::f32, Action: Custom);
442 }
443
444 if (Subtarget.hasSPE()) {
445 setOperationAction(Op: ISD::FMA , VT: MVT::f64, Action: Expand);
446 setOperationAction(Op: ISD::FMA , VT: MVT::f32, Action: Expand);
447 } else {
448 setOperationAction(Op: ISD::FMA , VT: MVT::f64, Action: Legal);
449 setOperationAction(Op: ISD::FMA , VT: MVT::f32, Action: Legal);
450 setOperationAction(Op: ISD::GET_ROUNDING, VT: MVT::i32, Action: Custom);
451 setOperationAction(Op: ISD::SET_ROUNDING, VT: MVT::Other, Action: Custom);
452 }
453
454 if (Subtarget.hasSPE())
455 setLoadExtAction(ExtType: ISD::EXTLOAD, ValVT: MVT::f64, MemVT: MVT::f32, Action: Expand);
456
457 // If we're enabling GP optimizations, use hardware square root
458 if (!Subtarget.hasFSQRT() && !(Subtarget.hasFRSQRTE() && Subtarget.hasFRE()))
459 setOperationAction(Op: ISD::FSQRT, VT: MVT::f64, Action: Expand);
460
461 if (!Subtarget.hasFSQRT() &&
462 !(Subtarget.hasFRSQRTES() && Subtarget.hasFRES()))
463 setOperationAction(Op: ISD::FSQRT, VT: MVT::f32, Action: Expand);
464
465 if (Subtarget.hasFCPSGN()) {
466 setOperationAction(Op: ISD::FCOPYSIGN, VT: MVT::f64, Action: Legal);
467 setOperationAction(Op: ISD::FCOPYSIGN, VT: MVT::f32, Action: Legal);
468 } else {
469 setOperationAction(Op: ISD::FCOPYSIGN, VT: MVT::f64, Action: Expand);
470 setOperationAction(Op: ISD::FCOPYSIGN, VT: MVT::f32, Action: Expand);
471 }
472
473 if (Subtarget.hasFPRND()) {
474 setOperationAction(Op: ISD::FFLOOR, VT: MVT::f64, Action: Legal);
475 setOperationAction(Op: ISD::FCEIL, VT: MVT::f64, Action: Legal);
476 setOperationAction(Op: ISD::FTRUNC, VT: MVT::f64, Action: Legal);
477 setOperationAction(Op: ISD::FROUND, VT: MVT::f64, Action: Legal);
478
479 setOperationAction(Op: ISD::FFLOOR, VT: MVT::f32, Action: Legal);
480 setOperationAction(Op: ISD::FCEIL, VT: MVT::f32, Action: Legal);
481 setOperationAction(Op: ISD::FTRUNC, VT: MVT::f32, Action: Legal);
482 setOperationAction(Op: ISD::FROUND, VT: MVT::f32, Action: Legal);
483 }
484
485 // Prior to P10, PowerPC does not have BSWAP, but we can use vector BSWAP
486 // instruction xxbrd to speed up scalar BSWAP64.
487 if (Subtarget.isISA3_1()) {
488 setOperationAction(Op: ISD::BSWAP, VT: MVT::i32, Action: Legal);
489 setOperationAction(Op: ISD::BSWAP, VT: MVT::i64, Action: Legal);
490 } else {
491 setOperationAction(Op: ISD::BSWAP, VT: MVT::i32, Action: Expand);
492 setOperationAction(Op: ISD::BSWAP, VT: MVT::i64,
493 Action: ((Subtarget.hasP8Vector()) && isPPC64) ? Custom
494 : Expand);
495 }
496
497 // CTPOP or CTTZ were introduced in P8/P9 respectively
498 if (Subtarget.isISA3_0()) {
499 setOperationAction(Op: ISD::CTTZ , VT: MVT::i32 , Action: Legal);
500 setOperationAction(Op: ISD::CTTZ , VT: MVT::i64 , Action: Legal);
501 } else {
502 setOperationAction(Op: ISD::CTTZ , VT: MVT::i32 , Action: Expand);
503 setOperationAction(Op: ISD::CTTZ , VT: MVT::i64 , Action: Expand);
504 }
505
506 if (Subtarget.hasPOPCNTD() == PPCSubtarget::POPCNTD_Fast) {
507 setOperationAction(Op: ISD::CTPOP, VT: MVT::i32 , Action: Legal);
508 setOperationAction(Op: ISD::CTPOP, VT: MVT::i64 , Action: Legal);
509 } else {
510 setOperationAction(Op: ISD::CTPOP, VT: MVT::i32 , Action: Expand);
511 setOperationAction(Op: ISD::CTPOP, VT: MVT::i64 , Action: Expand);
512 }
513
514 // PowerPC does not have ROTR
515 setOperationAction(Op: ISD::ROTR, VT: MVT::i32 , Action: Expand);
516 setOperationAction(Op: ISD::ROTR, VT: MVT::i64 , Action: Expand);
517
518 if (!Subtarget.useCRBits()) {
519 // PowerPC does not have Select
520 setOperationAction(Op: ISD::SELECT, VT: MVT::i32, Action: Expand);
521 setOperationAction(Op: ISD::SELECT, VT: MVT::i64, Action: Expand);
522 setOperationAction(Op: ISD::SELECT, VT: MVT::f32, Action: Expand);
523 setOperationAction(Op: ISD::SELECT, VT: MVT::f64, Action: Expand);
524 }
525
526 // PowerPC wants to turn select_cc of FP into fsel when possible.
527 setOperationAction(Op: ISD::SELECT_CC, VT: MVT::f32, Action: Custom);
528 setOperationAction(Op: ISD::SELECT_CC, VT: MVT::f64, Action: Custom);
529
530 // PowerPC wants to optimize integer setcc a bit
531 if (!Subtarget.useCRBits())
532 setOperationAction(Op: ISD::SETCC, VT: MVT::i32, Action: Custom);
533
534 if (Subtarget.hasFPU()) {
535 setOperationAction(Op: ISD::STRICT_FSETCC, VT: MVT::f32, Action: Legal);
536 setOperationAction(Op: ISD::STRICT_FSETCC, VT: MVT::f64, Action: Legal);
537 setOperationAction(Op: ISD::STRICT_FSETCC, VT: MVT::f128, Action: Legal);
538
539 setOperationAction(Op: ISD::STRICT_FSETCCS, VT: MVT::f32, Action: Legal);
540 setOperationAction(Op: ISD::STRICT_FSETCCS, VT: MVT::f64, Action: Legal);
541 setOperationAction(Op: ISD::STRICT_FSETCCS, VT: MVT::f128, Action: Legal);
542 }
543
544 // PowerPC does not have BRCOND which requires SetCC
545 if (!Subtarget.useCRBits())
546 setOperationAction(Op: ISD::BRCOND, VT: MVT::Other, Action: Expand);
547
548 setOperationAction(Op: ISD::BR_JT, VT: MVT::Other, Action: Expand);
549
550 if (Subtarget.hasSPE()) {
551 // SPE has built-in conversions
552 setOperationAction(Op: ISD::STRICT_FP_TO_SINT, VT: MVT::i32, Action: Legal);
553 setOperationAction(Op: ISD::STRICT_SINT_TO_FP, VT: MVT::i32, Action: Legal);
554 setOperationAction(Op: ISD::STRICT_UINT_TO_FP, VT: MVT::i32, Action: Legal);
555 setOperationAction(Op: ISD::FP_TO_SINT, VT: MVT::i32, Action: Legal);
556 setOperationAction(Op: ISD::SINT_TO_FP, VT: MVT::i32, Action: Legal);
557 setOperationAction(Op: ISD::UINT_TO_FP, VT: MVT::i32, Action: Legal);
558
559 // SPE supports signaling compare of f32/f64.
560 // But it doesn't comply IEEE-754 rules for comparing
561 // special values like NaNs, Infs.
562 setOperationAction(Op: ISD::SETCC, VT: MVT::f32, Action: Custom);
563 setOperationAction(Op: ISD::SETCC, VT: MVT::f64, Action: Custom);
564 setOperationAction(Op: ISD::STRICT_FSETCCS, VT: MVT::f32, Action: Custom);
565 setOperationAction(Op: ISD::STRICT_FSETCCS, VT: MVT::f64, Action: Custom);
566 setOperationAction(Op: ISD::STRICT_FSETCC, VT: MVT::f32, Action: Custom);
567 setOperationAction(Op: ISD::STRICT_FSETCC, VT: MVT::f64, Action: Custom);
568 setOperationAction(Op: ISD::BR_CC, VT: MVT::f32, Action: Custom);
569 setOperationAction(Op: ISD::BR_CC, VT: MVT::f64, Action: Custom);
570 } else {
571 // PowerPC turns FP_TO_SINT into FCTIWZ and some load/stores.
572 setOperationAction(Op: ISD::STRICT_FP_TO_SINT, VT: MVT::i32, Action: Custom);
573 setOperationAction(Op: ISD::FP_TO_SINT, VT: MVT::i32, Action: Custom);
574
575 // PowerPC does not have [U|S]INT_TO_FP
576 setOperationAction(Op: ISD::STRICT_SINT_TO_FP, VT: MVT::i32, Action: Expand);
577 setOperationAction(Op: ISD::STRICT_UINT_TO_FP, VT: MVT::i32, Action: Expand);
578 setOperationAction(Op: ISD::SINT_TO_FP, VT: MVT::i32, Action: Expand);
579 setOperationAction(Op: ISD::UINT_TO_FP, VT: MVT::i32, Action: Expand);
580 }
581
582 if (Subtarget.hasDirectMove() && isPPC64) {
583 setOperationAction(Op: ISD::BITCAST, VT: MVT::f32, Action: Legal);
584 setOperationAction(Op: ISD::BITCAST, VT: MVT::i32, Action: Legal);
585 setOperationAction(Op: ISD::BITCAST, VT: MVT::i64, Action: Legal);
586 setOperationAction(Op: ISD::BITCAST, VT: MVT::f64, Action: Legal);
587
588 setOperationAction(Op: ISD::STRICT_LRINT, VT: MVT::f64, Action: Custom);
589 setOperationAction(Op: ISD::STRICT_LRINT, VT: MVT::f32, Action: Custom);
590 setOperationAction(Op: ISD::STRICT_LLRINT, VT: MVT::f64, Action: Custom);
591 setOperationAction(Op: ISD::STRICT_LLRINT, VT: MVT::f32, Action: Custom);
592 setOperationAction(Op: ISD::STRICT_LROUND, VT: MVT::f64, Action: Custom);
593 setOperationAction(Op: ISD::STRICT_LROUND, VT: MVT::f32, Action: Custom);
594 setOperationAction(Op: ISD::STRICT_LLROUND, VT: MVT::f64, Action: Custom);
595 setOperationAction(Op: ISD::STRICT_LLROUND, VT: MVT::f32, Action: Custom);
596 } else {
597 setOperationAction(Op: ISD::BITCAST, VT: MVT::f32, Action: Expand);
598 setOperationAction(Op: ISD::BITCAST, VT: MVT::i32, Action: Expand);
599 setOperationAction(Op: ISD::BITCAST, VT: MVT::i64, Action: Expand);
600 setOperationAction(Op: ISD::BITCAST, VT: MVT::f64, Action: Expand);
601 }
602
603 // We cannot sextinreg(i1). Expand to shifts.
604 setOperationAction(Op: ISD::SIGN_EXTEND_INREG, VT: MVT::i1, Action: Expand);
605
606 // Custom handling for PowerPC ucmp instruction
607 setOperationAction(Op: ISD::UCMP, VT: MVT::i32, Action: Custom);
608 setOperationAction(Op: ISD::UCMP, VT: MVT::i64, Action: isPPC64 ? Custom : Expand);
609
610 // NOTE: EH_SJLJ_SETJMP/_LONGJMP supported here is NOT intended to support
611 // SjLj exception handling but a light-weight setjmp/longjmp replacement to
612 // support continuation, user-level threading, and etc.. As a result, no
613 // other SjLj exception interfaces are implemented and please don't build
614 // your own exception handling based on them.
615 // LLVM/Clang supports zero-cost DWARF exception handling.
616 setOperationAction(Op: ISD::EH_SJLJ_SETJMP, VT: MVT::i32, Action: Custom);
617 setOperationAction(Op: ISD::EH_SJLJ_LONGJMP, VT: MVT::Other, Action: Custom);
618
619 // We want to legalize GlobalAddress and ConstantPool nodes into the
620 // appropriate instructions to materialize the address.
621 setOperationAction(Op: ISD::GlobalAddress, VT: MVT::i32, Action: Custom);
622 setOperationAction(Op: ISD::GlobalTLSAddress, VT: MVT::i32, Action: Custom);
623 setOperationAction(Op: ISD::BlockAddress, VT: MVT::i32, Action: Custom);
624 setOperationAction(Op: ISD::ConstantPool, VT: MVT::i32, Action: Custom);
625 setOperationAction(Op: ISD::JumpTable, VT: MVT::i32, Action: Custom);
626 setOperationAction(Op: ISD::GlobalAddress, VT: MVT::i64, Action: Custom);
627 setOperationAction(Op: ISD::GlobalTLSAddress, VT: MVT::i64, Action: Custom);
628 setOperationAction(Op: ISD::BlockAddress, VT: MVT::i64, Action: Custom);
629 setOperationAction(Op: ISD::ConstantPool, VT: MVT::i64, Action: Custom);
630 setOperationAction(Op: ISD::JumpTable, VT: MVT::i64, Action: Custom);
631
632 // TRAP is legal.
633 setOperationAction(Op: ISD::TRAP, VT: MVT::Other, Action: Legal);
634
635 // TRAMPOLINE is custom lowered.
636 setOperationAction(Op: ISD::INIT_TRAMPOLINE, VT: MVT::Other, Action: Custom);
637 setOperationAction(Op: ISD::ADJUST_TRAMPOLINE, VT: MVT::Other, Action: Custom);
638
639 // VASTART needs to be custom lowered to use the VarArgsFrameIndex
640 setOperationAction(Op: ISD::VASTART , VT: MVT::Other, Action: Custom);
641
642 if (Subtarget.is64BitELFABI()) {
643 // VAARG always uses double-word chunks, so promote anything smaller.
644 setOperationAction(Op: ISD::VAARG, VT: MVT::i1, Action: Promote);
645 AddPromotedToType(Opc: ISD::VAARG, OrigVT: MVT::i1, DestVT: MVT::i64);
646 setOperationAction(Op: ISD::VAARG, VT: MVT::i8, Action: Promote);
647 AddPromotedToType(Opc: ISD::VAARG, OrigVT: MVT::i8, DestVT: MVT::i64);
648 setOperationAction(Op: ISD::VAARG, VT: MVT::i16, Action: Promote);
649 AddPromotedToType(Opc: ISD::VAARG, OrigVT: MVT::i16, DestVT: MVT::i64);
650 setOperationAction(Op: ISD::VAARG, VT: MVT::i32, Action: Promote);
651 AddPromotedToType(Opc: ISD::VAARG, OrigVT: MVT::i32, DestVT: MVT::i64);
652 setOperationAction(Op: ISD::VAARG, VT: MVT::Other, Action: Expand);
653 } else if (Subtarget.is32BitELFABI()) {
654 // VAARG is custom lowered with the 32-bit SVR4 ABI.
655 setOperationAction(Op: ISD::VAARG, VT: MVT::Other, Action: Custom);
656 setOperationAction(Op: ISD::VAARG, VT: MVT::i64, Action: Custom);
657 } else
658 setOperationAction(Op: ISD::VAARG, VT: MVT::Other, Action: Expand);
659
660 // VACOPY is custom lowered with the 32-bit SVR4 ABI.
661 if (Subtarget.is32BitELFABI())
662 setOperationAction(Op: ISD::VACOPY , VT: MVT::Other, Action: Custom);
663 else
664 setOperationAction(Op: ISD::VACOPY , VT: MVT::Other, Action: Expand);
665
666 // Use the default implementation.
667 setOperationAction(Op: ISD::VAEND , VT: MVT::Other, Action: Expand);
668 setOperationAction(Op: ISD::STACKSAVE , VT: MVT::Other, Action: Expand);
669 setOperationAction(Op: ISD::STACKRESTORE , VT: MVT::Other, Action: Custom);
670 setOperationAction(Op: ISD::DYNAMIC_STACKALLOC, VT: MVT::i32 , Action: Custom);
671 setOperationAction(Op: ISD::DYNAMIC_STACKALLOC, VT: MVT::i64 , Action: Custom);
672 setOperationAction(Op: ISD::GET_DYNAMIC_AREA_OFFSET, VT: MVT::i32, Action: Custom);
673 setOperationAction(Op: ISD::GET_DYNAMIC_AREA_OFFSET, VT: MVT::i64, Action: Custom);
674 setOperationAction(Op: ISD::EH_DWARF_CFA, VT: MVT::i32, Action: Custom);
675 setOperationAction(Op: ISD::EH_DWARF_CFA, VT: MVT::i64, Action: Custom);
676
677 if (Subtarget.isISA3_0() && isPPC64) {
678 setOperationAction(Op: ISD::VP_STORE, VT: MVT::v16i1, Action: Custom);
679 setOperationAction(Op: ISD::VP_STORE, VT: MVT::v8i1, Action: Custom);
680 setOperationAction(Op: ISD::VP_STORE, VT: MVT::v4i1, Action: Custom);
681 setOperationAction(Op: ISD::VP_STORE, VT: MVT::v2i1, Action: Custom);
682 setOperationAction(Op: ISD::VP_LOAD, VT: MVT::v16i1, Action: Custom);
683 setOperationAction(Op: ISD::VP_LOAD, VT: MVT::v8i1, Action: Custom);
684 setOperationAction(Op: ISD::VP_LOAD, VT: MVT::v4i1, Action: Custom);
685 setOperationAction(Op: ISD::VP_LOAD, VT: MVT::v2i1, Action: Custom);
686 }
687
688 // We want to custom lower some of our intrinsics.
689 setOperationAction(Op: ISD::INTRINSIC_WO_CHAIN, VT: MVT::Other, Action: Custom);
690 setOperationAction(Op: ISD::INTRINSIC_WO_CHAIN, VT: MVT::f64, Action: Custom);
691 setOperationAction(Op: ISD::INTRINSIC_WO_CHAIN, VT: MVT::ppcf128, Action: Custom);
692 setOperationAction(Op: ISD::INTRINSIC_WO_CHAIN, VT: MVT::v4f32, Action: Custom);
693 setOperationAction(Op: ISD::INTRINSIC_WO_CHAIN, VT: MVT::v2f64, Action: Custom);
694
695 // To handle counter-based loop conditions.
696 setOperationAction(Op: ISD::INTRINSIC_W_CHAIN, VT: MVT::i1, Action: Custom);
697 setOperationAction(Op: ISD::INTRINSIC_W_CHAIN, VT: MVT::Other, Action: Custom);
698
699 setOperationAction(Op: ISD::INTRINSIC_VOID, VT: MVT::i8, Action: Custom);
700 setOperationAction(Op: ISD::INTRINSIC_VOID, VT: MVT::i16, Action: Custom);
701 setOperationAction(Op: ISD::INTRINSIC_VOID, VT: MVT::i32, Action: Custom);
702 setOperationAction(Op: ISD::INTRINSIC_VOID, VT: MVT::Other, Action: Custom);
703
704 // Comparisons that require checking two conditions.
705 if (Subtarget.hasSPE()) {
706 setCondCodeAction(CCs: ISD::SETO, VT: MVT::f32, Action: Expand);
707 setCondCodeAction(CCs: ISD::SETO, VT: MVT::f64, Action: Expand);
708 setCondCodeAction(CCs: ISD::SETUO, VT: MVT::f32, Action: Expand);
709 setCondCodeAction(CCs: ISD::SETUO, VT: MVT::f64, Action: Expand);
710 }
711 setCondCodeAction(CCs: ISD::SETULT, VT: MVT::f32, Action: Expand);
712 setCondCodeAction(CCs: ISD::SETULT, VT: MVT::f64, Action: Expand);
713 setCondCodeAction(CCs: ISD::SETUGT, VT: MVT::f32, Action: Expand);
714 setCondCodeAction(CCs: ISD::SETUGT, VT: MVT::f64, Action: Expand);
715 setCondCodeAction(CCs: ISD::SETUEQ, VT: MVT::f32, Action: Expand);
716 setCondCodeAction(CCs: ISD::SETUEQ, VT: MVT::f64, Action: Expand);
717 setCondCodeAction(CCs: ISD::SETOGE, VT: MVT::f32, Action: Expand);
718 setCondCodeAction(CCs: ISD::SETOGE, VT: MVT::f64, Action: Expand);
719 setCondCodeAction(CCs: ISD::SETOLE, VT: MVT::f32, Action: Expand);
720 setCondCodeAction(CCs: ISD::SETOLE, VT: MVT::f64, Action: Expand);
721 setCondCodeAction(CCs: ISD::SETONE, VT: MVT::f32, Action: Expand);
722 setCondCodeAction(CCs: ISD::SETONE, VT: MVT::f64, Action: Expand);
723
724 setOperationAction(Op: ISD::STRICT_FP_EXTEND, VT: MVT::f32, Action: Legal);
725 setOperationAction(Op: ISD::STRICT_FP_EXTEND, VT: MVT::f64, Action: Legal);
726
727 if (Subtarget.has64BitSupport()) {
728 // They also have instructions for converting between i64 and fp.
729 setOperationAction(Op: ISD::STRICT_FP_TO_SINT, VT: MVT::i64, Action: Custom);
730 setOperationAction(Op: ISD::STRICT_FP_TO_UINT, VT: MVT::i64, Action: Expand);
731 setOperationAction(Op: ISD::STRICT_SINT_TO_FP, VT: MVT::i64, Action: Custom);
732 setOperationAction(Op: ISD::STRICT_UINT_TO_FP, VT: MVT::i64, Action: Expand);
733 setOperationAction(Op: ISD::FP_TO_SINT, VT: MVT::i64, Action: Custom);
734 setOperationAction(Op: ISD::FP_TO_UINT, VT: MVT::i64, Action: Expand);
735 setOperationAction(Op: ISD::SINT_TO_FP, VT: MVT::i64, Action: Custom);
736 setOperationAction(Op: ISD::UINT_TO_FP, VT: MVT::i64, Action: Expand);
737 // This is just the low 32 bits of a (signed) fp->i64 conversion.
738 // We cannot do this with Promote because i64 is not a legal type.
739 setOperationAction(Op: ISD::STRICT_FP_TO_UINT, VT: MVT::i32, Action: Custom);
740 setOperationAction(Op: ISD::FP_TO_UINT, VT: MVT::i32, Action: Custom);
741
742 if (Subtarget.hasLFIWAX() || isPPC64) {
743 setOperationAction(Op: ISD::SINT_TO_FP, VT: MVT::i32, Action: Custom);
744 setOperationAction(Op: ISD::STRICT_SINT_TO_FP, VT: MVT::i32, Action: Custom);
745 }
746 } else {
747 // PowerPC does not have FP_TO_UINT on 32-bit implementations.
748 if (Subtarget.hasSPE()) {
749 setOperationAction(Op: ISD::STRICT_FP_TO_UINT, VT: MVT::i32, Action: Legal);
750 setOperationAction(Op: ISD::FP_TO_UINT, VT: MVT::i32, Action: Legal);
751 } else {
752 setOperationAction(Op: ISD::STRICT_FP_TO_UINT, VT: MVT::i32, Action: Expand);
753 setOperationAction(Op: ISD::FP_TO_UINT, VT: MVT::i32, Action: Expand);
754 }
755 }
756
757 // With the instructions enabled under FPCVT, we can do everything.
758 if (Subtarget.hasFPCVT()) {
759 if (Subtarget.has64BitSupport()) {
760 setOperationAction(Op: ISD::STRICT_FP_TO_SINT, VT: MVT::i64, Action: Custom);
761 setOperationAction(Op: ISD::STRICT_FP_TO_UINT, VT: MVT::i64, Action: Custom);
762 setOperationAction(Op: ISD::STRICT_SINT_TO_FP, VT: MVT::i64, Action: Custom);
763 setOperationAction(Op: ISD::STRICT_UINT_TO_FP, VT: MVT::i64, Action: Custom);
764 setOperationAction(Op: ISD::FP_TO_SINT, VT: MVT::i64, Action: Custom);
765 setOperationAction(Op: ISD::FP_TO_UINT, VT: MVT::i64, Action: Custom);
766 setOperationAction(Op: ISD::SINT_TO_FP, VT: MVT::i64, Action: Custom);
767 setOperationAction(Op: ISD::UINT_TO_FP, VT: MVT::i64, Action: Custom);
768 }
769
770 setOperationAction(Op: ISD::STRICT_FP_TO_SINT, VT: MVT::i32, Action: Custom);
771 setOperationAction(Op: ISD::STRICT_FP_TO_UINT, VT: MVT::i32, Action: Custom);
772 setOperationAction(Op: ISD::STRICT_SINT_TO_FP, VT: MVT::i32, Action: Custom);
773 setOperationAction(Op: ISD::STRICT_UINT_TO_FP, VT: MVT::i32, Action: Custom);
774 setOperationAction(Op: ISD::FP_TO_SINT, VT: MVT::i32, Action: Custom);
775 setOperationAction(Op: ISD::FP_TO_UINT, VT: MVT::i32, Action: Custom);
776 setOperationAction(Op: ISD::SINT_TO_FP, VT: MVT::i32, Action: Custom);
777 setOperationAction(Op: ISD::UINT_TO_FP, VT: MVT::i32, Action: Custom);
778 }
779
780 if (Subtarget.use64BitRegs()) {
781 // 64-bit PowerPC implementations can support i64 types directly
782 addRegisterClass(VT: MVT::i64, RC: &PPC::G8RCRegClass);
783 // BUILD_PAIR can't be handled natively, and should be expanded to shl/or
784 setOperationAction(Op: ISD::BUILD_PAIR, VT: MVT::i64, Action: Expand);
785 // 64-bit PowerPC wants to expand i128 shifts itself.
786 setOperationAction(Op: ISD::SHL_PARTS, VT: MVT::i64, Action: Custom);
787 setOperationAction(Op: ISD::SRA_PARTS, VT: MVT::i64, Action: Custom);
788 setOperationAction(Op: ISD::SRL_PARTS, VT: MVT::i64, Action: Custom);
789 } else {
790 // 32-bit PowerPC wants to expand i64 shifts itself.
791 setOperationAction(Op: ISD::SHL_PARTS, VT: MVT::i32, Action: Custom);
792 setOperationAction(Op: ISD::SRA_PARTS, VT: MVT::i32, Action: Custom);
793 setOperationAction(Op: ISD::SRL_PARTS, VT: MVT::i32, Action: Custom);
794 }
795
796 // PowerPC has better expansions for funnel shifts than the generic
797 // TargetLowering::expandFunnelShift.
798 if (Subtarget.has64BitSupport()) {
799 setOperationAction(Op: ISD::FSHL, VT: MVT::i64, Action: Custom);
800 setOperationAction(Op: ISD::FSHR, VT: MVT::i64, Action: Custom);
801 }
802 setOperationAction(Op: ISD::FSHL, VT: MVT::i32, Action: Custom);
803 setOperationAction(Op: ISD::FSHR, VT: MVT::i32, Action: Custom);
804
805 if (Subtarget.hasVSX()) {
806 setOperationAction(Op: ISD::FMAXNUM_IEEE, VT: MVT::f64, Action: Legal);
807 setOperationAction(Op: ISD::FMAXNUM_IEEE, VT: MVT::f32, Action: Legal);
808 setOperationAction(Op: ISD::FMINNUM_IEEE, VT: MVT::f64, Action: Legal);
809 setOperationAction(Op: ISD::FMINNUM_IEEE, VT: MVT::f32, Action: Legal);
810 setOperationAction(Op: ISD::FMAXNUM, VT: MVT::f64, Action: Legal);
811 setOperationAction(Op: ISD::FMAXNUM, VT: MVT::f32, Action: Legal);
812 setOperationAction(Op: ISD::FMINNUM, VT: MVT::f64, Action: Legal);
813 setOperationAction(Op: ISD::FMINNUM, VT: MVT::f32, Action: Legal);
814 setOperationAction(Op: ISD::FCANONICALIZE, VT: MVT::f64, Action: Legal);
815 setOperationAction(Op: ISD::FCANONICALIZE, VT: MVT::f32, Action: Legal);
816 }
817
818 if (Subtarget.hasAltivec()) {
819 for (MVT VT : { MVT::v16i8, MVT::v8i16, MVT::v4i32 }) {
820 setOperationAction(Op: ISD::AVGCEILS, VT, Action: Legal);
821 setOperationAction(Op: ISD::AVGCEILU, VT, Action: Legal);
822 setOperationAction(Op: ISD::SADDSAT, VT, Action: Legal);
823 setOperationAction(Op: ISD::SSUBSAT, VT, Action: Legal);
824 setOperationAction(Op: ISD::UADDSAT, VT, Action: Legal);
825 setOperationAction(Op: ISD::USUBSAT, VT, Action: Legal);
826 }
827 // First set operation action for all vector types to expand. Then we
828 // will selectively turn on ones that can be effectively codegen'd.
829 for (MVT VT : MVT::fixedlen_vector_valuetypes()) {
830 // add/sub are legal for all supported vector VT's.
831 setOperationAction(Op: ISD::ADD, VT, Action: Legal);
832 setOperationAction(Op: ISD::SUB, VT, Action: Legal);
833
834 // For v2i64, these are only valid with P8Vector. This is corrected after
835 // the loop.
836 if (VT.getSizeInBits() <= 128 && VT.getScalarSizeInBits() <= 64) {
837 setOperationAction(Op: ISD::SMAX, VT, Action: Legal);
838 setOperationAction(Op: ISD::SMIN, VT, Action: Legal);
839 setOperationAction(Op: ISD::UMAX, VT, Action: Legal);
840 setOperationAction(Op: ISD::UMIN, VT, Action: Legal);
841 }
842 else {
843 setOperationAction(Op: ISD::SMAX, VT, Action: Expand);
844 setOperationAction(Op: ISD::SMIN, VT, Action: Expand);
845 setOperationAction(Op: ISD::UMAX, VT, Action: Expand);
846 setOperationAction(Op: ISD::UMIN, VT, Action: Expand);
847 }
848
849 if (Subtarget.hasVSX()) {
850 setOperationAction(Op: ISD::FMAXNUM_IEEE, VT, Action: Legal);
851 setOperationAction(Op: ISD::FMINNUM_IEEE, VT, Action: Legal);
852 setOperationAction(Op: ISD::FMAXNUM, VT, Action: Legal);
853 setOperationAction(Op: ISD::FMINNUM, VT, Action: Legal);
854 setOperationAction(Op: ISD::FCANONICALIZE, VT, Action: Legal);
855 }
856
857 // Vector instructions introduced in P8
858 if (Subtarget.hasP8Altivec() && (VT.SimpleTy != MVT::v1i128)) {
859 setOperationAction(Op: ISD::CTPOP, VT, Action: Legal);
860 setOperationAction(Op: ISD::CTLZ, VT, Action: Legal);
861 }
862 else {
863 setOperationAction(Op: ISD::CTPOP, VT, Action: Expand);
864 setOperationAction(Op: ISD::CTLZ, VT, Action: Expand);
865 }
866
867 // Vector instructions introduced in P9
868 if (Subtarget.hasP9Altivec() && (VT.SimpleTy != MVT::v1i128))
869 setOperationAction(Op: ISD::CTTZ, VT, Action: Legal);
870 else
871 setOperationAction(Op: ISD::CTTZ, VT, Action: Expand);
872
873 // We promote all shuffles to v16i8.
874 setOperationAction(Op: ISD::VECTOR_SHUFFLE, VT, Action: Promote);
875 AddPromotedToType (Opc: ISD::VECTOR_SHUFFLE, OrigVT: VT, DestVT: MVT::v16i8);
876
877 // We promote all non-typed operations to v4i32.
878 setOperationAction(Op: ISD::AND , VT, Action: Promote);
879 AddPromotedToType (Opc: ISD::AND , OrigVT: VT, DestVT: MVT::v4i32);
880 setOperationAction(Op: ISD::OR , VT, Action: Promote);
881 AddPromotedToType (Opc: ISD::OR , OrigVT: VT, DestVT: MVT::v4i32);
882 setOperationAction(Op: ISD::XOR , VT, Action: Promote);
883 AddPromotedToType (Opc: ISD::XOR , OrigVT: VT, DestVT: MVT::v4i32);
884 setOperationAction(Op: ISD::LOAD , VT, Action: Promote);
885 AddPromotedToType (Opc: ISD::LOAD , OrigVT: VT, DestVT: MVT::v4i32);
886 setOperationAction(Op: ISD::SELECT, VT, Action: Promote);
887 AddPromotedToType (Opc: ISD::SELECT, OrigVT: VT, DestVT: MVT::v4i32);
888 setOperationAction(Op: ISD::VSELECT, VT, Action: Legal);
889 setOperationAction(Op: ISD::SELECT_CC, VT, Action: Promote);
890 AddPromotedToType (Opc: ISD::SELECT_CC, OrigVT: VT, DestVT: MVT::v4i32);
891 setOperationAction(Op: ISD::STORE, VT, Action: Promote);
892 AddPromotedToType (Opc: ISD::STORE, OrigVT: VT, DestVT: MVT::v4i32);
893
894 // No other operations are legal.
895 setOperationAction(Op: ISD::MUL , VT, Action: Expand);
896 setOperationAction(Op: ISD::SDIV, VT, Action: Expand);
897 setOperationAction(Op: ISD::SREM, VT, Action: Expand);
898 setOperationAction(Op: ISD::UDIV, VT, Action: Expand);
899 setOperationAction(Op: ISD::UREM, VT, Action: Expand);
900 setOperationAction(Op: ISD::FDIV, VT, Action: Expand);
901 setOperationAction(Op: ISD::FREM, VT, Action: Expand);
902 setOperationAction(Op: ISD::FNEG, VT, Action: Expand);
903 setOperationAction(Op: ISD::FSQRT, VT, Action: Expand);
904 setOperationAction(Op: ISD::FLOG, VT, Action: Expand);
905 setOperationAction(Op: ISD::FLOG10, VT, Action: Expand);
906 setOperationAction(Op: ISD::FLOG2, VT, Action: Expand);
907 setOperationAction(Op: ISD::FEXP, VT, Action: Expand);
908 setOperationAction(Op: ISD::FEXP2, VT, Action: Expand);
909 setOperationAction(Op: ISD::FSIN, VT, Action: Expand);
910 setOperationAction(Op: ISD::FCOS, VT, Action: Expand);
911 setOperationAction(Op: ISD::FABS, VT, Action: Expand);
912 setOperationAction(Op: ISD::FFLOOR, VT, Action: Expand);
913 setOperationAction(Op: ISD::FCEIL, VT, Action: Expand);
914 setOperationAction(Op: ISD::FTRUNC, VT, Action: Expand);
915 setOperationAction(Op: ISD::FRINT, VT, Action: Expand);
916 setOperationAction(Op: ISD::FLDEXP, VT, Action: Expand);
917 setOperationAction(Op: ISD::FNEARBYINT, VT, Action: Expand);
918 setOperationAction(Op: ISD::EXTRACT_VECTOR_ELT, VT, Action: Expand);
919 setOperationAction(Op: ISD::INSERT_VECTOR_ELT, VT, Action: Expand);
920 setOperationAction(Op: ISD::BUILD_VECTOR, VT, Action: Expand);
921 setOperationAction(Op: ISD::MULHU, VT, Action: Expand);
922 setOperationAction(Op: ISD::MULHS, VT, Action: Expand);
923 setOperationAction(Op: ISD::UMUL_LOHI, VT, Action: Expand);
924 setOperationAction(Op: ISD::SMUL_LOHI, VT, Action: Expand);
925 setOperationAction(Op: ISD::UDIVREM, VT, Action: Expand);
926 setOperationAction(Op: ISD::SDIVREM, VT, Action: Expand);
927 setOperationAction(Op: ISD::SCALAR_TO_VECTOR, VT, Action: Expand);
928 setOperationAction(Op: ISD::FPOW, VT, Action: Expand);
929 setOperationAction(Op: ISD::BSWAP, VT, Action: Expand);
930 setOperationAction(Op: ISD::SIGN_EXTEND_INREG, VT, Action: Expand);
931 setOperationAction(Op: ISD::ROTL, VT, Action: Expand);
932 setOperationAction(Op: ISD::ROTR, VT, Action: Expand);
933
934 for (MVT InnerVT : MVT::fixedlen_vector_valuetypes()) {
935 setTruncStoreAction(ValVT: VT, MemVT: InnerVT, Action: Expand);
936 setLoadExtAction(ExtType: ISD::SEXTLOAD, ValVT: VT, MemVT: InnerVT, Action: Expand);
937 setLoadExtAction(ExtType: ISD::ZEXTLOAD, ValVT: VT, MemVT: InnerVT, Action: Expand);
938 setLoadExtAction(ExtType: ISD::EXTLOAD, ValVT: VT, MemVT: InnerVT, Action: Expand);
939 }
940 }
941 setOperationAction(Op: ISD::SELECT_CC, VT: MVT::v4i32, Action: Expand);
942 if (!Subtarget.hasP8Vector()) {
943 setOperationAction(Op: ISD::SMAX, VT: MVT::v2i64, Action: Expand);
944 setOperationAction(Op: ISD::SMIN, VT: MVT::v2i64, Action: Expand);
945 setOperationAction(Op: ISD::UMAX, VT: MVT::v2i64, Action: Expand);
946 setOperationAction(Op: ISD::UMIN, VT: MVT::v2i64, Action: Expand);
947 }
948
949 // We can custom expand all VECTOR_SHUFFLEs to VPERM, others we can handle
950 // with merges, splats, etc.
951 setOperationAction(Op: ISD::VECTOR_SHUFFLE, VT: MVT::v16i8, Action: Custom);
952
953 // Vector truncates to sub-word integers that fit in an Altivec/VSX register
954 // are cheap, so handle them before they get expanded to scalar.
955 setOperationAction(Op: ISD::TRUNCATE, VT: MVT::v8i8, Action: Custom);
956 setOperationAction(Op: ISD::TRUNCATE, VT: MVT::v4i8, Action: Custom);
957 setOperationAction(Op: ISD::TRUNCATE, VT: MVT::v2i8, Action: Custom);
958 setOperationAction(Op: ISD::TRUNCATE, VT: MVT::v4i16, Action: Custom);
959 setOperationAction(Op: ISD::TRUNCATE, VT: MVT::v2i16, Action: Custom);
960
961 setOperationAction(Op: ISD::AND , VT: MVT::v4i32, Action: Legal);
962 setOperationAction(Op: ISD::OR , VT: MVT::v4i32, Action: Legal);
963 setOperationAction(Op: ISD::XOR , VT: MVT::v4i32, Action: Legal);
964 setOperationAction(Op: ISD::LOAD , VT: MVT::v4i32, Action: Legal);
965 setOperationAction(Op: ISD::SELECT, VT: MVT::v4i32,
966 Action: Subtarget.useCRBits() ? Legal : Expand);
967 setOperationAction(Op: ISD::STORE , VT: MVT::v4i32, Action: Legal);
968 setOperationAction(Op: ISD::STRICT_FP_TO_SINT, VT: MVT::v4i32, Action: Legal);
969 setOperationAction(Op: ISD::STRICT_FP_TO_UINT, VT: MVT::v4i32, Action: Legal);
970 setOperationAction(Op: ISD::STRICT_SINT_TO_FP, VT: MVT::v4i32, Action: Legal);
971 setOperationAction(Op: ISD::STRICT_UINT_TO_FP, VT: MVT::v4i32, Action: Legal);
972 setOperationAction(Op: ISD::FP_TO_SINT, VT: MVT::v4i32, Action: Legal);
973 setOperationAction(Op: ISD::FP_TO_UINT, VT: MVT::v4i32, Action: Legal);
974 setOperationAction(Op: ISD::SINT_TO_FP, VT: MVT::v4i32, Action: Legal);
975 setOperationAction(Op: ISD::UINT_TO_FP, VT: MVT::v4i32, Action: Legal);
976 setOperationAction(Op: ISD::FFLOOR, VT: MVT::v4f32, Action: Legal);
977 setOperationAction(Op: ISD::FCEIL, VT: MVT::v4f32, Action: Legal);
978 setOperationAction(Op: ISD::FTRUNC, VT: MVT::v4f32, Action: Legal);
979 setOperationAction(Op: ISD::FNEARBYINT, VT: MVT::v4f32, Action: Legal);
980
981 // Custom lowering ROTL v1i128 to VECTOR_SHUFFLE v16i8.
982 setOperationAction(Op: ISD::ROTL, VT: MVT::v1i128, Action: Custom);
983 // With hasAltivec set, we can lower ISD::ROTL to vrl(b|h|w).
984 if (Subtarget.hasAltivec())
985 for (auto VT : {MVT::v4i32, MVT::v8i16, MVT::v16i8})
986 setOperationAction(Op: ISD::ROTL, VT, Action: Legal);
987 // With hasP8Altivec set, we can lower ISD::ROTL to vrld.
988 if (Subtarget.hasP8Altivec())
989 setOperationAction(Op: ISD::ROTL, VT: MVT::v2i64, Action: Legal);
990
991 addRegisterClass(VT: MVT::v4f32, RC: &PPC::VRRCRegClass);
992 addRegisterClass(VT: MVT::v4i32, RC: &PPC::VRRCRegClass);
993 addRegisterClass(VT: MVT::v8i16, RC: &PPC::VRRCRegClass);
994 addRegisterClass(VT: MVT::v16i8, RC: &PPC::VRRCRegClass);
995
996 setOperationAction(Op: ISD::MUL, VT: MVT::v4f32, Action: Legal);
997 setOperationAction(Op: ISD::FMA, VT: MVT::v4f32, Action: Legal);
998
999 if (Subtarget.hasVSX()) {
1000 setOperationAction(Op: ISD::FDIV, VT: MVT::v4f32, Action: Legal);
1001 setOperationAction(Op: ISD::FSQRT, VT: MVT::v4f32, Action: Legal);
1002 setOperationAction(Op: ISD::INSERT_VECTOR_ELT, VT: MVT::v2f64, Action: Custom);
1003 }
1004
1005 if (Subtarget.hasP8Altivec())
1006 setOperationAction(Op: ISD::MUL, VT: MVT::v4i32, Action: Legal);
1007 else
1008 setOperationAction(Op: ISD::MUL, VT: MVT::v4i32, Action: Custom);
1009
1010 if (Subtarget.isISA3_1()) {
1011 setOperationAction(Op: ISD::MUL, VT: MVT::v2i64, Action: Legal);
1012 setOperationAction(Op: ISD::MULHS, VT: MVT::v2i64, Action: Legal);
1013 setOperationAction(Op: ISD::MULHU, VT: MVT::v2i64, Action: Legal);
1014 setOperationAction(Op: ISD::MULHS, VT: MVT::v4i32, Action: Legal);
1015 setOperationAction(Op: ISD::MULHU, VT: MVT::v4i32, Action: Legal);
1016 setOperationAction(Op: ISD::UDIV, VT: MVT::v2i64, Action: Legal);
1017 setOperationAction(Op: ISD::SDIV, VT: MVT::v2i64, Action: Legal);
1018 setOperationAction(Op: ISD::UDIV, VT: MVT::v4i32, Action: Legal);
1019 setOperationAction(Op: ISD::SDIV, VT: MVT::v4i32, Action: Legal);
1020 setOperationAction(Op: ISD::UREM, VT: MVT::v2i64, Action: Legal);
1021 setOperationAction(Op: ISD::SREM, VT: MVT::v2i64, Action: Legal);
1022 setOperationAction(Op: ISD::UREM, VT: MVT::v4i32, Action: Legal);
1023 setOperationAction(Op: ISD::SREM, VT: MVT::v4i32, Action: Legal);
1024 setOperationAction(Op: ISD::UREM, VT: MVT::v1i128, Action: Legal);
1025 setOperationAction(Op: ISD::SREM, VT: MVT::v1i128, Action: Legal);
1026 setOperationAction(Op: ISD::UDIV, VT: MVT::v1i128, Action: Legal);
1027 setOperationAction(Op: ISD::SDIV, VT: MVT::v1i128, Action: Legal);
1028 setOperationAction(Op: ISD::ROTL, VT: MVT::v1i128, Action: Legal);
1029 }
1030
1031 setOperationAction(Op: ISD::MUL, VT: MVT::v8i16, Action: Legal);
1032 setOperationAction(Op: ISD::MUL, VT: MVT::v16i8, Action: Custom);
1033
1034 setOperationAction(Op: ISD::SCALAR_TO_VECTOR, VT: MVT::v4f32, Action: Custom);
1035 setOperationAction(Op: ISD::SCALAR_TO_VECTOR, VT: MVT::v4i32, Action: Custom);
1036 // LE is P8+/64-bit so direct moves are supported and these operations
1037 // are legal. The custom transformation requires 64-bit since we need a
1038 // pair of stores that will cover a 128-bit load for P10.
1039 if (!DisableP10StoreForward && isPPC64 && !Subtarget.isLittleEndian()) {
1040 setOperationAction(Op: ISD::SCALAR_TO_VECTOR, VT: MVT::v2i64, Action: Custom);
1041 setOperationAction(Op: ISD::SCALAR_TO_VECTOR, VT: MVT::v8i16, Action: Custom);
1042 setOperationAction(Op: ISD::SCALAR_TO_VECTOR, VT: MVT::v16i8, Action: Custom);
1043 }
1044
1045 setOperationAction(Op: ISD::BUILD_VECTOR, VT: MVT::v16i8, Action: Custom);
1046 setOperationAction(Op: ISD::BUILD_VECTOR, VT: MVT::v8i16, Action: Custom);
1047 setOperationAction(Op: ISD::BUILD_VECTOR, VT: MVT::v4i32, Action: Custom);
1048 setOperationAction(Op: ISD::BUILD_VECTOR, VT: MVT::v4f32, Action: Custom);
1049
1050 // Altivec does not contain unordered floating-point compare instructions
1051 setCondCodeAction(CCs: ISD::SETUO, VT: MVT::v4f32, Action: Expand);
1052 setCondCodeAction(CCs: ISD::SETUEQ, VT: MVT::v4f32, Action: Expand);
1053 setCondCodeAction(CCs: ISD::SETO, VT: MVT::v4f32, Action: Expand);
1054 setCondCodeAction(CCs: ISD::SETONE, VT: MVT::v4f32, Action: Expand);
1055
1056 if (Subtarget.hasVSX()) {
1057 setOperationAction(Op: ISD::SCALAR_TO_VECTOR, VT: MVT::v2f64, Action: Legal);
1058 setOperationAction(Op: ISD::EXTRACT_VECTOR_ELT, VT: MVT::v2f64, Action: Legal);
1059 if (Subtarget.hasP8Vector()) {
1060 setOperationAction(Op: ISD::SCALAR_TO_VECTOR, VT: MVT::v4f32, Action: Legal);
1061 setOperationAction(Op: ISD::EXTRACT_VECTOR_ELT, VT: MVT::v4f32, Action: Legal);
1062 }
1063 if (Subtarget.hasDirectMove() && isPPC64) {
1064 setOperationAction(Op: ISD::SCALAR_TO_VECTOR, VT: MVT::v16i8, Action: Legal);
1065 setOperationAction(Op: ISD::SCALAR_TO_VECTOR, VT: MVT::v8i16, Action: Legal);
1066 setOperationAction(Op: ISD::SCALAR_TO_VECTOR, VT: MVT::v4i32, Action: Legal);
1067 setOperationAction(Op: ISD::SCALAR_TO_VECTOR, VT: MVT::v2i64, Action: Legal);
1068 setOperationAction(Op: ISD::EXTRACT_VECTOR_ELT, VT: MVT::v16i8, Action: Legal);
1069 setOperationAction(Op: ISD::EXTRACT_VECTOR_ELT, VT: MVT::v8i16, Action: Legal);
1070 setOperationAction(Op: ISD::EXTRACT_VECTOR_ELT, VT: MVT::v4i32, Action: Legal);
1071 setOperationAction(Op: ISD::EXTRACT_VECTOR_ELT, VT: MVT::v2i64, Action: Legal);
1072 }
1073 setOperationAction(Op: ISD::EXTRACT_VECTOR_ELT, VT: MVT::v2f64, Action: Legal);
1074
1075 // The nearbyint variants are not allowed to raise the inexact exception
1076 // so we can only code-gen them with fpexcept.ignore.
1077 setOperationAction(Op: ISD::STRICT_FNEARBYINT, VT: MVT::f64, Action: Custom);
1078 setOperationAction(Op: ISD::STRICT_FNEARBYINT, VT: MVT::f32, Action: Custom);
1079 setOperationAction(Op: ISD::STRICT_FNEARBYINT, VT: MVT::v2f64, Action: Custom);
1080 setOperationAction(Op: ISD::STRICT_FNEARBYINT, VT: MVT::v4f32, Action: Custom);
1081
1082 setOperationAction(Op: ISD::FFLOOR, VT: MVT::v2f64, Action: Legal);
1083 setOperationAction(Op: ISD::FCEIL, VT: MVT::v2f64, Action: Legal);
1084 setOperationAction(Op: ISD::FTRUNC, VT: MVT::v2f64, Action: Legal);
1085 setOperationAction(Op: ISD::FRINT, VT: MVT::v2f64, Action: Legal);
1086 setOperationAction(Op: ISD::FROUND, VT: MVT::v2f64, Action: Legal);
1087 setOperationAction(Op: ISD::FROUND, VT: MVT::f64, Action: Legal);
1088 setOperationAction(Op: ISD::FRINT, VT: MVT::f64, Action: Legal);
1089
1090 setOperationAction(Op: ISD::FRINT, VT: MVT::v4f32, Action: Legal);
1091 setOperationAction(Op: ISD::FROUND, VT: MVT::v4f32, Action: Legal);
1092 setOperationAction(Op: ISD::FROUND, VT: MVT::f32, Action: Legal);
1093 setOperationAction(Op: ISD::FRINT, VT: MVT::f32, Action: Legal);
1094
1095 setOperationAction(Op: ISD::MUL, VT: MVT::v2f64, Action: Legal);
1096 setOperationAction(Op: ISD::FMA, VT: MVT::v2f64, Action: Legal);
1097
1098 setOperationAction(Op: ISD::FDIV, VT: MVT::v2f64, Action: Legal);
1099 setOperationAction(Op: ISD::FSQRT, VT: MVT::v2f64, Action: Legal);
1100
1101 // Share the Altivec comparison restrictions.
1102 setCondCodeAction(CCs: ISD::SETUO, VT: MVT::v2f64, Action: Expand);
1103 setCondCodeAction(CCs: ISD::SETUEQ, VT: MVT::v2f64, Action: Expand);
1104 setCondCodeAction(CCs: ISD::SETO, VT: MVT::v2f64, Action: Expand);
1105 setCondCodeAction(CCs: ISD::SETONE, VT: MVT::v2f64, Action: Expand);
1106
1107 setOperationAction(Op: ISD::LOAD, VT: MVT::v2f64, Action: Legal);
1108 setOperationAction(Op: ISD::STORE, VT: MVT::v2f64, Action: Legal);
1109
1110 setOperationAction(Op: ISD::VECTOR_SHUFFLE, VT: MVT::v2f64, Action: Custom);
1111
1112 if (Subtarget.hasP8Vector())
1113 addRegisterClass(VT: MVT::f32, RC: &PPC::VSSRCRegClass);
1114
1115 addRegisterClass(VT: MVT::f64, RC: &PPC::VSFRCRegClass);
1116
1117 addRegisterClass(VT: MVT::v4i32, RC: &PPC::VSRCRegClass);
1118 addRegisterClass(VT: MVT::v4f32, RC: &PPC::VSRCRegClass);
1119 addRegisterClass(VT: MVT::v2f64, RC: &PPC::VSRCRegClass);
1120
1121 if (Subtarget.hasP8Altivec()) {
1122 setOperationAction(Op: ISD::SHL, VT: MVT::v2i64, Action: Legal);
1123 setOperationAction(Op: ISD::SRA, VT: MVT::v2i64, Action: Legal);
1124 setOperationAction(Op: ISD::SRL, VT: MVT::v2i64, Action: Legal);
1125
1126 // 128 bit shifts can be accomplished via 3 instructions for SHL and
1127 // SRL, but not for SRA because of the instructions available:
1128 // VS{RL} and VS{RL}O. However, due to direct move costs, it's not worth
1129 // doing.
1130 setOperationAction(Op: ISD::SHL, VT: MVT::v1i128, Action: Expand);
1131 setOperationAction(Op: ISD::SRL, VT: MVT::v1i128, Action: Expand);
1132 setOperationAction(Op: ISD::SRA, VT: MVT::v1i128, Action: Expand);
1133
1134 setOperationAction(Op: ISD::SETCC, VT: MVT::v2i64, Action: Legal);
1135 }
1136 else {
1137 setOperationAction(Op: ISD::SHL, VT: MVT::v2i64, Action: Expand);
1138 setOperationAction(Op: ISD::SRA, VT: MVT::v2i64, Action: Expand);
1139 setOperationAction(Op: ISD::SRL, VT: MVT::v2i64, Action: Expand);
1140
1141 setOperationAction(Op: ISD::SETCC, VT: MVT::v2i64, Action: Custom);
1142
1143 // VSX v2i64 only supports non-arithmetic operations.
1144 setOperationAction(Op: ISD::ADD, VT: MVT::v2i64, Action: Expand);
1145 setOperationAction(Op: ISD::SUB, VT: MVT::v2i64, Action: Expand);
1146 }
1147
1148 if (Subtarget.isISA3_1())
1149 setOperationAction(Op: ISD::SETCC, VT: MVT::v1i128, Action: Legal);
1150 else
1151 setOperationAction(Op: ISD::SETCC, VT: MVT::v1i128, Action: Expand);
1152
1153 setOperationAction(Op: ISD::LOAD, VT: MVT::v2i64, Action: Promote);
1154 AddPromotedToType (Opc: ISD::LOAD, OrigVT: MVT::v2i64, DestVT: MVT::v2f64);
1155 setOperationAction(Op: ISD::STORE, VT: MVT::v2i64, Action: Promote);
1156 AddPromotedToType (Opc: ISD::STORE, OrigVT: MVT::v2i64, DestVT: MVT::v2f64);
1157
1158 setOperationAction(Op: ISD::VECTOR_SHUFFLE, VT: MVT::v2i64, Action: Custom);
1159
1160 setOperationAction(Op: ISD::STRICT_SINT_TO_FP, VT: MVT::v2i64, Action: Legal);
1161 setOperationAction(Op: ISD::STRICT_UINT_TO_FP, VT: MVT::v2i64, Action: Legal);
1162 setOperationAction(Op: ISD::STRICT_FP_TO_SINT, VT: MVT::v2i64, Action: Legal);
1163 setOperationAction(Op: ISD::STRICT_FP_TO_UINT, VT: MVT::v2i64, Action: Legal);
1164 setOperationAction(Op: ISD::SINT_TO_FP, VT: MVT::v2i64, Action: Legal);
1165 setOperationAction(Op: ISD::UINT_TO_FP, VT: MVT::v2i64, Action: Legal);
1166 setOperationAction(Op: ISD::FP_TO_SINT, VT: MVT::v2i64, Action: Legal);
1167 setOperationAction(Op: ISD::FP_TO_UINT, VT: MVT::v2i64, Action: Legal);
1168
1169 // Custom handling for partial vectors of integers converted to
1170 // floating point. We already have optimal handling for v2i32 through
1171 // the DAG combine, so those aren't necessary.
1172 setOperationAction(Op: ISD::STRICT_UINT_TO_FP, VT: MVT::v2i8, Action: Custom);
1173 setOperationAction(Op: ISD::STRICT_UINT_TO_FP, VT: MVT::v4i8, Action: Custom);
1174 setOperationAction(Op: ISD::STRICT_UINT_TO_FP, VT: MVT::v2i16, Action: Custom);
1175 setOperationAction(Op: ISD::STRICT_UINT_TO_FP, VT: MVT::v4i16, Action: Custom);
1176 setOperationAction(Op: ISD::STRICT_SINT_TO_FP, VT: MVT::v2i8, Action: Custom);
1177 setOperationAction(Op: ISD::STRICT_SINT_TO_FP, VT: MVT::v4i8, Action: Custom);
1178 setOperationAction(Op: ISD::STRICT_SINT_TO_FP, VT: MVT::v2i16, Action: Custom);
1179 setOperationAction(Op: ISD::STRICT_SINT_TO_FP, VT: MVT::v4i16, Action: Custom);
1180 setOperationAction(Op: ISD::UINT_TO_FP, VT: MVT::v2i8, Action: Custom);
1181 setOperationAction(Op: ISD::UINT_TO_FP, VT: MVT::v4i8, Action: Custom);
1182 setOperationAction(Op: ISD::UINT_TO_FP, VT: MVT::v2i16, Action: Custom);
1183 setOperationAction(Op: ISD::UINT_TO_FP, VT: MVT::v4i16, Action: Custom);
1184 setOperationAction(Op: ISD::SINT_TO_FP, VT: MVT::v2i8, Action: Custom);
1185 setOperationAction(Op: ISD::SINT_TO_FP, VT: MVT::v4i8, Action: Custom);
1186 setOperationAction(Op: ISD::SINT_TO_FP, VT: MVT::v2i16, Action: Custom);
1187 setOperationAction(Op: ISD::SINT_TO_FP, VT: MVT::v4i16, Action: Custom);
1188
1189 setOperationAction(Op: ISD::FNEG, VT: MVT::v4f32, Action: Legal);
1190 setOperationAction(Op: ISD::FNEG, VT: MVT::v2f64, Action: Legal);
1191 setOperationAction(Op: ISD::FABS, VT: MVT::v4f32, Action: Legal);
1192 setOperationAction(Op: ISD::FABS, VT: MVT::v2f64, Action: Legal);
1193 setOperationAction(Op: ISD::FCOPYSIGN, VT: MVT::v4f32, Action: Legal);
1194 setOperationAction(Op: ISD::FCOPYSIGN, VT: MVT::v2f64, Action: Legal);
1195
1196 setOperationAction(Op: ISD::BUILD_VECTOR, VT: MVT::v2i64, Action: Custom);
1197 setOperationAction(Op: ISD::BUILD_VECTOR, VT: MVT::v2f64, Action: Custom);
1198
    // Handle constrained floating-point operations on vectors.
    // The predicate is `hasVSX` because Altivec instructions do not raise
    // exceptions, whereas VSX vector instructions do.
1202 setOperationAction(Op: ISD::STRICT_FADD, VT: MVT::v4f32, Action: Legal);
1203 setOperationAction(Op: ISD::STRICT_FSUB, VT: MVT::v4f32, Action: Legal);
1204 setOperationAction(Op: ISD::STRICT_FMUL, VT: MVT::v4f32, Action: Legal);
1205 setOperationAction(Op: ISD::STRICT_FDIV, VT: MVT::v4f32, Action: Legal);
1206 setOperationAction(Op: ISD::STRICT_FMA, VT: MVT::v4f32, Action: Legal);
1207 setOperationAction(Op: ISD::STRICT_FSQRT, VT: MVT::v4f32, Action: Legal);
1208 setOperationAction(Op: ISD::STRICT_FMAXNUM, VT: MVT::v4f32, Action: Legal);
1209 setOperationAction(Op: ISD::STRICT_FMINNUM, VT: MVT::v4f32, Action: Legal);
1210 setOperationAction(Op: ISD::STRICT_FRINT, VT: MVT::v4f32, Action: Legal);
1211 setOperationAction(Op: ISD::STRICT_FFLOOR, VT: MVT::v4f32, Action: Legal);
1212 setOperationAction(Op: ISD::STRICT_FCEIL, VT: MVT::v4f32, Action: Legal);
1213 setOperationAction(Op: ISD::STRICT_FTRUNC, VT: MVT::v4f32, Action: Legal);
1214 setOperationAction(Op: ISD::STRICT_FROUND, VT: MVT::v4f32, Action: Legal);
1215
1216 setOperationAction(Op: ISD::STRICT_FADD, VT: MVT::v2f64, Action: Legal);
1217 setOperationAction(Op: ISD::STRICT_FSUB, VT: MVT::v2f64, Action: Legal);
1218 setOperationAction(Op: ISD::STRICT_FMUL, VT: MVT::v2f64, Action: Legal);
1219 setOperationAction(Op: ISD::STRICT_FDIV, VT: MVT::v2f64, Action: Legal);
1220 setOperationAction(Op: ISD::STRICT_FMA, VT: MVT::v2f64, Action: Legal);
1221 setOperationAction(Op: ISD::STRICT_FSQRT, VT: MVT::v2f64, Action: Legal);
1222 setOperationAction(Op: ISD::STRICT_FMAXNUM, VT: MVT::v2f64, Action: Legal);
1223 setOperationAction(Op: ISD::STRICT_FMINNUM, VT: MVT::v2f64, Action: Legal);
1224 setOperationAction(Op: ISD::STRICT_FRINT, VT: MVT::v2f64, Action: Legal);
1225 setOperationAction(Op: ISD::STRICT_FFLOOR, VT: MVT::v2f64, Action: Legal);
1226 setOperationAction(Op: ISD::STRICT_FCEIL, VT: MVT::v2f64, Action: Legal);
1227 setOperationAction(Op: ISD::STRICT_FTRUNC, VT: MVT::v2f64, Action: Legal);
1228 setOperationAction(Op: ISD::STRICT_FROUND, VT: MVT::v2f64, Action: Legal);
1229
1230 addRegisterClass(VT: MVT::v2i64, RC: &PPC::VSRCRegClass);
1231 addRegisterClass(VT: MVT::f128, RC: &PPC::VRRCRegClass);
1232
1233 for (MVT FPT : MVT::fp_valuetypes())
1234 setLoadExtAction(ExtType: ISD::EXTLOAD, ValVT: MVT::f128, MemVT: FPT, Action: Expand);
1235
1236 // Expand the SELECT to SELECT_CC
1237 setOperationAction(Op: ISD::SELECT, VT: MVT::f128, Action: Expand);
1238
1239 setTruncStoreAction(ValVT: MVT::f128, MemVT: MVT::f64, Action: Expand);
1240 setTruncStoreAction(ValVT: MVT::f128, MemVT: MVT::f32, Action: Expand);
1241
1242 // No implementation for these ops for PowerPC.
1243 setOperationAction(Op: ISD::FSINCOS, VT: MVT::f128, Action: Expand);
1244 setOperationAction(Op: ISD::FSIN, VT: MVT::f128, Action: Expand);
1245 setOperationAction(Op: ISD::FCOS, VT: MVT::f128, Action: Expand);
1246 setOperationAction(Op: ISD::FPOW, VT: MVT::f128, Action: Expand);
1247 setOperationAction(Op: ISD::FPOWI, VT: MVT::f128, Action: Expand);
1248 setOperationAction(Op: ISD::FREM, VT: MVT::f128, Action: LibCall);
1249 }
1250
1251 if (Subtarget.hasP8Altivec()) {
1252 addRegisterClass(VT: MVT::v2i64, RC: &PPC::VRRCRegClass);
1253 addRegisterClass(VT: MVT::v1i128, RC: &PPC::VRRCRegClass);
1254 }
1255
1256 if (Subtarget.hasP9Vector()) {
1257 setOperationAction(Op: ISD::INSERT_VECTOR_ELT, VT: MVT::v4i32, Action: Custom);
1258 setOperationAction(Op: ISD::INSERT_VECTOR_ELT, VT: MVT::v4f32, Action: Custom);
1259
1260 // Test data class instructions store results in CR bits.
1261 if (Subtarget.useCRBits()) {
1262 setOperationAction(Op: ISD::IS_FPCLASS, VT: MVT::f32, Action: Custom);
1263 setOperationAction(Op: ISD::IS_FPCLASS, VT: MVT::f64, Action: Custom);
1264 setOperationAction(Op: ISD::IS_FPCLASS, VT: MVT::f128, Action: Custom);
1265 setOperationAction(Op: ISD::IS_FPCLASS, VT: MVT::ppcf128, Action: Custom);
1266 }
1267
1268 // 128 bit shifts can be accomplished via 3 instructions for SHL and
1269 // SRL, but not for SRA because of the instructions available:
1270 // VS{RL} and VS{RL}O.
1271 setOperationAction(Op: ISD::SHL, VT: MVT::v1i128, Action: Legal);
1272 setOperationAction(Op: ISD::SRL, VT: MVT::v1i128, Action: Legal);
1273 setOperationAction(Op: ISD::SRA, VT: MVT::v1i128, Action: Expand);
1274
1275 setOperationAction(Op: ISD::FADD, VT: MVT::f128, Action: Legal);
1276 setOperationAction(Op: ISD::FSUB, VT: MVT::f128, Action: Legal);
1277 setOperationAction(Op: ISD::FDIV, VT: MVT::f128, Action: Legal);
1278 setOperationAction(Op: ISD::FMUL, VT: MVT::f128, Action: Legal);
1279 setOperationAction(Op: ISD::FP_EXTEND, VT: MVT::f128, Action: Legal);
1280
1281 setOperationAction(Op: ISD::FMA, VT: MVT::f128, Action: Legal);
1282 setCondCodeAction(CCs: ISD::SETULT, VT: MVT::f128, Action: Expand);
1283 setCondCodeAction(CCs: ISD::SETUGT, VT: MVT::f128, Action: Expand);
1284 setCondCodeAction(CCs: ISD::SETUEQ, VT: MVT::f128, Action: Expand);
1285 setCondCodeAction(CCs: ISD::SETOGE, VT: MVT::f128, Action: Expand);
1286 setCondCodeAction(CCs: ISD::SETOLE, VT: MVT::f128, Action: Expand);
1287 setCondCodeAction(CCs: ISD::SETONE, VT: MVT::f128, Action: Expand);
1288
1289 setOperationAction(Op: ISD::FTRUNC, VT: MVT::f128, Action: Legal);
1290 setOperationAction(Op: ISD::FRINT, VT: MVT::f128, Action: Legal);
1291 setOperationAction(Op: ISD::FFLOOR, VT: MVT::f128, Action: Legal);
1292 setOperationAction(Op: ISD::FCEIL, VT: MVT::f128, Action: Legal);
1293 setOperationAction(Op: ISD::FNEARBYINT, VT: MVT::f128, Action: Legal);
1294 setOperationAction(Op: ISD::FROUND, VT: MVT::f128, Action: Legal);
1295
1296 setOperationAction(Op: ISD::FP_ROUND, VT: MVT::f64, Action: Legal);
1297 setOperationAction(Op: ISD::FP_ROUND, VT: MVT::f32, Action: Legal);
1298 setOperationAction(Op: ISD::BITCAST, VT: MVT::i128, Action: Custom);
1299
1300 // Handle constrained floating-point operations of fp128
1301 setOperationAction(Op: ISD::STRICT_FADD, VT: MVT::f128, Action: Legal);
1302 setOperationAction(Op: ISD::STRICT_FSUB, VT: MVT::f128, Action: Legal);
1303 setOperationAction(Op: ISD::STRICT_FMUL, VT: MVT::f128, Action: Legal);
1304 setOperationAction(Op: ISD::STRICT_FDIV, VT: MVT::f128, Action: Legal);
1305 setOperationAction(Op: ISD::STRICT_FMA, VT: MVT::f128, Action: Legal);
1306 setOperationAction(Op: ISD::STRICT_FSQRT, VT: MVT::f128, Action: Legal);
1307 setOperationAction(Op: ISD::STRICT_FP_EXTEND, VT: MVT::f128, Action: Legal);
1308 setOperationAction(Op: ISD::STRICT_FP_ROUND, VT: MVT::f64, Action: Legal);
1309 setOperationAction(Op: ISD::STRICT_FP_ROUND, VT: MVT::f32, Action: Legal);
1310 setOperationAction(Op: ISD::STRICT_FRINT, VT: MVT::f128, Action: Legal);
1311 setOperationAction(Op: ISD::STRICT_FNEARBYINT, VT: MVT::f128, Action: Legal);
1312 setOperationAction(Op: ISD::STRICT_FFLOOR, VT: MVT::f128, Action: Legal);
1313 setOperationAction(Op: ISD::STRICT_FCEIL, VT: MVT::f128, Action: Legal);
1314 setOperationAction(Op: ISD::STRICT_FTRUNC, VT: MVT::f128, Action: Legal);
1315 setOperationAction(Op: ISD::STRICT_FROUND, VT: MVT::f128, Action: Legal);
1316 setOperationAction(Op: ISD::FP_EXTEND, VT: MVT::v2f32, Action: Custom);
1317 setOperationAction(Op: ISD::BSWAP, VT: MVT::v8i16, Action: Legal);
1318 setOperationAction(Op: ISD::BSWAP, VT: MVT::v4i32, Action: Legal);
1319 setOperationAction(Op: ISD::BSWAP, VT: MVT::v2i64, Action: Legal);
1320 setOperationAction(Op: ISD::BSWAP, VT: MVT::v1i128, Action: Legal);
1321 } else if (Subtarget.hasVSX()) {
1322 setOperationAction(Op: ISD::LOAD, VT: MVT::f128, Action: Promote);
1323 setOperationAction(Op: ISD::STORE, VT: MVT::f128, Action: Promote);
1324
1325 AddPromotedToType(Opc: ISD::LOAD, OrigVT: MVT::f128, DestVT: MVT::v4i32);
1326 AddPromotedToType(Opc: ISD::STORE, OrigVT: MVT::f128, DestVT: MVT::v4i32);
1327
1328 // Set FADD/FSUB as libcall to avoid the legalizer to expand the
1329 // fp_to_uint and int_to_fp.
1330 setOperationAction(Op: ISD::FADD, VT: MVT::f128, Action: LibCall);
1331 setOperationAction(Op: ISD::FSUB, VT: MVT::f128, Action: LibCall);
1332
1333 setOperationAction(Op: ISD::FMUL, VT: MVT::f128, Action: Expand);
1334 setOperationAction(Op: ISD::FDIV, VT: MVT::f128, Action: Expand);
1335 setOperationAction(Op: ISD::FNEG, VT: MVT::f128, Action: Expand);
1336 setOperationAction(Op: ISD::FABS, VT: MVT::f128, Action: Expand);
1337 setOperationAction(Op: ISD::FSQRT, VT: MVT::f128, Action: Expand);
1338 setOperationAction(Op: ISD::FMA, VT: MVT::f128, Action: Expand);
1339 setOperationAction(Op: ISD::FCOPYSIGN, VT: MVT::f128, Action: Expand);
1340
1341 // Expand the fp_extend if the target type is fp128.
1342 setOperationAction(Op: ISD::FP_EXTEND, VT: MVT::f128, Action: Expand);
1343 setOperationAction(Op: ISD::STRICT_FP_EXTEND, VT: MVT::f128, Action: Expand);
1344
1345 // Expand the fp_round if the source type is fp128.
1346 for (MVT VT : {MVT::f32, MVT::f64}) {
1347 setOperationAction(Op: ISD::FP_ROUND, VT, Action: Custom);
1348 setOperationAction(Op: ISD::STRICT_FP_ROUND, VT, Action: Custom);
1349 }
1350
1351 setOperationAction(Op: ISD::SETCC, VT: MVT::f128, Action: Custom);
1352 setOperationAction(Op: ISD::STRICT_FSETCC, VT: MVT::f128, Action: Custom);
1353 setOperationAction(Op: ISD::STRICT_FSETCCS, VT: MVT::f128, Action: Custom);
1354 setOperationAction(Op: ISD::BR_CC, VT: MVT::f128, Action: Expand);
1355
1356 // Lower following f128 select_cc pattern:
1357 // select_cc x, y, tv, fv, cc -> select_cc (setcc x, y, cc), 0, tv, fv, NE
1358 setOperationAction(Op: ISD::SELECT_CC, VT: MVT::f128, Action: Custom);
1359
1360 // We need to handle f128 SELECT_CC with integer result type.
1361 setOperationAction(Op: ISD::SELECT_CC, VT: MVT::i32, Action: Custom);
1362 setOperationAction(Op: ISD::SELECT_CC, VT: MVT::i64, Action: isPPC64 ? Custom : Expand);
1363 }
1364
1365 if (Subtarget.hasP9Altivec()) {
1366 if (Subtarget.isISA3_1()) {
1367 setOperationAction(Op: ISD::INSERT_VECTOR_ELT, VT: MVT::v2i64, Action: Legal);
1368 setOperationAction(Op: ISD::INSERT_VECTOR_ELT, VT: MVT::v8i16, Action: Legal);
1369 setOperationAction(Op: ISD::INSERT_VECTOR_ELT, VT: MVT::v16i8, Action: Legal);
1370 setOperationAction(Op: ISD::INSERT_VECTOR_ELT, VT: MVT::v4i32, Action: Legal);
1371 } else {
1372 setOperationAction(Op: ISD::INSERT_VECTOR_ELT, VT: MVT::v8i16, Action: Custom);
1373 setOperationAction(Op: ISD::INSERT_VECTOR_ELT, VT: MVT::v16i8, Action: Custom);
1374 }
1375 setOperationAction(Op: ISD::SIGN_EXTEND_INREG, VT: MVT::v4i8, Action: Legal);
1376 setOperationAction(Op: ISD::SIGN_EXTEND_INREG, VT: MVT::v4i16, Action: Legal);
1377 setOperationAction(Op: ISD::SIGN_EXTEND_INREG, VT: MVT::v4i32, Action: Legal);
1378 setOperationAction(Op: ISD::SIGN_EXTEND_INREG, VT: MVT::v2i8, Action: Legal);
1379 setOperationAction(Op: ISD::SIGN_EXTEND_INREG, VT: MVT::v2i16, Action: Legal);
1380 setOperationAction(Op: ISD::SIGN_EXTEND_INREG, VT: MVT::v2i32, Action: Legal);
1381 setOperationAction(Op: ISD::SIGN_EXTEND_INREG, VT: MVT::v2i64, Action: Legal);
1382
1383 setOperationAction(Op: ISD::ABDU, VT: MVT::v16i8, Action: Legal);
1384 setOperationAction(Op: ISD::ABDU, VT: MVT::v8i16, Action: Legal);
1385 setOperationAction(Op: ISD::ABDU, VT: MVT::v4i32, Action: Legal);
1386 setOperationAction(Op: ISD::ABDS, VT: MVT::v4i32, Action: Legal);
1387 }
1388
1389 if (Subtarget.hasP10Vector()) {
1390 setOperationAction(Op: ISD::SELECT_CC, VT: MVT::f128, Action: Custom);
1391 }
1392
1393 setOperationAction(Op: ISD::PARTIAL_REDUCE_UMLA, VT: MVT::v16i32, Action: Custom);
1394 setPartialReduceMLAAction(Opc: ISD::PARTIAL_REDUCE_UMLA, AccVT: MVT::v4i32, InputVT: MVT::v8i16,
1395 Action: Legal);
1396 setPartialReduceMLAAction(Opc: ISD::PARTIAL_REDUCE_SMLA, AccVT: MVT::v4i32, InputVT: MVT::v8i16,
1397 Action: Legal);
1398 setPartialReduceMLAAction(Opc: ISD::PARTIAL_REDUCE_UMLA, AccVT: MVT::v4i32, InputVT: MVT::v16i8,
1399 Action: Legal);
1400 setPartialReduceMLAAction(Opc: ISD::PARTIAL_REDUCE_SUMLA, AccVT: MVT::v4i32, InputVT: MVT::v16i8,
1401 Action: Legal);
1402 }
1403
1404 if (Subtarget.pairedVectorMemops()) {
1405 addRegisterClass(VT: MVT::v256i1, RC: &PPC::VSRpRCRegClass);
1406 setOperationAction(Op: ISD::LOAD, VT: MVT::v256i1, Action: Custom);
1407 setOperationAction(Op: ISD::STORE, VT: MVT::v256i1, Action: Custom);
1408 }
1409 if (Subtarget.hasMMA()) {
1410 if (Subtarget.isISAFuture()) {
1411 addRegisterClass(VT: MVT::v512i1, RC: &PPC::WACCRCRegClass);
1412 addRegisterClass(VT: MVT::v1024i1, RC: &PPC::DMRRCRegClass);
1413 addRegisterClass(VT: MVT::v2048i1, RC: &PPC::DMRpRCRegClass);
1414 setOperationAction(Op: ISD::LOAD, VT: MVT::v1024i1, Action: Custom);
1415 setOperationAction(Op: ISD::STORE, VT: MVT::v1024i1, Action: Custom);
1416 setOperationAction(Op: ISD::LOAD, VT: MVT::v2048i1, Action: Custom);
1417 setOperationAction(Op: ISD::STORE, VT: MVT::v2048i1, Action: Custom);
1418 } else {
1419 addRegisterClass(VT: MVT::v512i1, RC: &PPC::UACCRCRegClass);
1420 }
1421 setOperationAction(Op: ISD::LOAD, VT: MVT::v512i1, Action: Custom);
1422 setOperationAction(Op: ISD::STORE, VT: MVT::v512i1, Action: Custom);
1423 setOperationAction(Op: ISD::BUILD_VECTOR, VT: MVT::v512i1, Action: Custom);
1424 }
1425
1426 if (Subtarget.has64BitSupport())
1427 setOperationAction(Op: ISD::PREFETCH, VT: MVT::Other, Action: Legal);
1428
1429 if (Subtarget.isISA3_1())
1430 setOperationAction(Op: ISD::SRA, VT: MVT::v1i128, Action: Legal);
1431
1432 setOperationAction(Op: ISD::READCYCLECOUNTER, VT: MVT::i64, Action: isPPC64 ? Legal : Custom);
1433
1434 if (!isPPC64) {
1435 setOperationAction(Op: ISD::ATOMIC_LOAD, VT: MVT::i64, Action: Expand);
1436 setOperationAction(Op: ISD::ATOMIC_STORE, VT: MVT::i64, Action: Expand);
1437 }
1438
1439 if (shouldInlineQuadwordAtomics()) {
1440 setOperationAction(Op: ISD::ATOMIC_LOAD, VT: MVT::i128, Action: Custom);
1441 setOperationAction(Op: ISD::ATOMIC_STORE, VT: MVT::i128, Action: Custom);
1442 setOperationAction(Op: ISD::INTRINSIC_VOID, VT: MVT::i128, Action: Custom);
1443 }
1444
1445 setBooleanContents(ZeroOrOneBooleanContent);
1446
1447 if (Subtarget.hasAltivec()) {
1448 // Altivec instructions set fields to all zeros or all ones.
1449 setBooleanVectorContents(ZeroOrNegativeOneBooleanContent);
1450 }
1451
1452 if (shouldInlineQuadwordAtomics())
1453 setMaxAtomicSizeInBitsSupported(128);
1454 else if (isPPC64)
1455 setMaxAtomicSizeInBitsSupported(64);
1456 else
1457 setMaxAtomicSizeInBitsSupported(32);
1458
1459 setStackPointerRegisterToSaveRestore(isPPC64 ? PPC::X1 : PPC::R1);
1460
1461 // We have target-specific dag combine patterns for the following nodes:
1462 setTargetDAGCombine({ISD::AND, ISD::ADD, ISD::XOR, ISD::SHL, ISD::SRA,
1463 ISD::SRL, ISD::MUL, ISD::FMA, ISD::SINT_TO_FP,
1464 ISD::BUILD_VECTOR});
1465 if (Subtarget.hasFPCVT())
1466 setTargetDAGCombine(ISD::UINT_TO_FP);
1467 setTargetDAGCombine({ISD::LOAD, ISD::STORE, ISD::BR_CC});
1468 if (Subtarget.useCRBits())
1469 setTargetDAGCombine(ISD::BRCOND);
1470 setTargetDAGCombine({ISD::BSWAP, ISD::INTRINSIC_WO_CHAIN,
1471 ISD::INTRINSIC_W_CHAIN, ISD::INTRINSIC_VOID});
1472
1473 setTargetDAGCombine({ISD::SIGN_EXTEND, ISD::ZERO_EXTEND, ISD::ANY_EXTEND});
1474
1475 setTargetDAGCombine({ISD::TRUNCATE, ISD::VECTOR_SHUFFLE});
1476
1477 if (Subtarget.useCRBits()) {
1478 setTargetDAGCombine({ISD::TRUNCATE, ISD::SETCC, ISD::SELECT_CC});
1479 }
1480
1481 if (Subtarget.hasP8Vector())
1482 setTargetDAGCombine(ISD::BITCAST);
1483
1484 // With 32 condition bits, we don't need to sink (and duplicate) compares
1485 // aggressively in CodeGenPrep.
1486 if (Subtarget.useCRBits()) {
1487 setJumpIsExpensive();
1488 }
1489
1490 // TODO: The default entry number is set to 64. This stops most jump table
1491 // generation on PPC. But it is good for current PPC HWs because the indirect
1492 // branch instruction mtctr to the jump table may lead to bad branch predict.
1493 // Re-evaluate this value on future HWs that can do better with mtctr.
1494 setMinimumJumpTableEntries(PPCMinimumJumpTableEntries);
1495
1496 // The default minimum of largest number in a BitTest cluster is 3.
1497 setMinimumBitTestCmps(PPCMinimumBitTestCmps);
1498
1499 setMinFunctionAlignment(Align(4));
1500 setMinCmpXchgSizeInBits(Subtarget.hasPartwordAtomics() ? 8 : 32);
1501
1502 auto CPUDirective = Subtarget.getCPUDirective();
1503 switch (CPUDirective) {
1504 default: break;
1505 case PPC::DIR_970:
1506 case PPC::DIR_A2:
1507 case PPC::DIR_E500:
1508 case PPC::DIR_E500mc:
1509 case PPC::DIR_E5500:
1510 case PPC::DIR_PWR4:
1511 case PPC::DIR_PWR5:
1512 case PPC::DIR_PWR5X:
1513 case PPC::DIR_PWR6:
1514 case PPC::DIR_PWR6X:
1515 case PPC::DIR_PWR7:
1516 case PPC::DIR_PWR8:
1517 case PPC::DIR_PWR9:
1518 case PPC::DIR_PWR10:
1519 case PPC::DIR_PWR11:
1520 case PPC::DIR_PWR_FUTURE:
1521 setPrefLoopAlignment(Align(16));
1522 setPrefFunctionAlignment(Align(16));
1523 break;
1524 }
1525
1526 if (Subtarget.enableMachineScheduler())
1527 setSchedulingPreference(Sched::Source);
1528 else
1529 setSchedulingPreference(Sched::Hybrid);
1530
1531 computeRegisterProperties(TRI: STI.getRegisterInfo());
1532
1533 // The Freescale cores do better with aggressive inlining of memcpy and
1534 // friends. GCC uses same threshold of 128 bytes (= 32 word stores).
1535 if (CPUDirective == PPC::DIR_E500mc || CPUDirective == PPC::DIR_E5500) {
1536 MaxStoresPerMemset = 32;
1537 MaxStoresPerMemsetOptSize = 16;
1538 MaxStoresPerMemcpy = 32;
1539 MaxStoresPerMemcpyOptSize = 8;
1540 MaxStoresPerMemmove = 32;
1541 MaxStoresPerMemmoveOptSize = 8;
1542 } else if (CPUDirective == PPC::DIR_A2) {
1543 // The A2 also benefits from (very) aggressive inlining of memcpy and
    // friends. The overhead of the function call, even when warm, can be
1545 // over one hundred cycles.
1546 MaxStoresPerMemset = 128;
1547 MaxStoresPerMemcpy = 128;
1548 MaxStoresPerMemmove = 128;
1549 MaxLoadsPerMemcmp = 128;
1550 } else {
1551 MaxLoadsPerMemcmp = 8;
1552 MaxLoadsPerMemcmpOptSize = 4;
1553 }
1554
1555 // Enable generation of STXVP instructions by default for mcpu=future.
1556 if (CPUDirective == PPC::DIR_PWR_FUTURE &&
1557 DisableAutoPairedVecSt.getNumOccurrences() == 0)
1558 DisableAutoPairedVecSt = false;
1559
1560 IsStrictFPEnabled = true;
1561
1562 // Let the subtarget (CPU) decide if a predictable select is more expensive
1563 // than the corresponding branch. This information is used in CGP to decide
1564 // when to convert selects into branches.
1565 PredictableSelectIsExpensive = Subtarget.isPredictableSelectIsExpensive();
1566
1567 GatherAllAliasesMaxDepth = PPCGatherAllAliasesMaxDepth;
1568}
1569
1570// *********************************** NOTE ************************************
1571// For selecting load and store instructions, the addressing modes are defined
1572// as ComplexPatterns in PPCInstrInfo.td, which are then utilized in the TD
// patterns to match the load and store instructions.
1574//
1575// The TD definitions for the addressing modes correspond to their respective
1576// Select<AddrMode>Form() function in PPCISelDAGToDAG.cpp. These functions rely
1577// on SelectOptimalAddrMode(), which calls computeMOFlags() to compute the
1578// address mode flags of a particular node. Afterwards, the computed address
1579// flags are passed into getAddrModeForFlags() in order to retrieve the optimal
1580// addressing mode. SelectOptimalAddrMode() then sets the Base and Displacement
1581// accordingly, based on the preferred addressing mode.
1582//
1583// Within PPCISelLowering.h, there are two enums: MemOpFlags and AddrMode.
1584// MemOpFlags contains all the possible flags that can be used to compute the
1585// optimal addressing mode for load and store instructions.
1586// AddrMode contains all the possible load and store addressing modes available
1587// on Power (such as DForm, DSForm, DQForm, XForm, etc.)
1588//
1589// When adding new load and store instructions, it is possible that new address
1590// flags may need to be added into MemOpFlags, and a new addressing mode will
1591// need to be added to AddrMode. An entry of the new addressing mode (consisting
1592// of the minimal and main distinguishing address flags for the new load/store
1593// instructions) will need to be added into initializeAddrModeMap() below.
1594// Finally, when adding new addressing modes, the getAddrModeForFlags() will
1595// need to be updated to account for selecting the optimal addressing mode.
1596// *****************************************************************************
1597/// Initialize the map that relates the different addressing modes of the load
1598/// and store instructions to a set of flags. This ensures the load/store
1599/// instruction is correctly matched during instruction selection.
1600void PPCTargetLowering::initializeAddrModeMap() {
1601 AddrModesMap[PPC::AM_DForm] = {
1602 // LWZ, STW
1603 PPC::MOF_ZExt | PPC::MOF_RPlusSImm16 | PPC::MOF_WordInt,
1604 PPC::MOF_ZExt | PPC::MOF_RPlusLo | PPC::MOF_WordInt,
1605 PPC::MOF_ZExt | PPC::MOF_NotAddNorCst | PPC::MOF_WordInt,
1606 PPC::MOF_ZExt | PPC::MOF_AddrIsSImm32 | PPC::MOF_WordInt,
1607 // LBZ, LHZ, STB, STH
1608 PPC::MOF_ZExt | PPC::MOF_RPlusSImm16 | PPC::MOF_SubWordInt,
1609 PPC::MOF_ZExt | PPC::MOF_RPlusLo | PPC::MOF_SubWordInt,
1610 PPC::MOF_ZExt | PPC::MOF_NotAddNorCst | PPC::MOF_SubWordInt,
1611 PPC::MOF_ZExt | PPC::MOF_AddrIsSImm32 | PPC::MOF_SubWordInt,
1612 // LHA
1613 PPC::MOF_SExt | PPC::MOF_RPlusSImm16 | PPC::MOF_SubWordInt,
1614 PPC::MOF_SExt | PPC::MOF_RPlusLo | PPC::MOF_SubWordInt,
1615 PPC::MOF_SExt | PPC::MOF_NotAddNorCst | PPC::MOF_SubWordInt,
1616 PPC::MOF_SExt | PPC::MOF_AddrIsSImm32 | PPC::MOF_SubWordInt,
1617 // LFS, LFD, STFS, STFD
1618 PPC::MOF_RPlusSImm16 | PPC::MOF_ScalarFloat | PPC::MOF_SubtargetBeforeP9,
1619 PPC::MOF_RPlusLo | PPC::MOF_ScalarFloat | PPC::MOF_SubtargetBeforeP9,
1620 PPC::MOF_NotAddNorCst | PPC::MOF_ScalarFloat | PPC::MOF_SubtargetBeforeP9,
1621 PPC::MOF_AddrIsSImm32 | PPC::MOF_ScalarFloat | PPC::MOF_SubtargetBeforeP9,
1622 };
1623 AddrModesMap[PPC::AM_DSForm] = {
1624 // LWA
1625 PPC::MOF_SExt | PPC::MOF_RPlusSImm16Mult4 | PPC::MOF_WordInt,
1626 PPC::MOF_SExt | PPC::MOF_NotAddNorCst | PPC::MOF_WordInt,
1627 PPC::MOF_SExt | PPC::MOF_AddrIsSImm32 | PPC::MOF_WordInt,
1628 // LD, STD
1629 PPC::MOF_RPlusSImm16Mult4 | PPC::MOF_DoubleWordInt,
1630 PPC::MOF_NotAddNorCst | PPC::MOF_DoubleWordInt,
1631 PPC::MOF_AddrIsSImm32 | PPC::MOF_DoubleWordInt,
1632 // DFLOADf32, DFLOADf64, DSTOREf32, DSTOREf64
1633 PPC::MOF_RPlusSImm16Mult4 | PPC::MOF_ScalarFloat | PPC::MOF_SubtargetP9,
1634 PPC::MOF_NotAddNorCst | PPC::MOF_ScalarFloat | PPC::MOF_SubtargetP9,
1635 PPC::MOF_AddrIsSImm32 | PPC::MOF_ScalarFloat | PPC::MOF_SubtargetP9,
1636 };
1637 AddrModesMap[PPC::AM_DQForm] = {
1638 // LXV, STXV
1639 PPC::MOF_RPlusSImm16Mult16 | PPC::MOF_Vector | PPC::MOF_SubtargetP9,
1640 PPC::MOF_NotAddNorCst | PPC::MOF_Vector | PPC::MOF_SubtargetP9,
1641 PPC::MOF_AddrIsSImm32 | PPC::MOF_Vector | PPC::MOF_SubtargetP9,
1642 };
1643 AddrModesMap[PPC::AM_PrefixDForm] = {PPC::MOF_RPlusSImm34 |
1644 PPC::MOF_SubtargetP10};
1645 // TODO: Add mapping for quadword load/store.
1646}
1647
1648/// getMaxByValAlign - Helper for getByValTypeAlignment to determine
1649/// the desired ByVal argument alignment.
1650static void getMaxByValAlign(Type *Ty, Align &MaxAlign, Align MaxMaxAlign) {
1651 if (MaxAlign == MaxMaxAlign)
1652 return;
1653 if (VectorType *VTy = dyn_cast<VectorType>(Val: Ty)) {
1654 if (MaxMaxAlign >= 32 &&
1655 VTy->getPrimitiveSizeInBits().getFixedValue() >= 256)
1656 MaxAlign = Align(32);
1657 else if (VTy->getPrimitiveSizeInBits().getFixedValue() >= 128 &&
1658 MaxAlign < 16)
1659 MaxAlign = Align(16);
1660 } else if (ArrayType *ATy = dyn_cast<ArrayType>(Val: Ty)) {
1661 Align EltAlign;
1662 getMaxByValAlign(Ty: ATy->getElementType(), MaxAlign&: EltAlign, MaxMaxAlign);
1663 if (EltAlign > MaxAlign)
1664 MaxAlign = EltAlign;
1665 } else if (StructType *STy = dyn_cast<StructType>(Val: Ty)) {
1666 for (auto *EltTy : STy->elements()) {
1667 Align EltAlign;
1668 getMaxByValAlign(Ty: EltTy, MaxAlign&: EltAlign, MaxMaxAlign);
1669 if (EltAlign > MaxAlign)
1670 MaxAlign = EltAlign;
1671 if (MaxAlign == MaxMaxAlign)
1672 break;
1673 }
1674 }
1675}
1676
1677/// getByValTypeAlignment - Return the desired alignment for ByVal aggregate
1678/// function arguments in the caller parameter area.
1679Align PPCTargetLowering::getByValTypeAlignment(Type *Ty,
1680 const DataLayout &DL) const {
1681 // 16byte and wider vectors are passed on 16byte boundary.
1682 // The rest is 8 on PPC64 and 4 on PPC32 boundary.
1683 Align Alignment = Subtarget.isPPC64() ? Align(8) : Align(4);
1684 if (Subtarget.hasAltivec())
1685 getMaxByValAlign(Ty, MaxAlign&: Alignment, MaxMaxAlign: Align(16));
1686 return Alignment;
1687}
1688
1689bool PPCTargetLowering::useSoftFloat() const {
1690 return Subtarget.useSoftFloat();
1691}
1692
1693bool PPCTargetLowering::hasSPE() const {
1694 return Subtarget.hasSPE();
1695}
1696
1697bool PPCTargetLowering::preferIncOfAddToSubOfNot(EVT VT) const {
1698 return VT.isScalarInteger();
1699}
1700
1701bool PPCTargetLowering::shallExtractConstSplatVectorElementToStore(
1702 Type *VectorTy, unsigned ElemSizeInBits, unsigned &Index) const {
1703 if (!Subtarget.isPPC64() || !Subtarget.hasVSX())
1704 return false;
1705
1706 if (auto *VTy = dyn_cast<VectorType>(Val: VectorTy)) {
1707 if (VTy->getScalarType()->isIntegerTy()) {
1708 // ElemSizeInBits 8/16 can fit in immediate field, not needed here.
1709 if (ElemSizeInBits == 32) {
1710 Index = Subtarget.isLittleEndian() ? 2 : 1;
1711 return true;
1712 }
1713 if (ElemSizeInBits == 64) {
1714 Index = Subtarget.isLittleEndian() ? 1 : 0;
1715 return true;
1716 }
1717 }
1718 }
1719 return false;
1720}
1721
1722EVT PPCTargetLowering::getSetCCResultType(const DataLayout &DL, LLVMContext &C,
1723 EVT VT) const {
1724 if (!VT.isVector())
1725 return Subtarget.useCRBits() ? MVT::i1 : MVT::i32;
1726
1727 return VT.changeVectorElementTypeToInteger();
1728}
1729
1730bool PPCTargetLowering::enableAggressiveFMAFusion(EVT VT) const {
1731 assert(VT.isFloatingPoint() && "Non-floating-point FMA?");
1732 return true;
1733}
1734
1735//===----------------------------------------------------------------------===//
1736// Node matching predicates, for use by the tblgen matching code.
1737//===----------------------------------------------------------------------===//
1738
1739/// isFloatingPointZero - Return true if this is 0.0 or -0.0.
1740static bool isFloatingPointZero(SDValue Op) {
1741 if (ConstantFPSDNode *CFP = dyn_cast<ConstantFPSDNode>(Val&: Op))
1742 return CFP->getValueAPF().isZero();
1743 else if (ISD::isEXTLoad(N: Op.getNode()) || ISD::isNON_EXTLoad(N: Op.getNode())) {
1744 // Maybe this has already been legalized into the constant pool?
1745 if (ConstantPoolSDNode *CP = dyn_cast<ConstantPoolSDNode>(Val: Op.getOperand(i: 1)))
1746 if (const ConstantFP *CFP = dyn_cast<ConstantFP>(Val: CP->getConstVal()))
1747 return CFP->getValueAPF().isZero();
1748 }
1749 return false;
1750}
1751
/// isConstantOrUndef - \p Op is a shuffle-mask element, where any negative
/// value stands for undef. Return true if \p Op is undef or if it equals
/// \p Val.
static bool isConstantOrUndef(int Op, int Val) {
  if (Op < 0)
    return true;
  return Op == Val;
}
1757
1758/// isVPKUHUMShuffleMask - Return true if this is the shuffle mask for a
1759/// VPKUHUM instruction.
1760/// The ShuffleKind distinguishes between big-endian operations with
1761/// two different inputs (0), either-endian operations with two identical
1762/// inputs (1), and little-endian operations with two different inputs (2).
1763/// For the latter, the input operands are swapped (see PPCInstrAltivec.td).
1764bool PPC::isVPKUHUMShuffleMask(ShuffleVectorSDNode *N, unsigned ShuffleKind,
1765 SelectionDAG &DAG) {
1766 bool IsLE = DAG.getDataLayout().isLittleEndian();
1767 if (ShuffleKind == 0) {
1768 if (IsLE)
1769 return false;
1770 for (unsigned i = 0; i != 16; ++i)
1771 if (!isConstantOrUndef(Op: N->getMaskElt(Idx: i), Val: i*2+1))
1772 return false;
1773 } else if (ShuffleKind == 2) {
1774 if (!IsLE)
1775 return false;
1776 for (unsigned i = 0; i != 16; ++i)
1777 if (!isConstantOrUndef(Op: N->getMaskElt(Idx: i), Val: i*2))
1778 return false;
1779 } else if (ShuffleKind == 1) {
1780 unsigned j = IsLE ? 0 : 1;
1781 for (unsigned i = 0; i != 8; ++i)
1782 if (!isConstantOrUndef(Op: N->getMaskElt(Idx: i), Val: i*2+j) ||
1783 !isConstantOrUndef(Op: N->getMaskElt(Idx: i+8), Val: i*2+j))
1784 return false;
1785 }
1786 return true;
1787}
1788
1789/// isVPKUWUMShuffleMask - Return true if this is the shuffle mask for a
1790/// VPKUWUM instruction.
1791/// The ShuffleKind distinguishes between big-endian operations with
1792/// two different inputs (0), either-endian operations with two identical
1793/// inputs (1), and little-endian operations with two different inputs (2).
1794/// For the latter, the input operands are swapped (see PPCInstrAltivec.td).
1795bool PPC::isVPKUWUMShuffleMask(ShuffleVectorSDNode *N, unsigned ShuffleKind,
1796 SelectionDAG &DAG) {
1797 bool IsLE = DAG.getDataLayout().isLittleEndian();
1798 if (ShuffleKind == 0) {
1799 if (IsLE)
1800 return false;
1801 for (unsigned i = 0; i != 16; i += 2)
1802 if (!isConstantOrUndef(Op: N->getMaskElt(Idx: i ), Val: i*2+2) ||
1803 !isConstantOrUndef(Op: N->getMaskElt(Idx: i+1), Val: i*2+3))
1804 return false;
1805 } else if (ShuffleKind == 2) {
1806 if (!IsLE)
1807 return false;
1808 for (unsigned i = 0; i != 16; i += 2)
1809 if (!isConstantOrUndef(Op: N->getMaskElt(Idx: i ), Val: i*2) ||
1810 !isConstantOrUndef(Op: N->getMaskElt(Idx: i+1), Val: i*2+1))
1811 return false;
1812 } else if (ShuffleKind == 1) {
1813 unsigned j = IsLE ? 0 : 2;
1814 for (unsigned i = 0; i != 8; i += 2)
1815 if (!isConstantOrUndef(Op: N->getMaskElt(Idx: i ), Val: i*2+j) ||
1816 !isConstantOrUndef(Op: N->getMaskElt(Idx: i+1), Val: i*2+j+1) ||
1817 !isConstantOrUndef(Op: N->getMaskElt(Idx: i+8), Val: i*2+j) ||
1818 !isConstantOrUndef(Op: N->getMaskElt(Idx: i+9), Val: i*2+j+1))
1819 return false;
1820 }
1821 return true;
1822}
1823
1824/// isVPKUDUMShuffleMask - Return true if this is the shuffle mask for a
1825/// VPKUDUM instruction, AND the VPKUDUM instruction exists for the
1826/// current subtarget.
1827///
1828/// The ShuffleKind distinguishes between big-endian operations with
1829/// two different inputs (0), either-endian operations with two identical
1830/// inputs (1), and little-endian operations with two different inputs (2).
1831/// For the latter, the input operands are swapped (see PPCInstrAltivec.td).
1832bool PPC::isVPKUDUMShuffleMask(ShuffleVectorSDNode *N, unsigned ShuffleKind,
1833 SelectionDAG &DAG) {
1834 const PPCSubtarget &Subtarget = DAG.getSubtarget<PPCSubtarget>();
1835 if (!Subtarget.hasP8Vector())
1836 return false;
1837
1838 bool IsLE = DAG.getDataLayout().isLittleEndian();
1839 if (ShuffleKind == 0) {
1840 if (IsLE)
1841 return false;
1842 for (unsigned i = 0; i != 16; i += 4)
1843 if (!isConstantOrUndef(Op: N->getMaskElt(Idx: i ), Val: i*2+4) ||
1844 !isConstantOrUndef(Op: N->getMaskElt(Idx: i+1), Val: i*2+5) ||
1845 !isConstantOrUndef(Op: N->getMaskElt(Idx: i+2), Val: i*2+6) ||
1846 !isConstantOrUndef(Op: N->getMaskElt(Idx: i+3), Val: i*2+7))
1847 return false;
1848 } else if (ShuffleKind == 2) {
1849 if (!IsLE)
1850 return false;
1851 for (unsigned i = 0; i != 16; i += 4)
1852 if (!isConstantOrUndef(Op: N->getMaskElt(Idx: i ), Val: i*2) ||
1853 !isConstantOrUndef(Op: N->getMaskElt(Idx: i+1), Val: i*2+1) ||
1854 !isConstantOrUndef(Op: N->getMaskElt(Idx: i+2), Val: i*2+2) ||
1855 !isConstantOrUndef(Op: N->getMaskElt(Idx: i+3), Val: i*2+3))
1856 return false;
1857 } else if (ShuffleKind == 1) {
1858 unsigned j = IsLE ? 0 : 4;
1859 for (unsigned i = 0; i != 8; i += 4)
1860 if (!isConstantOrUndef(Op: N->getMaskElt(Idx: i ), Val: i*2+j) ||
1861 !isConstantOrUndef(Op: N->getMaskElt(Idx: i+1), Val: i*2+j+1) ||
1862 !isConstantOrUndef(Op: N->getMaskElt(Idx: i+2), Val: i*2+j+2) ||
1863 !isConstantOrUndef(Op: N->getMaskElt(Idx: i+3), Val: i*2+j+3) ||
1864 !isConstantOrUndef(Op: N->getMaskElt(Idx: i+8), Val: i*2+j) ||
1865 !isConstantOrUndef(Op: N->getMaskElt(Idx: i+9), Val: i*2+j+1) ||
1866 !isConstantOrUndef(Op: N->getMaskElt(Idx: i+10), Val: i*2+j+2) ||
1867 !isConstantOrUndef(Op: N->getMaskElt(Idx: i+11), Val: i*2+j+3))
1868 return false;
1869 }
1870 return true;
1871}
1872
1873/// isVMerge - Common function, used to match vmrg* shuffles.
1874///
1875static bool isVMerge(ShuffleVectorSDNode *N, unsigned UnitSize,
1876 unsigned LHSStart, unsigned RHSStart) {
1877 if (N->getValueType(ResNo: 0) != MVT::v16i8)
1878 return false;
1879 assert((UnitSize == 1 || UnitSize == 2 || UnitSize == 4) &&
1880 "Unsupported merge size!");
1881
1882 for (unsigned i = 0; i != 8/UnitSize; ++i) // Step over units
1883 for (unsigned j = 0; j != UnitSize; ++j) { // Step over bytes within unit
1884 if (!isConstantOrUndef(Op: N->getMaskElt(Idx: i*UnitSize*2+j),
1885 Val: LHSStart+j+i*UnitSize) ||
1886 !isConstantOrUndef(Op: N->getMaskElt(Idx: i*UnitSize*2+UnitSize+j),
1887 Val: RHSStart+j+i*UnitSize))
1888 return false;
1889 }
1890 return true;
1891}
1892
1893/// isVMRGLShuffleMask - Return true if this is a shuffle mask suitable for
1894/// a VMRGL* instruction with the specified unit size (1,2 or 4 bytes).
1895/// The ShuffleKind distinguishes between big-endian merges with two
1896/// different inputs (0), either-endian merges with two identical inputs (1),
1897/// and little-endian merges with two different inputs (2). For the latter,
1898/// the input operands are swapped (see PPCInstrAltivec.td).
1899bool PPC::isVMRGLShuffleMask(ShuffleVectorSDNode *N, unsigned UnitSize,
1900 unsigned ShuffleKind, SelectionDAG &DAG) {
1901 if (DAG.getDataLayout().isLittleEndian()) {
1902 if (ShuffleKind == 1) // unary
1903 return isVMerge(N, UnitSize, LHSStart: 0, RHSStart: 0);
1904 else if (ShuffleKind == 2) // swapped
1905 return isVMerge(N, UnitSize, LHSStart: 0, RHSStart: 16);
1906 else
1907 return false;
1908 } else {
1909 if (ShuffleKind == 1) // unary
1910 return isVMerge(N, UnitSize, LHSStart: 8, RHSStart: 8);
1911 else if (ShuffleKind == 0) // normal
1912 return isVMerge(N, UnitSize, LHSStart: 8, RHSStart: 24);
1913 else
1914 return false;
1915 }
1916}
1917
1918/// isVMRGHShuffleMask - Return true if this is a shuffle mask suitable for
1919/// a VMRGH* instruction with the specified unit size (1,2 or 4 bytes).
1920/// The ShuffleKind distinguishes between big-endian merges with two
1921/// different inputs (0), either-endian merges with two identical inputs (1),
1922/// and little-endian merges with two different inputs (2). For the latter,
1923/// the input operands are swapped (see PPCInstrAltivec.td).
1924bool PPC::isVMRGHShuffleMask(ShuffleVectorSDNode *N, unsigned UnitSize,
1925 unsigned ShuffleKind, SelectionDAG &DAG) {
1926 if (DAG.getDataLayout().isLittleEndian()) {
1927 if (ShuffleKind == 1) // unary
1928 return isVMerge(N, UnitSize, LHSStart: 8, RHSStart: 8);
1929 else if (ShuffleKind == 2) // swapped
1930 return isVMerge(N, UnitSize, LHSStart: 8, RHSStart: 24);
1931 else
1932 return false;
1933 } else {
1934 if (ShuffleKind == 1) // unary
1935 return isVMerge(N, UnitSize, LHSStart: 0, RHSStart: 0);
1936 else if (ShuffleKind == 0) // normal
1937 return isVMerge(N, UnitSize, LHSStart: 0, RHSStart: 16);
1938 else
1939 return false;
1940 }
1941}
1942
1943/**
1944 * Common function used to match vmrgew and vmrgow shuffles
1945 *
1946 * The indexOffset determines whether to look for even or odd words in
1947 * the shuffle mask. This is based on the of the endianness of the target
1948 * machine.
1949 * - Little Endian:
1950 * - Use offset of 0 to check for odd elements
1951 * - Use offset of 4 to check for even elements
1952 * - Big Endian:
1953 * - Use offset of 0 to check for even elements
1954 * - Use offset of 4 to check for odd elements
1955 * A detailed description of the vector element ordering for little endian and
1956 * big endian can be found at
1957 * http://www.ibm.com/developerworks/library/l-ibm-xl-c-cpp-compiler/index.html
1958 * Targeting your applications - what little endian and big endian IBM XL C/C++
1959 * compiler differences mean to you
1960 *
1961 * The mask to the shuffle vector instruction specifies the indices of the
1962 * elements from the two input vectors to place in the result. The elements are
1963 * numbered in array-access order, starting with the first vector. These vectors
1964 * are always of type v16i8, thus each vector will contain 16 elements of size
1965 * 8. More info on the shuffle vector can be found in the
1966 * http://llvm.org/docs/LangRef.html#shufflevector-instruction
1967 * Language Reference.
1968 *
1969 * The RHSStartValue indicates whether the same input vectors are used (unary)
1970 * or two different input vectors are used, based on the following:
1971 * - If the instruction uses the same vector for both inputs, the range of the
1972 * indices will be 0 to 15. In this case, the RHSStart value passed should
1973 * be 0.
1974 * - If the instruction has two different vectors then the range of the
1975 * indices will be 0 to 31. In this case, the RHSStart value passed should
1976 * be 16 (indices 0-15 specify elements in the first vector while indices 16
1977 * to 31 specify elements in the second vector).
1978 *
1979 * \param[in] N The shuffle vector SD Node to analyze
1980 * \param[in] IndexOffset Specifies whether to look for even or odd elements
1981 * \param[in] RHSStartValue Specifies the starting index for the righthand input
1982 * vector to the shuffle_vector instruction
1983 * \return true iff this shuffle vector represents an even or odd word merge
1984 */
1985static bool isVMerge(ShuffleVectorSDNode *N, unsigned IndexOffset,
1986 unsigned RHSStartValue) {
1987 if (N->getValueType(ResNo: 0) != MVT::v16i8)
1988 return false;
1989
1990 for (unsigned i = 0; i < 2; ++i)
1991 for (unsigned j = 0; j < 4; ++j)
1992 if (!isConstantOrUndef(Op: N->getMaskElt(Idx: i*4+j),
1993 Val: i*RHSStartValue+j+IndexOffset) ||
1994 !isConstantOrUndef(Op: N->getMaskElt(Idx: i*4+j+8),
1995 Val: i*RHSStartValue+j+IndexOffset+8))
1996 return false;
1997 return true;
1998}
1999
2000/**
2001 * Determine if the specified shuffle mask is suitable for the vmrgew or
2002 * vmrgow instructions.
2003 *
2004 * \param[in] N The shuffle vector SD Node to analyze
2005 * \param[in] CheckEven Check for an even merge (true) or an odd merge (false)
2006 * \param[in] ShuffleKind Identify the type of merge:
2007 * - 0 = big-endian merge with two different inputs;
2008 * - 1 = either-endian merge with two identical inputs;
2009 * - 2 = little-endian merge with two different inputs (inputs are swapped for
2010 * little-endian merges).
2011 * \param[in] DAG The current SelectionDAG
2012 * \return true iff this shuffle mask
2013 */
2014bool PPC::isVMRGEOShuffleMask(ShuffleVectorSDNode *N, bool CheckEven,
2015 unsigned ShuffleKind, SelectionDAG &DAG) {
2016 if (DAG.getDataLayout().isLittleEndian()) {
2017 unsigned indexOffset = CheckEven ? 4 : 0;
2018 if (ShuffleKind == 1) // Unary
2019 return isVMerge(N, IndexOffset: indexOffset, RHSStartValue: 0);
2020 else if (ShuffleKind == 2) // swapped
2021 return isVMerge(N, IndexOffset: indexOffset, RHSStartValue: 16);
2022 else
2023 return false;
2024 }
2025 else {
2026 unsigned indexOffset = CheckEven ? 0 : 4;
2027 if (ShuffleKind == 1) // Unary
2028 return isVMerge(N, IndexOffset: indexOffset, RHSStartValue: 0);
2029 else if (ShuffleKind == 0) // Normal
2030 return isVMerge(N, IndexOffset: indexOffset, RHSStartValue: 16);
2031 else
2032 return false;
2033 }
2034 return false;
2035}
2036
2037/// isVSLDOIShuffleMask - If this is a vsldoi shuffle mask, return the shift
2038/// amount, otherwise return -1.
2039/// The ShuffleKind distinguishes between big-endian operations with two
2040/// different inputs (0), either-endian operations with two identical inputs
2041/// (1), and little-endian operations with two different inputs (2). For the
2042/// latter, the input operands are swapped (see PPCInstrAltivec.td).
2043int PPC::isVSLDOIShuffleMask(SDNode *N, unsigned ShuffleKind,
2044 SelectionDAG &DAG) {
2045 if (N->getValueType(ResNo: 0) != MVT::v16i8)
2046 return -1;
2047
2048 ShuffleVectorSDNode *SVOp = cast<ShuffleVectorSDNode>(Val: N);
2049
2050 // Find the first non-undef value in the shuffle mask.
2051 unsigned i;
2052 for (i = 0; i != 16 && SVOp->getMaskElt(Idx: i) < 0; ++i)
2053 /*search*/;
2054
2055 if (i == 16) return -1; // all undef.
2056
2057 // Otherwise, check to see if the rest of the elements are consecutively
2058 // numbered from this value.
2059 unsigned ShiftAmt = SVOp->getMaskElt(Idx: i);
2060 if (ShiftAmt < i) return -1;
2061
2062 ShiftAmt -= i;
2063 bool isLE = DAG.getDataLayout().isLittleEndian();
2064
2065 if ((ShuffleKind == 0 && !isLE) || (ShuffleKind == 2 && isLE)) {
2066 // Check the rest of the elements to see if they are consecutive.
2067 for (++i; i != 16; ++i)
2068 if (!isConstantOrUndef(Op: SVOp->getMaskElt(Idx: i), Val: ShiftAmt+i))
2069 return -1;
2070 } else if (ShuffleKind == 1) {
2071 // Check the rest of the elements to see if they are consecutive.
2072 for (++i; i != 16; ++i)
2073 if (!isConstantOrUndef(Op: SVOp->getMaskElt(Idx: i), Val: (ShiftAmt+i) & 15))
2074 return -1;
2075 } else
2076 return -1;
2077
2078 if (isLE)
2079 ShiftAmt = 16 - ShiftAmt;
2080
2081 return ShiftAmt;
2082}
2083
2084/// isSplatShuffleMask - Return true if the specified VECTOR_SHUFFLE operand
2085/// specifies a splat of a single element that is suitable for input to
2086/// one of the splat operations (VSPLTB/VSPLTH/VSPLTW/XXSPLTW/LXVDSX/etc.).
2087bool PPC::isSplatShuffleMask(ShuffleVectorSDNode *N, unsigned EltSize) {
2088 EVT VT = N->getValueType(ResNo: 0);
2089 if (VT == MVT::v2i64 || VT == MVT::v2f64)
2090 return EltSize == 8 && N->getMaskElt(Idx: 0) == N->getMaskElt(Idx: 1);
2091
2092 assert(VT == MVT::v16i8 && isPowerOf2_32(EltSize) &&
2093 EltSize <= 8 && "Can only handle 1,2,4,8 byte element sizes");
2094
2095 // The consecutive indices need to specify an element, not part of two
2096 // different elements. So abandon ship early if this isn't the case.
2097 if (N->getMaskElt(Idx: 0) % EltSize != 0)
2098 return false;
2099
2100 // This is a splat operation if each element of the permute is the same, and
2101 // if the value doesn't reference the second vector.
2102 unsigned ElementBase = N->getMaskElt(Idx: 0);
2103
2104 // FIXME: Handle UNDEF elements too!
2105 if (ElementBase >= 16)
2106 return false;
2107
2108 // Check that the indices are consecutive, in the case of a multi-byte element
2109 // splatted with a v16i8 mask.
2110 for (unsigned i = 1; i != EltSize; ++i)
2111 if (N->getMaskElt(Idx: i) < 0 || N->getMaskElt(Idx: i) != (int)(i+ElementBase))
2112 return false;
2113
2114 for (unsigned i = EltSize, e = 16; i != e; i += EltSize) {
2115 // An UNDEF element is a sequence of UNDEF bytes.
2116 if (N->getMaskElt(Idx: i) < 0) {
2117 for (unsigned j = 1; j != EltSize; ++j)
2118 if (N->getMaskElt(Idx: i + j) >= 0)
2119 return false;
2120 } else
2121 for (unsigned j = 0; j != EltSize; ++j)
2122 if (N->getMaskElt(Idx: i + j) != N->getMaskElt(Idx: j))
2123 return false;
2124 }
2125 return true;
2126}
2127
2128/// Check that the mask is shuffling N byte elements. Within each N byte
2129/// element of the mask, the indices could be either in increasing or
2130/// decreasing order as long as they are consecutive.
2131/// \param[in] N the shuffle vector SD Node to analyze
2132/// \param[in] Width the element width in bytes, could be 2/4/8/16 (HalfWord/
2133/// Word/DoubleWord/QuadWord).
2134/// \param[in] StepLen the delta indices number among the N byte element, if
2135/// the mask is in increasing/decreasing order then it is 1/-1.
2136/// \return true iff the mask is shuffling N byte elements.
2137static bool isNByteElemShuffleMask(ShuffleVectorSDNode *N, unsigned Width,
2138 int StepLen) {
2139 assert((Width == 2 || Width == 4 || Width == 8 || Width == 16) &&
2140 "Unexpected element width.");
2141 assert((StepLen == 1 || StepLen == -1) && "Unexpected element width.");
2142
2143 unsigned NumOfElem = 16 / Width;
2144 unsigned MaskVal[16]; // Width is never greater than 16
2145 for (unsigned i = 0; i < NumOfElem; ++i) {
2146 MaskVal[0] = N->getMaskElt(Idx: i * Width);
2147 if ((StepLen == 1) && (MaskVal[0] % Width)) {
2148 return false;
2149 } else if ((StepLen == -1) && ((MaskVal[0] + 1) % Width)) {
2150 return false;
2151 }
2152
2153 for (unsigned int j = 1; j < Width; ++j) {
2154 MaskVal[j] = N->getMaskElt(Idx: i * Width + j);
2155 if (MaskVal[j] != MaskVal[j-1] + StepLen) {
2156 return false;
2157 }
2158 }
2159 }
2160
2161 return true;
2162}
2163
2164bool PPC::isXXINSERTWMask(ShuffleVectorSDNode *N, unsigned &ShiftElts,
2165 unsigned &InsertAtByte, bool &Swap, bool IsLE) {
2166 if (!isNByteElemShuffleMask(N, Width: 4, StepLen: 1))
2167 return false;
2168
2169 // Now we look at mask elements 0,4,8,12
2170 unsigned M0 = N->getMaskElt(Idx: 0) / 4;
2171 unsigned M1 = N->getMaskElt(Idx: 4) / 4;
2172 unsigned M2 = N->getMaskElt(Idx: 8) / 4;
2173 unsigned M3 = N->getMaskElt(Idx: 12) / 4;
2174 unsigned LittleEndianShifts[] = { 2, 1, 0, 3 };
2175 unsigned BigEndianShifts[] = { 3, 0, 1, 2 };
2176
2177 // Below, let H and L be arbitrary elements of the shuffle mask
2178 // where H is in the range [4,7] and L is in the range [0,3].
2179 // H, 1, 2, 3 or L, 5, 6, 7
2180 if ((M0 > 3 && M1 == 1 && M2 == 2 && M3 == 3) ||
2181 (M0 < 4 && M1 == 5 && M2 == 6 && M3 == 7)) {
2182 ShiftElts = IsLE ? LittleEndianShifts[M0 & 0x3] : BigEndianShifts[M0 & 0x3];
2183 InsertAtByte = IsLE ? 12 : 0;
2184 Swap = M0 < 4;
2185 return true;
2186 }
2187 // 0, H, 2, 3 or 4, L, 6, 7
2188 if ((M1 > 3 && M0 == 0 && M2 == 2 && M3 == 3) ||
2189 (M1 < 4 && M0 == 4 && M2 == 6 && M3 == 7)) {
2190 ShiftElts = IsLE ? LittleEndianShifts[M1 & 0x3] : BigEndianShifts[M1 & 0x3];
2191 InsertAtByte = IsLE ? 8 : 4;
2192 Swap = M1 < 4;
2193 return true;
2194 }
2195 // 0, 1, H, 3 or 4, 5, L, 7
2196 if ((M2 > 3 && M0 == 0 && M1 == 1 && M3 == 3) ||
2197 (M2 < 4 && M0 == 4 && M1 == 5 && M3 == 7)) {
2198 ShiftElts = IsLE ? LittleEndianShifts[M2 & 0x3] : BigEndianShifts[M2 & 0x3];
2199 InsertAtByte = IsLE ? 4 : 8;
2200 Swap = M2 < 4;
2201 return true;
2202 }
2203 // 0, 1, 2, H or 4, 5, 6, L
2204 if ((M3 > 3 && M0 == 0 && M1 == 1 && M2 == 2) ||
2205 (M3 < 4 && M0 == 4 && M1 == 5 && M2 == 6)) {
2206 ShiftElts = IsLE ? LittleEndianShifts[M3 & 0x3] : BigEndianShifts[M3 & 0x3];
2207 InsertAtByte = IsLE ? 0 : 12;
2208 Swap = M3 < 4;
2209 return true;
2210 }
2211
2212 // If both vector operands for the shuffle are the same vector, the mask will
2213 // contain only elements from the first one and the second one will be undef.
2214 if (N->getOperand(Num: 1).isUndef()) {
2215 ShiftElts = 0;
2216 Swap = true;
2217 unsigned XXINSERTWSrcElem = IsLE ? 2 : 1;
2218 if (M0 == XXINSERTWSrcElem && M1 == 1 && M2 == 2 && M3 == 3) {
2219 InsertAtByte = IsLE ? 12 : 0;
2220 return true;
2221 }
2222 if (M0 == 0 && M1 == XXINSERTWSrcElem && M2 == 2 && M3 == 3) {
2223 InsertAtByte = IsLE ? 8 : 4;
2224 return true;
2225 }
2226 if (M0 == 0 && M1 == 1 && M2 == XXINSERTWSrcElem && M3 == 3) {
2227 InsertAtByte = IsLE ? 4 : 8;
2228 return true;
2229 }
2230 if (M0 == 0 && M1 == 1 && M2 == 2 && M3 == XXINSERTWSrcElem) {
2231 InsertAtByte = IsLE ? 0 : 12;
2232 return true;
2233 }
2234 }
2235
2236 return false;
2237}
2238
2239bool PPC::isXXSLDWIShuffleMask(ShuffleVectorSDNode *N, unsigned &ShiftElts,
2240 bool &Swap, bool IsLE) {
2241 assert(N->getValueType(0) == MVT::v16i8 && "Shuffle vector expects v16i8");
2242 // Ensure each byte index of the word is consecutive.
2243 if (!isNByteElemShuffleMask(N, Width: 4, StepLen: 1))
2244 return false;
2245
2246 // Now we look at mask elements 0,4,8,12, which are the beginning of words.
2247 unsigned M0 = N->getMaskElt(Idx: 0) / 4;
2248 unsigned M1 = N->getMaskElt(Idx: 4) / 4;
2249 unsigned M2 = N->getMaskElt(Idx: 8) / 4;
2250 unsigned M3 = N->getMaskElt(Idx: 12) / 4;
2251
2252 // If both vector operands for the shuffle are the same vector, the mask will
2253 // contain only elements from the first one and the second one will be undef.
2254 if (N->getOperand(Num: 1).isUndef()) {
2255 assert(M0 < 4 && "Indexing into an undef vector?");
2256 if (M1 != (M0 + 1) % 4 || M2 != (M1 + 1) % 4 || M3 != (M2 + 1) % 4)
2257 return false;
2258
2259 ShiftElts = IsLE ? (4 - M0) % 4 : M0;
2260 Swap = false;
2261 return true;
2262 }
2263
2264 // Ensure each word index of the ShuffleVector Mask is consecutive.
2265 if (M1 != (M0 + 1) % 8 || M2 != (M1 + 1) % 8 || M3 != (M2 + 1) % 8)
2266 return false;
2267
2268 if (IsLE) {
2269 if (M0 == 0 || M0 == 7 || M0 == 6 || M0 == 5) {
2270 // Input vectors don't need to be swapped if the leading element
2271 // of the result is one of the 3 left elements of the second vector
2272 // (or if there is no shift to be done at all).
2273 Swap = false;
2274 ShiftElts = (8 - M0) % 8;
2275 } else if (M0 == 4 || M0 == 3 || M0 == 2 || M0 == 1) {
2276 // Input vectors need to be swapped if the leading element
2277 // of the result is one of the 3 left elements of the first vector
2278 // (or if we're shifting by 4 - thereby simply swapping the vectors).
2279 Swap = true;
2280 ShiftElts = (4 - M0) % 4;
2281 }
2282
2283 return true;
2284 } else { // BE
2285 if (M0 == 0 || M0 == 1 || M0 == 2 || M0 == 3) {
2286 // Input vectors don't need to be swapped if the leading element
2287 // of the result is one of the 4 elements of the first vector.
2288 Swap = false;
2289 ShiftElts = M0;
2290 } else if (M0 == 4 || M0 == 5 || M0 == 6 || M0 == 7) {
2291 // Input vectors need to be swapped if the leading element
2292 // of the result is one of the 4 elements of the right vector.
2293 Swap = true;
2294 ShiftElts = M0 - 4;
2295 }
2296
2297 return true;
2298 }
2299}
2300
2301bool static isXXBRShuffleMaskHelper(ShuffleVectorSDNode *N, int Width) {
2302 assert(N->getValueType(0) == MVT::v16i8 && "Shuffle vector expects v16i8");
2303
2304 if (!isNByteElemShuffleMask(N, Width, StepLen: -1))
2305 return false;
2306
2307 for (int i = 0; i < 16; i += Width)
2308 if (N->getMaskElt(Idx: i) != i + Width - 1)
2309 return false;
2310
2311 return true;
2312}
2313
2314bool PPC::isXXBRHShuffleMask(ShuffleVectorSDNode *N) {
2315 return isXXBRShuffleMaskHelper(N, Width: 2);
2316}
2317
2318bool PPC::isXXBRWShuffleMask(ShuffleVectorSDNode *N) {
2319 return isXXBRShuffleMaskHelper(N, Width: 4);
2320}
2321
2322bool PPC::isXXBRDShuffleMask(ShuffleVectorSDNode *N) {
2323 return isXXBRShuffleMaskHelper(N, Width: 8);
2324}
2325
2326bool PPC::isXXBRQShuffleMask(ShuffleVectorSDNode *N) {
2327 return isXXBRShuffleMaskHelper(N, Width: 16);
2328}
2329
2330/// Can node \p N be lowered to an XXPERMDI instruction? If so, set \p Swap
2331/// if the inputs to the instruction should be swapped and set \p DM to the
2332/// value for the immediate.
2333/// Specifically, set \p Swap to true only if \p N can be lowered to XXPERMDI
2334/// AND element 0 of the result comes from the first input (LE) or second input
2335/// (BE). Set \p DM to the calculated result (0-3) only if \p N can be lowered.
2336/// \return true iff the given mask of shuffle node \p N is a XXPERMDI shuffle
2337/// mask.
2338bool PPC::isXXPERMDIShuffleMask(ShuffleVectorSDNode *N, unsigned &DM,
2339 bool &Swap, bool IsLE) {
2340 assert(N->getValueType(0) == MVT::v16i8 && "Shuffle vector expects v16i8");
2341
2342 // Ensure each byte index of the double word is consecutive.
2343 if (!isNByteElemShuffleMask(N, Width: 8, StepLen: 1))
2344 return false;
2345
2346 unsigned M0 = N->getMaskElt(Idx: 0) / 8;
2347 unsigned M1 = N->getMaskElt(Idx: 8) / 8;
2348 assert(((M0 | M1) < 4) && "A mask element out of bounds?");
2349
2350 // If both vector operands for the shuffle are the same vector, the mask will
2351 // contain only elements from the first one and the second one will be undef.
2352 if (N->getOperand(Num: 1).isUndef()) {
2353 if ((M0 | M1) < 2) {
2354 DM = IsLE ? (((~M1) & 1) << 1) + ((~M0) & 1) : (M0 << 1) + (M1 & 1);
2355 Swap = false;
2356 return true;
2357 } else
2358 return false;
2359 }
2360
2361 if (IsLE) {
2362 if (M0 > 1 && M1 < 2) {
2363 Swap = false;
2364 } else if (M0 < 2 && M1 > 1) {
2365 M0 = (M0 + 2) % 4;
2366 M1 = (M1 + 2) % 4;
2367 Swap = true;
2368 } else
2369 return false;
2370
2371 // Note: if control flow comes here that means Swap is already set above
2372 DM = (((~M1) & 1) << 1) + ((~M0) & 1);
2373 return true;
2374 } else { // BE
2375 if (M0 < 2 && M1 > 1) {
2376 Swap = false;
2377 } else if (M0 > 1 && M1 < 2) {
2378 M0 = (M0 + 2) % 4;
2379 M1 = (M1 + 2) % 4;
2380 Swap = true;
2381 } else
2382 return false;
2383
2384 // Note: if control flow comes here that means Swap is already set above
2385 DM = (M0 << 1) + (M1 & 1);
2386 return true;
2387 }
2388}
2389
2390
2391/// getSplatIdxForPPCMnemonics - Return the splat index as a value that is
2392/// appropriate for PPC mnemonics (which have a big endian bias - namely
2393/// elements are counted from the left of the vector register).
2394unsigned PPC::getSplatIdxForPPCMnemonics(SDNode *N, unsigned EltSize,
2395 SelectionDAG &DAG) {
2396 ShuffleVectorSDNode *SVOp = cast<ShuffleVectorSDNode>(Val: N);
2397 assert(isSplatShuffleMask(SVOp, EltSize));
2398 EVT VT = SVOp->getValueType(ResNo: 0);
2399
2400 if (VT == MVT::v2i64 || VT == MVT::v2f64)
2401 return DAG.getDataLayout().isLittleEndian() ? 1 - SVOp->getMaskElt(Idx: 0)
2402 : SVOp->getMaskElt(Idx: 0);
2403
2404 if (DAG.getDataLayout().isLittleEndian())
2405 return (16 / EltSize) - 1 - (SVOp->getMaskElt(Idx: 0) / EltSize);
2406 else
2407 return SVOp->getMaskElt(Idx: 0) / EltSize;
2408}
2409
2410/// get_VSPLTI_elt - If this is a build_vector of constants which can be formed
2411/// by using a vspltis[bhw] instruction of the specified element size, return
2412/// the constant being splatted. The ByteSize field indicates the number of
2413/// bytes of each element [124] -> [bhw].
2414SDValue PPC::get_VSPLTI_elt(SDNode *N, unsigned ByteSize, SelectionDAG &DAG) {
2415 SDValue OpVal;
2416
2417 // If ByteSize of the splat is bigger than the element size of the
2418 // build_vector, then we have a case where we are checking for a splat where
2419 // multiple elements of the buildvector are folded together into a single
2420 // logical element of the splat (e.g. "vsplish 1" to splat {0,1}*8).
2421 unsigned EltSize = 16/N->getNumOperands();
2422 if (EltSize < ByteSize) {
2423 unsigned Multiple = ByteSize/EltSize; // Number of BV entries per spltval.
2424 SDValue UniquedVals[4];
2425 assert(Multiple > 1 && Multiple <= 4 && "How can this happen?");
2426
2427 // See if all of the elements in the buildvector agree across.
2428 for (unsigned i = 0, e = N->getNumOperands(); i != e; ++i) {
2429 if (N->getOperand(Num: i).isUndef()) continue;
2430 // If the element isn't a constant, bail fully out.
2431 if (!isa<ConstantSDNode>(Val: N->getOperand(Num: i))) return SDValue();
2432
2433 if (!UniquedVals[i&(Multiple-1)].getNode())
2434 UniquedVals[i&(Multiple-1)] = N->getOperand(Num: i);
2435 else if (UniquedVals[i&(Multiple-1)] != N->getOperand(Num: i))
2436 return SDValue(); // no match.
2437 }
2438
2439 // Okay, if we reached this point, UniquedVals[0..Multiple-1] contains
2440 // either constant or undef values that are identical for each chunk. See
2441 // if these chunks can form into a larger vspltis*.
2442
2443 // Check to see if all of the leading entries are either 0 or -1. If
2444 // neither, then this won't fit into the immediate field.
2445 bool LeadingZero = true;
2446 bool LeadingOnes = true;
2447 for (unsigned i = 0; i != Multiple-1; ++i) {
2448 if (!UniquedVals[i].getNode()) continue; // Must have been undefs.
2449
2450 LeadingZero &= isNullConstant(V: UniquedVals[i]);
2451 LeadingOnes &= isAllOnesConstant(V: UniquedVals[i]);
2452 }
2453 // Finally, check the least significant entry.
2454 if (LeadingZero) {
2455 if (!UniquedVals[Multiple-1].getNode())
2456 return DAG.getTargetConstant(Val: 0, DL: SDLoc(N), VT: MVT::i32); // 0,0,0,undef
2457 int Val = UniquedVals[Multiple - 1]->getAsZExtVal();
2458 if (Val < 16) // 0,0,0,4 -> vspltisw(4)
2459 return DAG.getTargetConstant(Val, DL: SDLoc(N), VT: MVT::i32);
2460 }
2461 if (LeadingOnes) {
2462 if (!UniquedVals[Multiple-1].getNode())
2463 return DAG.getTargetConstant(Val: ~0U, DL: SDLoc(N), VT: MVT::i32); // -1,-1,-1,undef
2464 int Val =cast<ConstantSDNode>(Val&: UniquedVals[Multiple-1])->getSExtValue();
2465 if (Val >= -16) // -1,-1,-1,-2 -> vspltisw(-2)
2466 return DAG.getTargetConstant(Val, DL: SDLoc(N), VT: MVT::i32);
2467 }
2468
2469 return SDValue();
2470 }
2471
2472 // Check to see if this buildvec has a single non-undef value in its elements.
2473 for (unsigned i = 0, e = N->getNumOperands(); i != e; ++i) {
2474 if (N->getOperand(Num: i).isUndef()) continue;
2475 if (!OpVal.getNode())
2476 OpVal = N->getOperand(Num: i);
2477 else if (OpVal != N->getOperand(Num: i))
2478 return SDValue();
2479 }
2480
2481 if (!OpVal.getNode()) return SDValue(); // All UNDEF: use implicit def.
2482
2483 unsigned ValSizeInBytes = EltSize;
2484 uint64_t Value = 0;
2485 if (ConstantSDNode *CN = dyn_cast<ConstantSDNode>(Val&: OpVal)) {
2486 Value = CN->getZExtValue();
2487 } else if (ConstantFPSDNode *CN = dyn_cast<ConstantFPSDNode>(Val&: OpVal)) {
2488 assert(CN->getValueType(0) == MVT::f32 && "Only one legal FP vector type!");
2489 Value = llvm::bit_cast<uint32_t>(from: CN->getValueAPF().convertToFloat());
2490 }
2491
2492 // If the splat value is larger than the element value, then we can never do
2493 // this splat. The only case that we could fit the replicated bits into our
2494 // immediate field for would be zero, and we prefer to use vxor for it.
2495 if (ValSizeInBytes < ByteSize) return SDValue();
2496
2497 // If the element value is larger than the splat value, check if it consists
2498 // of a repeated bit pattern of size ByteSize.
2499 if (!APInt(ValSizeInBytes * 8, Value).isSplat(SplatSizeInBits: ByteSize * 8))
2500 return SDValue();
2501
2502 // Properly sign extend the value.
2503 int MaskVal = SignExtend32(X: Value, B: ByteSize * 8);
2504
2505 // If this is zero, don't match, zero matches ISD::isBuildVectorAllZeros.
2506 if (MaskVal == 0) return SDValue();
2507
2508 // Finally, if this value fits in a 5 bit sext field, return it
2509 if (SignExtend32<5>(X: MaskVal) == MaskVal)
2510 return DAG.getSignedTargetConstant(Val: MaskVal, DL: SDLoc(N), VT: MVT::i32);
2511 return SDValue();
2512}
2513
2514//===----------------------------------------------------------------------===//
2515// Addressing Mode Selection
2516//===----------------------------------------------------------------------===//
2517
2518/// isIntS16Immediate - This method tests to see if the node is either a 32-bit
2519/// or 64-bit immediate, and if the value can be accurately represented as a
2520/// sign extension from a 16-bit value. If so, this returns true and the
2521/// immediate.
2522bool llvm::isIntS16Immediate(SDNode *N, int16_t &Imm) {
2523 if (!isa<ConstantSDNode>(Val: N))
2524 return false;
2525
2526 Imm = (int16_t)N->getAsZExtVal();
2527 if (N->getValueType(ResNo: 0) == MVT::i32)
2528 return Imm == (int32_t)N->getAsZExtVal();
2529 else
2530 return Imm == (int64_t)N->getAsZExtVal();
2531}
2532bool llvm::isIntS16Immediate(SDValue Op, int16_t &Imm) {
2533 return isIntS16Immediate(N: Op.getNode(), Imm);
2534}
2535
2536/// Used when computing address flags for selecting loads and stores.
2537/// If we have an OR, check if the LHS and RHS are provably disjoint.
2538/// An OR of two provably disjoint values is equivalent to an ADD.
2539/// Most PPC load/store instructions compute the effective address as a sum,
2540/// so doing this conversion is useful.
2541static bool provablyDisjointOr(SelectionDAG &DAG, const SDValue &N) {
2542 if (N.getOpcode() != ISD::OR)
2543 return false;
2544 KnownBits LHSKnown = DAG.computeKnownBits(Op: N.getOperand(i: 0));
2545 if (!LHSKnown.Zero.getBoolValue())
2546 return false;
2547 KnownBits RHSKnown = DAG.computeKnownBits(Op: N.getOperand(i: 1));
2548 return (~(LHSKnown.Zero | RHSKnown.Zero) == 0);
2549}
2550
2551/// SelectAddressEVXRegReg - Given the specified address, check to see if it can
2552/// be represented as an indexed [r+r] operation.
2553bool PPCTargetLowering::SelectAddressEVXRegReg(SDValue N, SDValue &Base,
2554 SDValue &Index,
2555 SelectionDAG &DAG) const {
2556 for (SDNode *U : N->users()) {
2557 if (MemSDNode *Memop = dyn_cast<MemSDNode>(Val: U)) {
2558 if (Memop->getMemoryVT() == MVT::f64) {
2559 Base = N.getOperand(i: 0);
2560 Index = N.getOperand(i: 1);
2561 return true;
2562 }
2563 }
2564 }
2565 return false;
2566}
2567
2568/// isIntS34Immediate - This method tests if value of node given can be
2569/// accurately represented as a sign extension from a 34-bit value. If so,
2570/// this returns true and the immediate.
2571bool llvm::isIntS34Immediate(SDNode *N, int64_t &Imm) {
2572 if (!isa<ConstantSDNode>(Val: N))
2573 return false;
2574
2575 Imm = cast<ConstantSDNode>(Val: N)->getSExtValue();
2576 return isInt<34>(x: Imm);
2577}
2578bool llvm::isIntS34Immediate(SDValue Op, int64_t &Imm) {
2579 return isIntS34Immediate(N: Op.getNode(), Imm);
2580}
2581
2582/// SelectAddressRegReg - Given the specified addressed, check to see if it
2583/// can be represented as an indexed [r+r] operation. Returns false if it
2584/// can be more efficiently represented as [r+imm]. If \p EncodingAlignment is
2585/// non-zero and N can be represented by a base register plus a signed 16-bit
2586/// displacement, make a more precise judgement by checking (displacement % \p
2587/// EncodingAlignment).
2588bool PPCTargetLowering::SelectAddressRegReg(
2589 SDValue N, SDValue &Base, SDValue &Index, SelectionDAG &DAG,
2590 MaybeAlign EncodingAlignment) const {
2591 // If we have a PC Relative target flag don't select as [reg+reg]. It will be
2592 // a [pc+imm].
2593 if (SelectAddressPCRel(N, Base))
2594 return false;
2595
2596 int16_t Imm = 0;
2597 if (N.getOpcode() == ISD::ADD) {
2598 // Is there any SPE load/store (f64), which can't handle 16bit offset?
2599 // SPE load/store can only handle 8-bit offsets.
2600 if (hasSPE() && SelectAddressEVXRegReg(N, Base, Index, DAG))
2601 return true;
2602 if (isIntS16Immediate(Op: N.getOperand(i: 1), Imm) &&
2603 (!EncodingAlignment || isAligned(Lhs: *EncodingAlignment, SizeInBytes: Imm)))
2604 return false; // r+i
2605 if (N.getOperand(i: 1).getOpcode() == PPCISD::Lo)
2606 return false; // r+i
2607
2608 Base = N.getOperand(i: 0);
2609 Index = N.getOperand(i: 1);
2610 return true;
2611 } else if (N.getOpcode() == ISD::OR) {
2612 if (isIntS16Immediate(Op: N.getOperand(i: 1), Imm) &&
2613 (!EncodingAlignment || isAligned(Lhs: *EncodingAlignment, SizeInBytes: Imm)))
2614 return false; // r+i can fold it if we can.
2615
2616 // If this is an or of disjoint bitfields, we can codegen this as an add
2617 // (for better address arithmetic) if the LHS and RHS of the OR are provably
2618 // disjoint.
2619 KnownBits LHSKnown = DAG.computeKnownBits(Op: N.getOperand(i: 0));
2620
2621 if (LHSKnown.Zero.getBoolValue()) {
2622 KnownBits RHSKnown = DAG.computeKnownBits(Op: N.getOperand(i: 1));
2623 // If all of the bits are known zero on the LHS or RHS, the add won't
2624 // carry.
2625 if (~(LHSKnown.Zero | RHSKnown.Zero) == 0) {
2626 Base = N.getOperand(i: 0);
2627 Index = N.getOperand(i: 1);
2628 return true;
2629 }
2630 }
2631 }
2632
2633 return false;
2634}
2635
2636// If we happen to be doing an i64 load or store into a stack slot that has
2637// less than a 4-byte alignment, then the frame-index elimination may need to
2638// use an indexed load or store instruction (because the offset may not be a
2639// multiple of 4). The extra register needed to hold the offset comes from the
2640// register scavenger, and it is possible that the scavenger will need to use
2641// an emergency spill slot. As a result, we need to make sure that a spill slot
2642// is allocated when doing an i64 load/store into a less-than-4-byte-aligned
2643// stack slot.
2644static void fixupFuncForFI(SelectionDAG &DAG, int FrameIdx, EVT VT) {
2645 // FIXME: This does not handle the LWA case.
2646 if (VT != MVT::i64)
2647 return;
2648
2649 // NOTE: We'll exclude negative FIs here, which come from argument
2650 // lowering, because there are no known test cases triggering this problem
2651 // using packed structures (or similar). We can remove this exclusion if
2652 // we find such a test case. The reason why this is so test-case driven is
2653 // because this entire 'fixup' is only to prevent crashes (from the
2654 // register scavenger) on not-really-valid inputs. For example, if we have:
2655 // %a = alloca i1
2656 // %b = bitcast i1* %a to i64*
2657 // store i64* a, i64 b
2658 // then the store should really be marked as 'align 1', but is not. If it
2659 // were marked as 'align 1' then the indexed form would have been
2660 // instruction-selected initially, and the problem this 'fixup' is preventing
2661 // won't happen regardless.
2662 if (FrameIdx < 0)
2663 return;
2664
2665 MachineFunction &MF = DAG.getMachineFunction();
2666 MachineFrameInfo &MFI = MF.getFrameInfo();
2667
2668 if (MFI.getObjectAlign(ObjectIdx: FrameIdx) >= Align(4))
2669 return;
2670
2671 PPCFunctionInfo *FuncInfo = MF.getInfo<PPCFunctionInfo>();
2672 FuncInfo->setHasNonRISpills();
2673}
2674
2675/// Returns true if the address N can be represented by a base register plus
2676/// a signed 16-bit displacement [r+imm], and if it is not better
2677/// represented as reg+reg. If \p EncodingAlignment is non-zero, only accept
2678/// displacements that are multiples of that value.
2679bool PPCTargetLowering::SelectAddressRegImm(
2680 SDValue N, SDValue &Disp, SDValue &Base, SelectionDAG &DAG,
2681 MaybeAlign EncodingAlignment) const {
2682 // FIXME dl should come from parent load or store, not from address
2683 SDLoc dl(N);
2684
2685 // If we have a PC Relative target flag don't select as [reg+imm]. It will be
2686 // a [pc+imm].
2687 if (SelectAddressPCRel(N, Base))
2688 return false;
2689
2690 // If this can be more profitably realized as r+r, fail.
2691 if (SelectAddressRegReg(N, Base&: Disp, Index&: Base, DAG, EncodingAlignment))
2692 return false;
2693
2694 if (N.getOpcode() == ISD::ADD) {
2695 int16_t imm = 0;
2696 if (isIntS16Immediate(Op: N.getOperand(i: 1), Imm&: imm) &&
2697 (!EncodingAlignment || isAligned(Lhs: *EncodingAlignment, SizeInBytes: imm))) {
2698 Disp = DAG.getSignedTargetConstant(Val: imm, DL: dl, VT: N.getValueType());
2699 if (FrameIndexSDNode *FI = dyn_cast<FrameIndexSDNode>(Val: N.getOperand(i: 0))) {
2700 Base = DAG.getTargetFrameIndex(FI: FI->getIndex(), VT: N.getValueType());
2701 fixupFuncForFI(DAG, FrameIdx: FI->getIndex(), VT: N.getValueType());
2702 } else {
2703 Base = N.getOperand(i: 0);
2704 }
2705 return true; // [r+i]
2706 } else if (N.getOperand(i: 1).getOpcode() == PPCISD::Lo) {
2707 // Match LOAD (ADD (X, Lo(G))).
2708 assert(!N.getOperand(1).getConstantOperandVal(1) &&
2709 "Cannot handle constant offsets yet!");
2710 Disp = N.getOperand(i: 1).getOperand(i: 0); // The global address.
2711 assert(Disp.getOpcode() == ISD::TargetGlobalAddress ||
2712 Disp.getOpcode() == ISD::TargetGlobalTLSAddress ||
2713 Disp.getOpcode() == ISD::TargetConstantPool ||
2714 Disp.getOpcode() == ISD::TargetJumpTable);
2715 Base = N.getOperand(i: 0);
2716 return true; // [&g+r]
2717 }
2718 } else if (N.getOpcode() == ISD::OR) {
2719 int16_t imm = 0;
2720 if (isIntS16Immediate(Op: N.getOperand(i: 1), Imm&: imm) &&
2721 (!EncodingAlignment || isAligned(Lhs: *EncodingAlignment, SizeInBytes: imm))) {
2722 // If this is an or of disjoint bitfields, we can codegen this as an add
2723 // (for better address arithmetic) if the LHS and RHS of the OR are
2724 // provably disjoint.
2725 KnownBits LHSKnown = DAG.computeKnownBits(Op: N.getOperand(i: 0));
2726
2727 if ((LHSKnown.Zero.getZExtValue()|~(uint64_t)imm) == ~0ULL) {
2728 // If all of the bits are known zero on the LHS or RHS, the add won't
2729 // carry.
2730 if (FrameIndexSDNode *FI =
2731 dyn_cast<FrameIndexSDNode>(Val: N.getOperand(i: 0))) {
2732 Base = DAG.getTargetFrameIndex(FI: FI->getIndex(), VT: N.getValueType());
2733 fixupFuncForFI(DAG, FrameIdx: FI->getIndex(), VT: N.getValueType());
2734 } else {
2735 Base = N.getOperand(i: 0);
2736 }
2737 Disp = DAG.getTargetConstant(Val: imm, DL: dl, VT: N.getValueType());
2738 return true;
2739 }
2740 }
2741 } else if (ConstantSDNode *CN = dyn_cast<ConstantSDNode>(Val&: N)) {
2742 // Loading from a constant address.
2743
2744 // If this address fits entirely in a 16-bit sext immediate field, codegen
2745 // this as "d, 0"
2746 int16_t Imm;
2747 if (isIntS16Immediate(N: CN, Imm) &&
2748 (!EncodingAlignment || isAligned(Lhs: *EncodingAlignment, SizeInBytes: Imm))) {
2749 Disp = DAG.getTargetConstant(Val: Imm, DL: dl, VT: CN->getValueType(ResNo: 0));
2750 Base = DAG.getRegister(Reg: Subtarget.isPPC64() ? PPC::ZERO8 : PPC::ZERO,
2751 VT: CN->getValueType(ResNo: 0));
2752 return true;
2753 }
2754
2755 // Handle 32-bit sext immediates with LIS + addr mode.
2756 if ((CN->getValueType(ResNo: 0) == MVT::i32 ||
2757 (int64_t)CN->getZExtValue() == (int)CN->getZExtValue()) &&
2758 (!EncodingAlignment ||
2759 isAligned(Lhs: *EncodingAlignment, SizeInBytes: CN->getZExtValue()))) {
2760 int Addr = (int)CN->getZExtValue();
2761
2762 // Otherwise, break this down into an LIS + disp.
2763 Disp = DAG.getTargetConstant(Val: (short)Addr, DL: dl, VT: MVT::i32);
2764
2765 Base = DAG.getTargetConstant(Val: (Addr - (signed short)Addr) >> 16, DL: dl,
2766 VT: MVT::i32);
2767 unsigned Opc = CN->getValueType(ResNo: 0) == MVT::i32 ? PPC::LIS : PPC::LIS8;
2768 Base = SDValue(DAG.getMachineNode(Opcode: Opc, dl, VT: CN->getValueType(ResNo: 0), Op1: Base), 0);
2769 return true;
2770 }
2771 }
2772
2773 Disp = DAG.getTargetConstant(Val: 0, DL: dl, VT: getPointerTy(DL: DAG.getDataLayout()));
2774 if (FrameIndexSDNode *FI = dyn_cast<FrameIndexSDNode>(Val&: N)) {
2775 Base = DAG.getTargetFrameIndex(FI: FI->getIndex(), VT: N.getValueType());
2776 fixupFuncForFI(DAG, FrameIdx: FI->getIndex(), VT: N.getValueType());
2777 } else
2778 Base = N;
2779 return true; // [r+0]
2780}
2781
2782/// Similar to the 16-bit case but for instructions that take a 34-bit
2783/// displacement field (prefixed loads/stores).
2784bool PPCTargetLowering::SelectAddressRegImm34(SDValue N, SDValue &Disp,
2785 SDValue &Base,
2786 SelectionDAG &DAG) const {
2787 // Only on 64-bit targets.
2788 if (N.getValueType() != MVT::i64)
2789 return false;
2790
2791 SDLoc dl(N);
2792 int64_t Imm = 0;
2793
2794 if (N.getOpcode() == ISD::ADD) {
2795 if (!isIntS34Immediate(Op: N.getOperand(i: 1), Imm))
2796 return false;
2797 Disp = DAG.getSignedTargetConstant(Val: Imm, DL: dl, VT: N.getValueType());
2798 if (FrameIndexSDNode *FI = dyn_cast<FrameIndexSDNode>(Val: N.getOperand(i: 0)))
2799 Base = DAG.getTargetFrameIndex(FI: FI->getIndex(), VT: N.getValueType());
2800 else
2801 Base = N.getOperand(i: 0);
2802 return true;
2803 }
2804
2805 if (N.getOpcode() == ISD::OR) {
2806 if (!isIntS34Immediate(Op: N.getOperand(i: 1), Imm))
2807 return false;
2808 // If this is an or of disjoint bitfields, we can codegen this as an add
2809 // (for better address arithmetic) if the LHS and RHS of the OR are
2810 // provably disjoint.
2811 KnownBits LHSKnown = DAG.computeKnownBits(Op: N.getOperand(i: 0));
2812 if ((LHSKnown.Zero.getZExtValue() | ~(uint64_t)Imm) != ~0ULL)
2813 return false;
2814 if (FrameIndexSDNode *FI = dyn_cast<FrameIndexSDNode>(Val: N.getOperand(i: 0)))
2815 Base = DAG.getTargetFrameIndex(FI: FI->getIndex(), VT: N.getValueType());
2816 else
2817 Base = N.getOperand(i: 0);
2818 Disp = DAG.getSignedTargetConstant(Val: Imm, DL: dl, VT: N.getValueType());
2819 return true;
2820 }
2821
2822 if (isIntS34Immediate(Op: N, Imm)) { // If the address is a 34-bit const.
2823 Disp = DAG.getSignedTargetConstant(Val: Imm, DL: dl, VT: N.getValueType());
2824 Base = DAG.getRegister(Reg: PPC::ZERO8, VT: N.getValueType());
2825 return true;
2826 }
2827
2828 return false;
2829}
2830
2831/// SelectAddressRegRegOnly - Given the specified addressed, force it to be
2832/// represented as an indexed [r+r] operation.
2833bool PPCTargetLowering::SelectAddressRegRegOnly(SDValue N, SDValue &Base,
2834 SDValue &Index,
2835 SelectionDAG &DAG) const {
2836 // Check to see if we can easily represent this as an [r+r] address. This
2837 // will fail if it thinks that the address is more profitably represented as
2838 // reg+imm, e.g. where imm = 0.
2839 if (SelectAddressRegReg(N, Base, Index, DAG))
2840 return true;
2841
2842 // If the address is the result of an add, we will utilize the fact that the
2843 // address calculation includes an implicit add. However, we can reduce
2844 // register pressure if we do not materialize a constant just for use as the
2845 // index register. We only get rid of the add if it is not an add of a
2846 // value and a 16-bit signed constant and both have a single use.
2847 int16_t imm = 0;
2848 if (N.getOpcode() == ISD::ADD &&
2849 (!isIntS16Immediate(Op: N.getOperand(i: 1), Imm&: imm) ||
2850 !N.getOperand(i: 1).hasOneUse() || !N.getOperand(i: 0).hasOneUse())) {
2851 Base = N.getOperand(i: 0);
2852 Index = N.getOperand(i: 1);
2853 return true;
2854 }
2855
2856 // Otherwise, do it the hard way, using R0 as the base register.
2857 Base = DAG.getRegister(Reg: Subtarget.isPPC64() ? PPC::ZERO8 : PPC::ZERO,
2858 VT: N.getValueType());
2859 Index = N;
2860 return true;
2861}
2862
2863template <typename Ty> static bool isValidPCRelNode(SDValue N) {
2864 Ty *PCRelCand = dyn_cast<Ty>(N);
2865 return PCRelCand && (PPCInstrInfo::hasPCRelFlag(TF: PCRelCand->getTargetFlags()));
2866}
2867
2868/// Returns true if this address is a PC Relative address.
2869/// PC Relative addresses are marked with the flag PPCII::MO_PCREL_FLAG
2870/// or if the node opcode is PPCISD::MAT_PCREL_ADDR.
2871bool PPCTargetLowering::SelectAddressPCRel(SDValue N, SDValue &Base) const {
2872 // This is a materialize PC Relative node. Always select this as PC Relative.
2873 Base = N;
2874 if (N.getOpcode() == PPCISD::MAT_PCREL_ADDR)
2875 return true;
2876 if (isValidPCRelNode<ConstantPoolSDNode>(N) ||
2877 isValidPCRelNode<GlobalAddressSDNode>(N) ||
2878 isValidPCRelNode<JumpTableSDNode>(N) ||
2879 isValidPCRelNode<BlockAddressSDNode>(N))
2880 return true;
2881 return false;
2882}
2883
2884/// Returns true if we should use a direct load into vector instruction
2885/// (such as lxsd or lfd), instead of a load into gpr + direct move sequence.
2886static bool usePartialVectorLoads(SDNode *N, const PPCSubtarget& ST) {
2887
2888 // If there are any other uses other than scalar to vector, then we should
2889 // keep it as a scalar load -> direct move pattern to prevent multiple
2890 // loads.
2891 LoadSDNode *LD = dyn_cast<LoadSDNode>(Val: N);
2892 if (!LD)
2893 return false;
2894
2895 EVT MemVT = LD->getMemoryVT();
2896 if (!MemVT.isSimple())
2897 return false;
2898 switch(MemVT.getSimpleVT().SimpleTy) {
2899 case MVT::i64:
2900 break;
2901 case MVT::i32:
2902 if (!ST.hasP8Vector())
2903 return false;
2904 break;
2905 case MVT::i16:
2906 case MVT::i8:
2907 if (!ST.hasP9Vector())
2908 return false;
2909 break;
2910 default:
2911 return false;
2912 }
2913
2914 SDValue LoadedVal(N, 0);
2915 if (!LoadedVal.hasOneUse())
2916 return false;
2917
2918 for (SDUse &Use : LD->uses())
2919 if (Use.getResNo() == 0 &&
2920 Use.getUser()->getOpcode() != ISD::SCALAR_TO_VECTOR &&
2921 Use.getUser()->getOpcode() != PPCISD::SCALAR_TO_VECTOR_PERMUTED)
2922 return false;
2923
2924 return true;
2925}
2926
2927/// getPreIndexedAddressParts - returns true by value, base pointer and
2928/// offset pointer and addressing mode by reference if the node's address
2929/// can be legally represented as pre-indexed load / store address.
2930bool PPCTargetLowering::getPreIndexedAddressParts(SDNode *N, SDValue &Base,
2931 SDValue &Offset,
2932 ISD::MemIndexedMode &AM,
2933 SelectionDAG &DAG) const {
2934 if (DisablePPCPreinc) return false;
2935
2936 bool isLoad = true;
2937 SDValue Ptr;
2938 EVT VT;
2939 Align Alignment;
2940 if (LoadSDNode *LD = dyn_cast<LoadSDNode>(Val: N)) {
2941 Ptr = LD->getBasePtr();
2942 VT = LD->getMemoryVT();
2943 Alignment = LD->getAlign();
2944 } else if (StoreSDNode *ST = dyn_cast<StoreSDNode>(Val: N)) {
2945 Ptr = ST->getBasePtr();
2946 VT = ST->getMemoryVT();
2947 Alignment = ST->getAlign();
2948 isLoad = false;
2949 } else
2950 return false;
2951
2952 // Do not generate pre-inc forms for specific loads that feed scalar_to_vector
2953 // instructions because we can fold these into a more efficient instruction
2954 // instead, (such as LXSD).
2955 if (isLoad && usePartialVectorLoads(N, ST: Subtarget)) {
2956 return false;
2957 }
2958
2959 // PowerPC doesn't have preinc load/store instructions for vectors
2960 if (VT.isVector())
2961 return false;
2962
2963 if (SelectAddressRegReg(N: Ptr, Base, Index&: Offset, DAG)) {
2964 // Common code will reject creating a pre-inc form if the base pointer
2965 // is a frame index, or if N is a store and the base pointer is either
2966 // the same as or a predecessor of the value being stored. Check for
2967 // those situations here, and try with swapped Base/Offset instead.
2968 bool Swap = false;
2969
2970 if (isa<FrameIndexSDNode>(Val: Base) || isa<RegisterSDNode>(Val: Base))
2971 Swap = true;
2972 else if (!isLoad) {
2973 SDValue Val = cast<StoreSDNode>(Val: N)->getValue();
2974 if (Val == Base || Base.getNode()->isPredecessorOf(N: Val.getNode()))
2975 Swap = true;
2976 }
2977
2978 if (Swap)
2979 std::swap(a&: Base, b&: Offset);
2980
2981 AM = ISD::PRE_INC;
2982 return true;
2983 }
2984
2985 // LDU/STU can only handle immediates that are a multiple of 4.
2986 if (VT != MVT::i64) {
2987 if (!SelectAddressRegImm(N: Ptr, Disp&: Offset, Base, DAG, EncodingAlignment: std::nullopt))
2988 return false;
2989 } else {
2990 // LDU/STU need an address with at least 4-byte alignment.
2991 if (Alignment < Align(4))
2992 return false;
2993
2994 if (!SelectAddressRegImm(N: Ptr, Disp&: Offset, Base, DAG, EncodingAlignment: Align(4)))
2995 return false;
2996 }
2997
2998 if (LoadSDNode *LD = dyn_cast<LoadSDNode>(Val: N)) {
2999 // PPC64 doesn't have lwau, but it does have lwaux. Reject preinc load of
3000 // sext i32 to i64 when addr mode is r+i.
3001 if (LD->getValueType(ResNo: 0) == MVT::i64 && LD->getMemoryVT() == MVT::i32 &&
3002 LD->getExtensionType() == ISD::SEXTLOAD &&
3003 isa<ConstantSDNode>(Val: Offset))
3004 return false;
3005 }
3006
3007 AM = ISD::PRE_INC;
3008 return true;
3009}
3010
3011//===----------------------------------------------------------------------===//
3012// LowerOperation implementation
3013//===----------------------------------------------------------------------===//
3014
3015/// Return true if we should reference labels using a PICBase, set the HiOpFlags
3016/// and LoOpFlags to the target MO flags.
3017static void getLabelAccessInfo(bool IsPIC, const PPCSubtarget &Subtarget,
3018 unsigned &HiOpFlags, unsigned &LoOpFlags,
3019 const GlobalValue *GV = nullptr) {
3020 HiOpFlags = PPCII::MO_HA;
3021 LoOpFlags = PPCII::MO_LO;
3022
3023 // Don't use the pic base if not in PIC relocation model.
3024 if (IsPIC) {
3025 HiOpFlags = PPCII::MO_PIC_HA_FLAG;
3026 LoOpFlags = PPCII::MO_PIC_LO_FLAG;
3027 }
3028}
3029
3030static SDValue LowerLabelRef(SDValue HiPart, SDValue LoPart, bool isPIC,
3031 SelectionDAG &DAG) {
3032 SDLoc DL(HiPart);
3033 EVT PtrVT = HiPart.getValueType();
3034 SDValue Zero = DAG.getConstant(Val: 0, DL, VT: PtrVT);
3035
3036 SDValue Hi = DAG.getNode(Opcode: PPCISD::Hi, DL, VT: PtrVT, N1: HiPart, N2: Zero);
3037 SDValue Lo = DAG.getNode(Opcode: PPCISD::Lo, DL, VT: PtrVT, N1: LoPart, N2: Zero);
3038
3039 // With PIC, the first instruction is actually "GR+hi(&G)".
3040 if (isPIC)
3041 Hi = DAG.getNode(Opcode: ISD::ADD, DL, VT: PtrVT,
3042 N1: DAG.getNode(Opcode: PPCISD::GlobalBaseReg, DL, VT: PtrVT), N2: Hi);
3043
3044 // Generate non-pic code that has direct accesses to the constant pool.
3045 // The address of the global is just (hi(&g)+lo(&g)).
3046 return DAG.getNode(Opcode: ISD::ADD, DL, VT: PtrVT, N1: Hi, N2: Lo);
3047}
3048
3049static void setUsesTOCBasePtr(MachineFunction &MF) {
3050 PPCFunctionInfo *FuncInfo = MF.getInfo<PPCFunctionInfo>();
3051 FuncInfo->setUsesTOCBasePtr();
3052}
3053
3054static void setUsesTOCBasePtr(SelectionDAG &DAG) {
3055 setUsesTOCBasePtr(DAG.getMachineFunction());
3056}
3057
3058SDValue PPCTargetLowering::getTOCEntry(SelectionDAG &DAG, const SDLoc &dl,
3059 SDValue GA) const {
3060 EVT VT = Subtarget.getScalarIntVT();
3061 SDValue Reg = Subtarget.isPPC64() ? DAG.getRegister(Reg: PPC::X2, VT)
3062 : Subtarget.isAIXABI()
3063 ? DAG.getRegister(Reg: PPC::R2, VT)
3064 : DAG.getNode(Opcode: PPCISD::GlobalBaseReg, DL: dl, VT);
3065 SDValue Ops[] = { GA, Reg };
3066 return DAG.getMemIntrinsicNode(
3067 Opcode: PPCISD::TOC_ENTRY, dl, VTList: DAG.getVTList(VT1: VT, VT2: MVT::Other), Ops, MemVT: VT,
3068 PtrInfo: MachinePointerInfo::getGOT(MF&: DAG.getMachineFunction()), Alignment: std::nullopt,
3069 Flags: MachineMemOperand::MOLoad);
3070}
3071
3072SDValue PPCTargetLowering::LowerConstantPool(SDValue Op,
3073 SelectionDAG &DAG) const {
3074 EVT PtrVT = Op.getValueType();
3075 ConstantPoolSDNode *CP = cast<ConstantPoolSDNode>(Val&: Op);
3076 const Constant *C = CP->getConstVal();
3077
3078 // 64-bit SVR4 ABI and AIX ABI code are always position-independent.
3079 // The actual address of the GlobalValue is stored in the TOC.
3080 if (Subtarget.is64BitELFABI() || Subtarget.isAIXABI()) {
3081 if (Subtarget.isUsingPCRelativeCalls()) {
3082 SDLoc DL(CP);
3083 EVT Ty = getPointerTy(DL: DAG.getDataLayout());
3084 SDValue ConstPool = DAG.getTargetConstantPool(
3085 C, VT: Ty, Align: CP->getAlign(), Offset: CP->getOffset(), TargetFlags: PPCII::MO_PCREL_FLAG);
3086 return DAG.getNode(Opcode: PPCISD::MAT_PCREL_ADDR, DL, VT: Ty, Operand: ConstPool);
3087 }
3088 setUsesTOCBasePtr(DAG);
3089 SDValue GA = DAG.getTargetConstantPool(C, VT: PtrVT, Align: CP->getAlign(), Offset: 0);
3090 return getTOCEntry(DAG, dl: SDLoc(CP), GA);
3091 }
3092
3093 unsigned MOHiFlag, MOLoFlag;
3094 bool IsPIC = isPositionIndependent();
3095 getLabelAccessInfo(IsPIC, Subtarget, HiOpFlags&: MOHiFlag, LoOpFlags&: MOLoFlag);
3096
3097 if (IsPIC && Subtarget.isSVR4ABI()) {
3098 SDValue GA =
3099 DAG.getTargetConstantPool(C, VT: PtrVT, Align: CP->getAlign(), Offset: PPCII::MO_PIC_FLAG);
3100 return getTOCEntry(DAG, dl: SDLoc(CP), GA);
3101 }
3102
3103 SDValue CPIHi =
3104 DAG.getTargetConstantPool(C, VT: PtrVT, Align: CP->getAlign(), Offset: 0, TargetFlags: MOHiFlag);
3105 SDValue CPILo =
3106 DAG.getTargetConstantPool(C, VT: PtrVT, Align: CP->getAlign(), Offset: 0, TargetFlags: MOLoFlag);
3107 return LowerLabelRef(HiPart: CPIHi, LoPart: CPILo, isPIC: IsPIC, DAG);
3108}
3109
3110// For 64-bit PowerPC, prefer the more compact relative encodings.
3111// This trades 32 bits per jump table entry for one or two instructions
3112// on the jump site.
3113unsigned PPCTargetLowering::getJumpTableEncoding() const {
3114 if (isJumpTableRelative())
3115 return MachineJumpTableInfo::EK_LabelDifference32;
3116
3117 return TargetLowering::getJumpTableEncoding();
3118}
3119
3120bool PPCTargetLowering::isJumpTableRelative() const {
3121 if (UseAbsoluteJumpTables)
3122 return false;
3123 if (Subtarget.isPPC64() || Subtarget.isAIXABI())
3124 return true;
3125 return TargetLowering::isJumpTableRelative();
3126}
3127
3128SDValue PPCTargetLowering::getPICJumpTableRelocBase(SDValue Table,
3129 SelectionDAG &DAG) const {
3130 if (!Subtarget.isPPC64() || Subtarget.isAIXABI())
3131 return TargetLowering::getPICJumpTableRelocBase(Table, DAG);
3132
3133 switch (getTargetMachine().getCodeModel()) {
3134 case CodeModel::Small:
3135 case CodeModel::Medium:
3136 return TargetLowering::getPICJumpTableRelocBase(Table, DAG);
3137 default:
3138 return DAG.getNode(Opcode: PPCISD::GlobalBaseReg, DL: SDLoc(),
3139 VT: getPointerTy(DL: DAG.getDataLayout()));
3140 }
3141}
3142
3143const MCExpr *
3144PPCTargetLowering::getPICJumpTableRelocBaseExpr(const MachineFunction *MF,
3145 unsigned JTI,
3146 MCContext &Ctx) const {
3147 if (!Subtarget.isPPC64() || Subtarget.isAIXABI())
3148 return TargetLowering::getPICJumpTableRelocBaseExpr(MF, JTI, Ctx);
3149
3150 switch (getTargetMachine().getCodeModel()) {
3151 case CodeModel::Small:
3152 case CodeModel::Medium:
3153 return TargetLowering::getPICJumpTableRelocBaseExpr(MF, JTI, Ctx);
3154 default:
3155 return MCSymbolRefExpr::create(Symbol: MF->getPICBaseSymbol(), Ctx);
3156 }
3157}
3158
3159SDValue PPCTargetLowering::LowerJumpTable(SDValue Op, SelectionDAG &DAG) const {
3160 EVT PtrVT = Op.getValueType();
3161 JumpTableSDNode *JT = cast<JumpTableSDNode>(Val&: Op);
3162
3163 // isUsingPCRelativeCalls() returns true when PCRelative is enabled
3164 if (Subtarget.isUsingPCRelativeCalls()) {
3165 SDLoc DL(JT);
3166 EVT Ty = getPointerTy(DL: DAG.getDataLayout());
3167 SDValue GA =
3168 DAG.getTargetJumpTable(JTI: JT->getIndex(), VT: Ty, TargetFlags: PPCII::MO_PCREL_FLAG);
3169 SDValue MatAddr = DAG.getNode(Opcode: PPCISD::MAT_PCREL_ADDR, DL, VT: Ty, Operand: GA);
3170 return MatAddr;
3171 }
3172
3173 // 64-bit SVR4 ABI and AIX ABI code are always position-independent.
3174 // The actual address of the GlobalValue is stored in the TOC.
3175 if (Subtarget.is64BitELFABI() || Subtarget.isAIXABI()) {
3176 setUsesTOCBasePtr(DAG);
3177 SDValue GA = DAG.getTargetJumpTable(JTI: JT->getIndex(), VT: PtrVT);
3178 return getTOCEntry(DAG, dl: SDLoc(JT), GA);
3179 }
3180
3181 unsigned MOHiFlag, MOLoFlag;
3182 bool IsPIC = isPositionIndependent();
3183 getLabelAccessInfo(IsPIC, Subtarget, HiOpFlags&: MOHiFlag, LoOpFlags&: MOLoFlag);
3184
3185 if (IsPIC && Subtarget.isSVR4ABI()) {
3186 SDValue GA = DAG.getTargetJumpTable(JTI: JT->getIndex(), VT: PtrVT,
3187 TargetFlags: PPCII::MO_PIC_FLAG);
3188 return getTOCEntry(DAG, dl: SDLoc(GA), GA);
3189 }
3190
3191 SDValue JTIHi = DAG.getTargetJumpTable(JTI: JT->getIndex(), VT: PtrVT, TargetFlags: MOHiFlag);
3192 SDValue JTILo = DAG.getTargetJumpTable(JTI: JT->getIndex(), VT: PtrVT, TargetFlags: MOLoFlag);
3193 return LowerLabelRef(HiPart: JTIHi, LoPart: JTILo, isPIC: IsPIC, DAG);
3194}
3195
3196SDValue PPCTargetLowering::LowerBlockAddress(SDValue Op,
3197 SelectionDAG &DAG) const {
3198 EVT PtrVT = Op.getValueType();
3199 BlockAddressSDNode *BASDN = cast<BlockAddressSDNode>(Val&: Op);
3200 const BlockAddress *BA = BASDN->getBlockAddress();
3201
3202 // isUsingPCRelativeCalls() returns true when PCRelative is enabled
3203 if (Subtarget.isUsingPCRelativeCalls()) {
3204 SDLoc DL(BASDN);
3205 EVT Ty = getPointerTy(DL: DAG.getDataLayout());
3206 SDValue GA = DAG.getTargetBlockAddress(BA, VT: Ty, Offset: BASDN->getOffset(),
3207 TargetFlags: PPCII::MO_PCREL_FLAG);
3208 SDValue MatAddr = DAG.getNode(Opcode: PPCISD::MAT_PCREL_ADDR, DL, VT: Ty, Operand: GA);
3209 return MatAddr;
3210 }
3211
3212 // 64-bit SVR4 ABI and AIX ABI code are always position-independent.
3213 // The actual BlockAddress is stored in the TOC.
3214 if (Subtarget.is64BitELFABI() || Subtarget.isAIXABI()) {
3215 setUsesTOCBasePtr(DAG);
3216 SDValue GA = DAG.getTargetBlockAddress(BA, VT: PtrVT, Offset: BASDN->getOffset());
3217 return getTOCEntry(DAG, dl: SDLoc(BASDN), GA);
3218 }
3219
3220 // 32-bit position-independent ELF stores the BlockAddress in the .got.
3221 if (Subtarget.is32BitELFABI() && isPositionIndependent())
3222 return getTOCEntry(
3223 DAG, dl: SDLoc(BASDN),
3224 GA: DAG.getTargetBlockAddress(BA, VT: PtrVT, Offset: BASDN->getOffset()));
3225
3226 unsigned MOHiFlag, MOLoFlag;
3227 bool IsPIC = isPositionIndependent();
3228 getLabelAccessInfo(IsPIC, Subtarget, HiOpFlags&: MOHiFlag, LoOpFlags&: MOLoFlag);
3229 SDValue TgtBAHi = DAG.getTargetBlockAddress(BA, VT: PtrVT, Offset: 0, TargetFlags: MOHiFlag);
3230 SDValue TgtBALo = DAG.getTargetBlockAddress(BA, VT: PtrVT, Offset: 0, TargetFlags: MOLoFlag);
3231 return LowerLabelRef(HiPart: TgtBAHi, LoPart: TgtBALo, isPIC: IsPIC, DAG);
3232}
3233
3234SDValue PPCTargetLowering::LowerGlobalTLSAddress(SDValue Op,
3235 SelectionDAG &DAG) const {
3236 if (Subtarget.isAIXABI())
3237 return LowerGlobalTLSAddressAIX(Op, DAG);
3238
3239 return LowerGlobalTLSAddressLinux(Op, DAG);
3240}
3241
3242/// updateForAIXShLibTLSModelOpt - Helper to initialize TLS model opt settings,
3243/// and then apply the update.
3244static void updateForAIXShLibTLSModelOpt(TLSModel::Model &Model,
3245 SelectionDAG &DAG,
3246 const TargetMachine &TM) {
3247 // Initialize TLS model opt setting lazily:
3248 // (1) Use initial-exec for single TLS var references within current function.
3249 // (2) Use local-dynamic for multiple TLS var references within current
3250 // function.
3251 PPCFunctionInfo *FuncInfo =
3252 DAG.getMachineFunction().getInfo<PPCFunctionInfo>();
3253 if (!FuncInfo->isAIXFuncTLSModelOptInitDone()) {
3254 SmallPtrSet<const GlobalValue *, 8> TLSGV;
3255 // Iterate over all instructions within current function, collect all TLS
3256 // global variables (global variables taken as the first parameter to
3257 // Intrinsic::threadlocal_address).
3258 const Function &Func = DAG.getMachineFunction().getFunction();
3259 for (const BasicBlock &BB : Func)
3260 for (const Instruction &I : BB)
3261 if (I.getOpcode() == Instruction::Call)
3262 if (const CallInst *CI = dyn_cast<const CallInst>(Val: &I))
3263 if (Function *CF = CI->getCalledFunction())
3264 if (CF->isDeclaration() &&
3265 CF->getIntrinsicID() == Intrinsic::threadlocal_address)
3266 if (const GlobalValue *GV =
3267 dyn_cast<GlobalValue>(Val: I.getOperand(i: 0))) {
3268 TLSModel::Model GVModel = TM.getTLSModel(GV);
3269 if (GVModel == TLSModel::LocalDynamic)
3270 TLSGV.insert(Ptr: GV);
3271 }
3272
3273 unsigned TLSGVCnt = TLSGV.size();
3274 LLVM_DEBUG(dbgs() << format("LocalDynamic TLSGV count:%d\n", TLSGVCnt));
3275 if (TLSGVCnt <= PPCAIXTLSModelOptUseIEForLDLimit)
3276 FuncInfo->setAIXFuncUseTLSIEForLD();
3277 FuncInfo->setAIXFuncTLSModelOptInitDone();
3278 }
3279
3280 if (FuncInfo->isAIXFuncUseTLSIEForLD()) {
3281 LLVM_DEBUG(
3282 dbgs() << DAG.getMachineFunction().getName()
3283 << " function is using the TLS-IE model for TLS-LD access.\n");
3284 Model = TLSModel::InitialExec;
3285 }
3286}
3287
3288SDValue PPCTargetLowering::LowerGlobalTLSAddressAIX(SDValue Op,
3289 SelectionDAG &DAG) const {
3290 GlobalAddressSDNode *GA = cast<GlobalAddressSDNode>(Val&: Op);
3291
3292 if (DAG.getTarget().useEmulatedTLS())
3293 report_fatal_error(reason: "Emulated TLS is not yet supported on AIX");
3294
3295 SDLoc dl(GA);
3296 const GlobalValue *GV = GA->getGlobal();
3297 EVT PtrVT = getPointerTy(DL: DAG.getDataLayout());
3298 bool Is64Bit = Subtarget.isPPC64();
3299 TLSModel::Model Model = getTargetMachine().getTLSModel(GV);
3300
3301 // Apply update to the TLS model.
3302 if (Subtarget.hasAIXShLibTLSModelOpt())
3303 updateForAIXShLibTLSModelOpt(Model, DAG, TM: getTargetMachine());
3304
3305 // TLS variables are accessed through TOC entries.
3306 // To support this, set the DAG to use the TOC base pointer.
3307 setUsesTOCBasePtr(DAG);
3308
3309 bool IsTLSLocalExecModel = Model == TLSModel::LocalExec;
3310
3311 if (IsTLSLocalExecModel || Model == TLSModel::InitialExec) {
3312 bool HasAIXSmallLocalExecTLS = Subtarget.hasAIXSmallLocalExecTLS();
3313 bool HasAIXSmallTLSGlobalAttr = false;
3314 SDValue VariableOffsetTGA =
3315 DAG.getTargetGlobalAddress(GV, DL: dl, VT: PtrVT, offset: 0, TargetFlags: PPCII::MO_TPREL_FLAG);
3316 SDValue VariableOffset = getTOCEntry(DAG, dl, GA: VariableOffsetTGA);
3317 SDValue TLSReg;
3318
3319 if (const GlobalVariable *GVar = dyn_cast<GlobalVariable>(Val: GV))
3320 if (GVar->hasAttribute(Kind: "aix-small-tls"))
3321 HasAIXSmallTLSGlobalAttr = true;
3322
3323 if (Is64Bit) {
3324 // For local-exec and initial-exec on AIX (64-bit), the sequence generated
3325 // involves a load of the variable offset (from the TOC), followed by an
3326 // add of the loaded variable offset to R13 (the thread pointer).
3327 // This code sequence looks like:
3328 // ld reg1,var[TC](2)
3329 // add reg2, reg1, r13 // r13 contains the thread pointer
3330 TLSReg = DAG.getRegister(Reg: PPC::X13, VT: MVT::i64);
3331
3332 // With the -maix-small-local-exec-tls option, or with the "aix-small-tls"
3333 // global variable attribute, produce a faster access sequence for
3334 // local-exec TLS variables where the offset from the TLS base is encoded
3335 // as an immediate operand.
3336 //
3337 // We only utilize the faster local-exec access sequence when the TLS
3338 // variable has a size within the policy limit. We treat types that are
3339 // not sized or are empty as being over the policy size limit.
3340 if ((HasAIXSmallLocalExecTLS || HasAIXSmallTLSGlobalAttr) &&
3341 IsTLSLocalExecModel) {
3342 Type *GVType = GV->getValueType();
3343 if (GVType->isSized() && !GVType->isEmptyTy() &&
3344 GV->getDataLayout().getTypeAllocSize(Ty: GVType) <=
3345 AIXSmallTlsPolicySizeLimit)
3346 return DAG.getNode(Opcode: PPCISD::Lo, DL: dl, VT: PtrVT, N1: VariableOffsetTGA, N2: TLSReg);
3347 }
3348 } else {
3349 // For local-exec and initial-exec on AIX (32-bit), the sequence generated
3350 // involves loading the variable offset from the TOC, generating a call to
3351 // .__get_tpointer to get the thread pointer (which will be in R3), and
3352 // adding the two together:
3353 // lwz reg1,var[TC](2)
3354 // bla .__get_tpointer
3355 // add reg2, reg1, r3
3356 TLSReg = DAG.getNode(Opcode: PPCISD::GET_TPOINTER, DL: dl, VT: PtrVT);
3357
3358 // We do not implement the 32-bit version of the faster access sequence
3359 // for local-exec that is controlled by the -maix-small-local-exec-tls
3360 // option, or the "aix-small-tls" global variable attribute.
3361 if (HasAIXSmallLocalExecTLS || HasAIXSmallTLSGlobalAttr)
3362 report_fatal_error(reason: "The small-local-exec TLS access sequence is "
3363 "currently only supported on AIX (64-bit mode).");
3364 }
3365 return DAG.getNode(Opcode: PPCISD::ADD_TLS, DL: dl, VT: PtrVT, N1: TLSReg, N2: VariableOffset);
3366 }
3367
3368 if (Model == TLSModel::LocalDynamic) {
3369 bool HasAIXSmallLocalDynamicTLS = Subtarget.hasAIXSmallLocalDynamicTLS();
3370
3371 // We do not implement the 32-bit version of the faster access sequence
3372 // for local-dynamic that is controlled by -maix-small-local-dynamic-tls.
3373 if (!Is64Bit && HasAIXSmallLocalDynamicTLS)
3374 report_fatal_error(reason: "The small-local-dynamic TLS access sequence is "
3375 "currently only supported on AIX (64-bit mode).");
3376
3377 // For local-dynamic on AIX, we need to generate one TOC entry for each
3378 // variable offset, and a single module-handle TOC entry for the entire
3379 // file.
3380
3381 SDValue VariableOffsetTGA =
3382 DAG.getTargetGlobalAddress(GV, DL: dl, VT: PtrVT, offset: 0, TargetFlags: PPCII::MO_TLSLD_FLAG);
3383 SDValue VariableOffset = getTOCEntry(DAG, dl, GA: VariableOffsetTGA);
3384
3385 Module *M = DAG.getMachineFunction().getFunction().getParent();
3386 GlobalVariable *TLSGV =
3387 dyn_cast_or_null<GlobalVariable>(Val: M->getOrInsertGlobal(
3388 Name: StringRef("_$TLSML"), Ty: PointerType::getUnqual(C&: *DAG.getContext())));
3389 TLSGV->setThreadLocalMode(GlobalVariable::LocalDynamicTLSModel);
3390 assert(TLSGV && "Not able to create GV for _$TLSML.");
3391 SDValue ModuleHandleTGA =
3392 DAG.getTargetGlobalAddress(GV: TLSGV, DL: dl, VT: PtrVT, offset: 0, TargetFlags: PPCII::MO_TLSLDM_FLAG);
3393 SDValue ModuleHandleTOC = getTOCEntry(DAG, dl, GA: ModuleHandleTGA);
3394 SDValue ModuleHandle =
3395 DAG.getNode(Opcode: PPCISD::TLSLD_AIX, DL: dl, VT: PtrVT, Operand: ModuleHandleTOC);
3396
3397 // With the -maix-small-local-dynamic-tls option, produce a faster access
3398 // sequence for local-dynamic TLS variables where the offset from the
3399 // module-handle is encoded as an immediate operand.
3400 //
3401 // We only utilize the faster local-dynamic access sequence when the TLS
3402 // variable has a size within the policy limit. We treat types that are
3403 // not sized or are empty as being over the policy size limit.
3404 if (HasAIXSmallLocalDynamicTLS) {
3405 Type *GVType = GV->getValueType();
3406 if (GVType->isSized() && !GVType->isEmptyTy() &&
3407 GV->getDataLayout().getTypeAllocSize(Ty: GVType) <=
3408 AIXSmallTlsPolicySizeLimit)
3409 return DAG.getNode(Opcode: PPCISD::Lo, DL: dl, VT: PtrVT, N1: VariableOffsetTGA,
3410 N2: ModuleHandle);
3411 }
3412
3413 return DAG.getNode(Opcode: ISD::ADD, DL: dl, VT: PtrVT, N1: ModuleHandle, N2: VariableOffset);
3414 }
3415
3416 // If Local- or Initial-exec or Local-dynamic is not possible or specified,
3417 // all GlobalTLSAddress nodes are lowered using the general-dynamic model. We
3418 // need to generate two TOC entries, one for the variable offset, one for the
3419 // region handle. The global address for the TOC entry of the region handle is
3420 // created with the MO_TLSGDM_FLAG flag and the global address for the TOC
3421 // entry of the variable offset is created with MO_TLSGD_FLAG.
3422 SDValue VariableOffsetTGA =
3423 DAG.getTargetGlobalAddress(GV, DL: dl, VT: PtrVT, offset: 0, TargetFlags: PPCII::MO_TLSGD_FLAG);
3424 SDValue RegionHandleTGA =
3425 DAG.getTargetGlobalAddress(GV, DL: dl, VT: PtrVT, offset: 0, TargetFlags: PPCII::MO_TLSGDM_FLAG);
3426 SDValue VariableOffset = getTOCEntry(DAG, dl, GA: VariableOffsetTGA);
3427 SDValue RegionHandle = getTOCEntry(DAG, dl, GA: RegionHandleTGA);
3428 return DAG.getNode(Opcode: PPCISD::TLSGD_AIX, DL: dl, VT: PtrVT, N1: VariableOffset,
3429 N2: RegionHandle);
3430}
3431
3432SDValue PPCTargetLowering::LowerGlobalTLSAddressLinux(SDValue Op,
3433 SelectionDAG &DAG) const {
3434 // FIXME: TLS addresses currently use medium model code sequences,
3435 // which is the most useful form. Eventually support for small and
3436 // large models could be added if users need it, at the cost of
3437 // additional complexity.
3438 GlobalAddressSDNode *GA = cast<GlobalAddressSDNode>(Val&: Op);
3439 if (DAG.getTarget().useEmulatedTLS())
3440 return LowerToTLSEmulatedModel(GA, DAG);
3441
3442 SDLoc dl(GA);
3443 const GlobalValue *GV = GA->getGlobal();
3444 EVT PtrVT = getPointerTy(DL: DAG.getDataLayout());
3445 bool is64bit = Subtarget.isPPC64();
3446 const Module *M = DAG.getMachineFunction().getFunction().getParent();
3447 PICLevel::Level picLevel = M->getPICLevel();
3448
3449 const TargetMachine &TM = getTargetMachine();
3450 TLSModel::Model Model = TM.getTLSModel(GV);
3451
3452 if (Model == TLSModel::LocalExec) {
3453 if (Subtarget.isUsingPCRelativeCalls()) {
3454 SDValue TLSReg = DAG.getRegister(Reg: PPC::X13, VT: MVT::i64);
3455 SDValue TGA = DAG.getTargetGlobalAddress(GV, DL: dl, VT: PtrVT, offset: 0,
3456 TargetFlags: PPCII::MO_TPREL_PCREL_FLAG);
3457 SDValue MatAddr =
3458 DAG.getNode(Opcode: PPCISD::TLS_LOCAL_EXEC_MAT_ADDR, DL: dl, VT: PtrVT, Operand: TGA);
3459 return DAG.getNode(Opcode: PPCISD::ADD_TLS, DL: dl, VT: PtrVT, N1: TLSReg, N2: MatAddr);
3460 }
3461
3462 SDValue TGAHi = DAG.getTargetGlobalAddress(GV, DL: dl, VT: PtrVT, offset: 0,
3463 TargetFlags: PPCII::MO_TPREL_HA);
3464 SDValue TGALo = DAG.getTargetGlobalAddress(GV, DL: dl, VT: PtrVT, offset: 0,
3465 TargetFlags: PPCII::MO_TPREL_LO);
3466 SDValue TLSReg = is64bit ? DAG.getRegister(Reg: PPC::X13, VT: MVT::i64)
3467 : DAG.getRegister(Reg: PPC::R2, VT: MVT::i32);
3468
3469 SDValue Hi = DAG.getNode(Opcode: PPCISD::Hi, DL: dl, VT: PtrVT, N1: TGAHi, N2: TLSReg);
3470 return DAG.getNode(Opcode: PPCISD::Lo, DL: dl, VT: PtrVT, N1: TGALo, N2: Hi);
3471 }
3472
3473 if (Model == TLSModel::InitialExec) {
3474 bool IsPCRel = Subtarget.isUsingPCRelativeCalls();
3475 SDValue TGA = DAG.getTargetGlobalAddress(
3476 GV, DL: dl, VT: PtrVT, offset: 0, TargetFlags: IsPCRel ? PPCII::MO_GOT_TPREL_PCREL_FLAG : 0);
3477 SDValue TGATLS = DAG.getTargetGlobalAddress(
3478 GV, DL: dl, VT: PtrVT, offset: 0, TargetFlags: IsPCRel ? PPCII::MO_TLS_PCREL_FLAG : PPCII::MO_TLS);
3479 SDValue TPOffset;
3480 if (IsPCRel) {
3481 SDValue MatPCRel = DAG.getNode(Opcode: PPCISD::MAT_PCREL_ADDR, DL: dl, VT: PtrVT, Operand: TGA);
3482 TPOffset = DAG.getLoad(VT: MVT::i64, dl, Chain: DAG.getEntryNode(), Ptr: MatPCRel,
3483 PtrInfo: MachinePointerInfo());
3484 } else {
3485 SDValue GOTPtr;
3486 if (is64bit) {
3487 setUsesTOCBasePtr(DAG);
3488 SDValue GOTReg = DAG.getRegister(Reg: PPC::X2, VT: MVT::i64);
3489 GOTPtr =
3490 DAG.getNode(Opcode: PPCISD::ADDIS_GOT_TPREL_HA, DL: dl, VT: PtrVT, N1: GOTReg, N2: TGA);
3491 } else {
3492 if (!TM.isPositionIndependent())
3493 GOTPtr = DAG.getNode(Opcode: PPCISD::PPC32_GOT, DL: dl, VT: PtrVT);
3494 else if (picLevel == PICLevel::SmallPIC)
3495 GOTPtr = DAG.getNode(Opcode: PPCISD::GlobalBaseReg, DL: dl, VT: PtrVT);
3496 else
3497 GOTPtr = DAG.getNode(Opcode: PPCISD::PPC32_PICGOT, DL: dl, VT: PtrVT);
3498 }
3499 TPOffset = DAG.getNode(Opcode: PPCISD::LD_GOT_TPREL_L, DL: dl, VT: PtrVT, N1: TGA, N2: GOTPtr);
3500 }
3501 return DAG.getNode(Opcode: PPCISD::ADD_TLS, DL: dl, VT: PtrVT, N1: TPOffset, N2: TGATLS);
3502 }
3503
3504 if (Model == TLSModel::GeneralDynamic) {
3505 if (Subtarget.isUsingPCRelativeCalls()) {
3506 SDValue TGA = DAG.getTargetGlobalAddress(GV, DL: dl, VT: PtrVT, offset: 0,
3507 TargetFlags: PPCII::MO_GOT_TLSGD_PCREL_FLAG);
3508 return DAG.getNode(Opcode: PPCISD::TLS_DYNAMIC_MAT_PCREL_ADDR, DL: dl, VT: PtrVT, Operand: TGA);
3509 }
3510
3511 SDValue TGA = DAG.getTargetGlobalAddress(GV, DL: dl, VT: PtrVT, offset: 0, TargetFlags: 0);
3512 SDValue GOTPtr;
3513 if (is64bit) {
3514 setUsesTOCBasePtr(DAG);
3515 SDValue GOTReg = DAG.getRegister(Reg: PPC::X2, VT: MVT::i64);
3516 GOTPtr = DAG.getNode(Opcode: PPCISD::ADDIS_TLSGD_HA, DL: dl, VT: PtrVT,
3517 N1: GOTReg, N2: TGA);
3518 } else {
3519 if (picLevel == PICLevel::SmallPIC)
3520 GOTPtr = DAG.getNode(Opcode: PPCISD::GlobalBaseReg, DL: dl, VT: PtrVT);
3521 else
3522 GOTPtr = DAG.getNode(Opcode: PPCISD::PPC32_PICGOT, DL: dl, VT: PtrVT);
3523 }
3524 return DAG.getNode(Opcode: PPCISD::ADDI_TLSGD_L_ADDR, DL: dl, VT: PtrVT,
3525 N1: GOTPtr, N2: TGA, N3: TGA);
3526 }
3527
3528 if (Model == TLSModel::LocalDynamic) {
3529 if (Subtarget.isUsingPCRelativeCalls()) {
3530 SDValue TGA = DAG.getTargetGlobalAddress(GV, DL: dl, VT: PtrVT, offset: 0,
3531 TargetFlags: PPCII::MO_GOT_TLSLD_PCREL_FLAG);
3532 SDValue MatPCRel =
3533 DAG.getNode(Opcode: PPCISD::TLS_DYNAMIC_MAT_PCREL_ADDR, DL: dl, VT: PtrVT, Operand: TGA);
3534 return DAG.getNode(Opcode: PPCISD::PADDI_DTPREL, DL: dl, VT: PtrVT, N1: MatPCRel, N2: TGA);
3535 }
3536
3537 SDValue TGA = DAG.getTargetGlobalAddress(GV, DL: dl, VT: PtrVT, offset: 0, TargetFlags: 0);
3538 SDValue GOTPtr;
3539 if (is64bit) {
3540 setUsesTOCBasePtr(DAG);
3541 SDValue GOTReg = DAG.getRegister(Reg: PPC::X2, VT: MVT::i64);
3542 GOTPtr = DAG.getNode(Opcode: PPCISD::ADDIS_TLSLD_HA, DL: dl, VT: PtrVT,
3543 N1: GOTReg, N2: TGA);
3544 } else {
3545 if (picLevel == PICLevel::SmallPIC)
3546 GOTPtr = DAG.getNode(Opcode: PPCISD::GlobalBaseReg, DL: dl, VT: PtrVT);
3547 else
3548 GOTPtr = DAG.getNode(Opcode: PPCISD::PPC32_PICGOT, DL: dl, VT: PtrVT);
3549 }
3550 SDValue TLSAddr = DAG.getNode(Opcode: PPCISD::ADDI_TLSLD_L_ADDR, DL: dl,
3551 VT: PtrVT, N1: GOTPtr, N2: TGA, N3: TGA);
3552 SDValue DtvOffsetHi = DAG.getNode(Opcode: PPCISD::ADDIS_DTPREL_HA, DL: dl,
3553 VT: PtrVT, N1: TLSAddr, N2: TGA);
3554 return DAG.getNode(Opcode: PPCISD::ADDI_DTPREL_L, DL: dl, VT: PtrVT, N1: DtvOffsetHi, N2: TGA);
3555 }
3556
3557 llvm_unreachable("Unknown TLS model!");
3558}
3559
3560SDValue PPCTargetLowering::LowerGlobalAddress(SDValue Op,
3561 SelectionDAG &DAG) const {
3562 EVT PtrVT = Op.getValueType();
3563 GlobalAddressSDNode *GSDN = cast<GlobalAddressSDNode>(Val&: Op);
3564 SDLoc DL(GSDN);
3565 const GlobalValue *GV = GSDN->getGlobal();
3566
3567 // 64-bit SVR4 ABI & AIX ABI code is always position-independent.
3568 // The actual address of the GlobalValue is stored in the TOC.
3569 if (Subtarget.is64BitELFABI() || Subtarget.isAIXABI()) {
3570 if (Subtarget.isUsingPCRelativeCalls()) {
3571 EVT Ty = getPointerTy(DL: DAG.getDataLayout());
3572 if (isAccessedAsGotIndirect(N: Op)) {
3573 SDValue GA = DAG.getTargetGlobalAddress(GV, DL, VT: Ty, offset: GSDN->getOffset(),
3574 TargetFlags: PPCII::MO_GOT_PCREL_FLAG);
3575 SDValue MatPCRel = DAG.getNode(Opcode: PPCISD::MAT_PCREL_ADDR, DL, VT: Ty, Operand: GA);
3576 SDValue Load = DAG.getLoad(VT: MVT::i64, dl: DL, Chain: DAG.getEntryNode(), Ptr: MatPCRel,
3577 PtrInfo: MachinePointerInfo());
3578 return Load;
3579 } else {
3580 SDValue GA = DAG.getTargetGlobalAddress(GV, DL, VT: Ty, offset: GSDN->getOffset(),
3581 TargetFlags: PPCII::MO_PCREL_FLAG);
3582 return DAG.getNode(Opcode: PPCISD::MAT_PCREL_ADDR, DL, VT: Ty, Operand: GA);
3583 }
3584 }
3585 setUsesTOCBasePtr(DAG);
3586 SDValue GA = DAG.getTargetGlobalAddress(GV, DL, VT: PtrVT, offset: GSDN->getOffset());
3587 return getTOCEntry(DAG, dl: DL, GA);
3588 }
3589
3590 unsigned MOHiFlag, MOLoFlag;
3591 bool IsPIC = isPositionIndependent();
3592 getLabelAccessInfo(IsPIC, Subtarget, HiOpFlags&: MOHiFlag, LoOpFlags&: MOLoFlag, GV);
3593
3594 if (IsPIC && Subtarget.isSVR4ABI()) {
3595 SDValue GA = DAG.getTargetGlobalAddress(GV, DL, VT: PtrVT,
3596 offset: GSDN->getOffset(),
3597 TargetFlags: PPCII::MO_PIC_FLAG);
3598 return getTOCEntry(DAG, dl: DL, GA);
3599 }
3600
3601 SDValue GAHi =
3602 DAG.getTargetGlobalAddress(GV, DL, VT: PtrVT, offset: GSDN->getOffset(), TargetFlags: MOHiFlag);
3603 SDValue GALo =
3604 DAG.getTargetGlobalAddress(GV, DL, VT: PtrVT, offset: GSDN->getOffset(), TargetFlags: MOLoFlag);
3605
3606 return LowerLabelRef(HiPart: GAHi, LoPart: GALo, isPIC: IsPIC, DAG);
3607}
3608
3609SDValue PPCTargetLowering::LowerSETCC(SDValue Op, SelectionDAG &DAG) const {
3610 bool IsStrict = Op->isStrictFPOpcode();
3611 const SDNodeFlags Flags = Op.getNode()->getFlags();
3612 ISD::CondCode CC =
3613 cast<CondCodeSDNode>(Val: Op.getOperand(i: IsStrict ? 3 : 2))->get();
3614 SDValue LHS = Op.getOperand(i: IsStrict ? 1 : 0);
3615 SDValue RHS = Op.getOperand(i: IsStrict ? 2 : 1);
3616 SDValue Chain = IsStrict ? Op.getOperand(i: 0) : SDValue();
3617 EVT LHSVT = LHS.getValueType();
3618 SDLoc dl(Op);
3619
3620 // Soften the setcc with libcall if it is fp128 or it is SPE and fp32/fp64.
3621 if (LHSVT == MVT::f128 ||
3622 (Subtarget.hasSPE() && (LHSVT == MVT::f32 || LHSVT == MVT::f64) &&
3623 (!Flags.hasNoNaNs() || !Flags.hasNoInfs()))) {
3624 assert(!Subtarget.hasP9Vector() &&
3625 "SETCC for f128 is already legal under Power9!");
3626 softenSetCCOperands(DAG, VT: LHSVT, NewLHS&: LHS, NewRHS&: RHS, CCCode&: CC, DL: dl, OldLHS: LHS, OldRHS: RHS, Chain,
3627 IsSignaling: Op->getOpcode() == ISD::STRICT_FSETCCS);
3628 if (RHS.getNode())
3629 LHS = DAG.getNode(Opcode: ISD::SETCC, DL: dl, VT: Op.getValueType(), N1: LHS, N2: RHS,
3630 N3: DAG.getCondCode(Cond: CC));
3631 if (IsStrict)
3632 return DAG.getMergeValues(Ops: {LHS, Chain}, dl);
3633 return LHS;
3634 } else if (LHSVT == MVT::f32 || LHSVT == MVT::f64) {
3635 return Op;
3636 }
3637
3638 assert(!IsStrict && "Don't know how to handle STRICT_FSETCC!");
3639
3640 if (Op.getValueType() == MVT::v2i64) {
3641 // When the operands themselves are v2i64 values, we need to do something
3642 // special because VSX has no underlying comparison operations for these.
3643 if (LHS.getValueType() == MVT::v2i64) {
3644 // Equality can be handled by casting to the legal type for Altivec
3645 // comparisons, everything else needs to be expanded.
3646 if (CC != ISD::SETEQ && CC != ISD::SETNE)
3647 return SDValue();
3648 SDValue SetCC32 = DAG.getSetCC(
3649 DL: dl, VT: MVT::v4i32, LHS: DAG.getNode(Opcode: ISD::BITCAST, DL: dl, VT: MVT::v4i32, Operand: LHS),
3650 RHS: DAG.getNode(Opcode: ISD::BITCAST, DL: dl, VT: MVT::v4i32, Operand: RHS), Cond: CC);
3651 int ShuffV[] = {1, 0, 3, 2};
3652 SDValue Shuff =
3653 DAG.getVectorShuffle(VT: MVT::v4i32, dl, N1: SetCC32, N2: SetCC32, Mask: ShuffV);
3654 return DAG.getBitcast(VT: MVT::v2i64,
3655 V: DAG.getNode(Opcode: CC == ISD::SETEQ ? ISD::AND : ISD::OR,
3656 DL: dl, VT: MVT::v4i32, N1: Shuff, N2: SetCC32));
3657 }
3658
3659 // We handle most of these in the usual way.
3660 return Op;
3661 }
3662
3663 // If we're comparing for equality to zero, expose the fact that this is
3664 // implemented as a ctlz/srl pair on ppc, so that the dag combiner can
3665 // fold the new nodes.
3666 if (SDValue V = lowerCmpEqZeroToCtlzSrl(Op, DAG))
3667 return V;
3668
3669 if (ConstantSDNode *C = dyn_cast<ConstantSDNode>(Val&: RHS)) {
3670 // Leave comparisons against 0 and -1 alone for now, since they're usually
3671 // optimized. FIXME: revisit this when we can custom lower all setcc
3672 // optimizations.
3673 if (C->isAllOnes() || C->isZero())
3674 return SDValue();
3675 }
3676
3677 // If we have an integer seteq/setne, turn it into a compare against zero
3678 // by xor'ing the rhs with the lhs, which is faster than setting a
3679 // condition register, reading it back out, and masking the correct bit. The
3680 // normal approach here uses sub to do this instead of xor. Using xor exposes
3681 // the result to other bit-twiddling opportunities.
3682 if (LHSVT.isInteger() && (CC == ISD::SETEQ || CC == ISD::SETNE)) {
3683 EVT VT = Op.getValueType();
3684 SDValue Sub = DAG.getNode(Opcode: ISD::XOR, DL: dl, VT: LHSVT, N1: LHS, N2: RHS);
3685 return DAG.getSetCC(DL: dl, VT, LHS: Sub, RHS: DAG.getConstant(Val: 0, DL: dl, VT: LHSVT), Cond: CC);
3686 }
3687 return SDValue();
3688}
3689
3690SDValue PPCTargetLowering::LowerBR_CC(SDValue Op, SelectionDAG &DAG) const {
3691 const SDNodeFlags Flags = Op->getFlags();
3692 SDValue Chain = Op.getOperand(i: 0);
3693 ISD::CondCode CC = cast<CondCodeSDNode>(Val: Op.getOperand(i: 1))->get();
3694 SDValue LHS = Op.getOperand(i: 2);
3695 SDValue RHS = Op.getOperand(i: 3);
3696 SDValue Dest = Op.getOperand(i: 4);
3697 EVT LHSVT = LHS.getValueType();
3698 SDLoc dl(Op);
3699
3700 assert(Subtarget.hasSPE() && "LowerBR_CC used only for targets with SPE");
3701
3702 if ((LHSVT == MVT::f32 || LHSVT == MVT::f64) && Flags.hasNoNaNs() &&
3703 Flags.hasNoInfs())
3704 return Op;
3705
3706 softenSetCCOperands(DAG, VT: LHSVT, NewLHS&: LHS, NewRHS&: RHS, CCCode&: CC, DL: dl, OldLHS: LHS, OldRHS: RHS);
3707
3708 // If softenSetCCOperands returned a scalar, we need to compare the result
3709 // against zero to select between true and false values.
3710 if (!RHS) {
3711 RHS = DAG.getConstant(Val: 0, DL: dl, VT: LHSVT);
3712 CC = ISD::SETNE;
3713 }
3714
3715 return DAG.getNode(Opcode: ISD::BR_CC, DL: dl, VT: Op.getValueType(), N1: Chain,
3716 N2: DAG.getCondCode(Cond: CC), N3: LHS, N4: RHS, N5: Dest);
3717}
3718
3719SDValue PPCTargetLowering::LowerVAARG(SDValue Op, SelectionDAG &DAG) const {
3720 SDNode *Node = Op.getNode();
3721 EVT VT = Node->getValueType(ResNo: 0);
3722 EVT PtrVT = getPointerTy(DL: DAG.getDataLayout());
3723 SDValue InChain = Node->getOperand(Num: 0);
3724 SDValue VAListPtr = Node->getOperand(Num: 1);
3725 const Value *SV = cast<SrcValueSDNode>(Val: Node->getOperand(Num: 2))->getValue();
3726 SDLoc dl(Node);
3727
3728 assert(!Subtarget.isPPC64() && "LowerVAARG is PPC32 only");
3729
3730 // gpr_index
3731 SDValue GprIndex = DAG.getExtLoad(ExtType: ISD::ZEXTLOAD, dl, VT: MVT::i32, Chain: InChain,
3732 Ptr: VAListPtr, PtrInfo: MachinePointerInfo(SV), MemVT: MVT::i8);
3733 InChain = GprIndex.getValue(R: 1);
3734
3735 if (VT == MVT::i64) {
3736 // Check if GprIndex is even
3737 SDValue GprAnd = DAG.getNode(Opcode: ISD::AND, DL: dl, VT: MVT::i32, N1: GprIndex,
3738 N2: DAG.getConstant(Val: 1, DL: dl, VT: MVT::i32));
3739 SDValue CC64 = DAG.getSetCC(DL: dl, VT: MVT::i32, LHS: GprAnd,
3740 RHS: DAG.getConstant(Val: 0, DL: dl, VT: MVT::i32), Cond: ISD::SETNE);
3741 SDValue GprIndexPlusOne = DAG.getNode(Opcode: ISD::ADD, DL: dl, VT: MVT::i32, N1: GprIndex,
3742 N2: DAG.getConstant(Val: 1, DL: dl, VT: MVT::i32));
3743 // Align GprIndex to be even if it isn't
3744 GprIndex = DAG.getNode(Opcode: ISD::SELECT, DL: dl, VT: MVT::i32, N1: CC64, N2: GprIndexPlusOne,
3745 N3: GprIndex);
3746 }
3747
3748 // fpr index is 1 byte after gpr
3749 SDValue FprPtr = DAG.getNode(Opcode: ISD::ADD, DL: dl, VT: PtrVT, N1: VAListPtr,
3750 N2: DAG.getConstant(Val: 1, DL: dl, VT: MVT::i32));
3751
3752 // fpr
3753 SDValue FprIndex = DAG.getExtLoad(ExtType: ISD::ZEXTLOAD, dl, VT: MVT::i32, Chain: InChain,
3754 Ptr: FprPtr, PtrInfo: MachinePointerInfo(SV), MemVT: MVT::i8);
3755 InChain = FprIndex.getValue(R: 1);
3756
3757 SDValue RegSaveAreaPtr = DAG.getNode(Opcode: ISD::ADD, DL: dl, VT: PtrVT, N1: VAListPtr,
3758 N2: DAG.getConstant(Val: 8, DL: dl, VT: MVT::i32));
3759
3760 SDValue OverflowAreaPtr = DAG.getNode(Opcode: ISD::ADD, DL: dl, VT: PtrVT, N1: VAListPtr,
3761 N2: DAG.getConstant(Val: 4, DL: dl, VT: MVT::i32));
3762
3763 // areas
3764 SDValue OverflowArea =
3765 DAG.getLoad(VT: MVT::i32, dl, Chain: InChain, Ptr: OverflowAreaPtr, PtrInfo: MachinePointerInfo());
3766 InChain = OverflowArea.getValue(R: 1);
3767
3768 SDValue RegSaveArea =
3769 DAG.getLoad(VT: MVT::i32, dl, Chain: InChain, Ptr: RegSaveAreaPtr, PtrInfo: MachinePointerInfo());
3770 InChain = RegSaveArea.getValue(R: 1);
3771
3772 // select overflow_area if index > 8
3773 SDValue CC = DAG.getSetCC(DL: dl, VT: MVT::i32, LHS: VT.isInteger() ? GprIndex : FprIndex,
3774 RHS: DAG.getConstant(Val: 8, DL: dl, VT: MVT::i32), Cond: ISD::SETLT);
3775
3776 // adjustment constant gpr_index * 4/8
3777 SDValue RegConstant = DAG.getNode(Opcode: ISD::MUL, DL: dl, VT: MVT::i32,
3778 N1: VT.isInteger() ? GprIndex : FprIndex,
3779 N2: DAG.getConstant(Val: VT.isInteger() ? 4 : 8, DL: dl,
3780 VT: MVT::i32));
3781
3782 // OurReg = RegSaveArea + RegConstant
3783 SDValue OurReg = DAG.getNode(Opcode: ISD::ADD, DL: dl, VT: PtrVT, N1: RegSaveArea,
3784 N2: RegConstant);
3785
3786 // Floating types are 32 bytes into RegSaveArea
3787 if (VT.isFloatingPoint())
3788 OurReg = DAG.getNode(Opcode: ISD::ADD, DL: dl, VT: PtrVT, N1: OurReg,
3789 N2: DAG.getConstant(Val: 32, DL: dl, VT: MVT::i32));
3790
3791 // increase {f,g}pr_index by 1 (or 2 if VT is i64)
3792 SDValue IndexPlus1 = DAG.getNode(Opcode: ISD::ADD, DL: dl, VT: MVT::i32,
3793 N1: VT.isInteger() ? GprIndex : FprIndex,
3794 N2: DAG.getConstant(Val: VT == MVT::i64 ? 2 : 1, DL: dl,
3795 VT: MVT::i32));
3796
3797 InChain = DAG.getTruncStore(Chain: InChain, dl, Val: IndexPlus1,
3798 Ptr: VT.isInteger() ? VAListPtr : FprPtr,
3799 PtrInfo: MachinePointerInfo(SV), SVT: MVT::i8);
3800
3801 // determine if we should load from reg_save_area or overflow_area
3802 SDValue Result = DAG.getNode(Opcode: ISD::SELECT, DL: dl, VT: PtrVT, N1: CC, N2: OurReg, N3: OverflowArea);
3803
3804 // increase overflow_area by 4/8 if gpr/fpr > 8
3805 SDValue OverflowAreaPlusN = DAG.getNode(Opcode: ISD::ADD, DL: dl, VT: PtrVT, N1: OverflowArea,
3806 N2: DAG.getConstant(Val: VT.isInteger() ? 4 : 8,
3807 DL: dl, VT: MVT::i32));
3808
3809 OverflowArea = DAG.getNode(Opcode: ISD::SELECT, DL: dl, VT: MVT::i32, N1: CC, N2: OverflowArea,
3810 N3: OverflowAreaPlusN);
3811
3812 InChain = DAG.getTruncStore(Chain: InChain, dl, Val: OverflowArea, Ptr: OverflowAreaPtr,
3813 PtrInfo: MachinePointerInfo(), SVT: MVT::i32);
3814
3815 return DAG.getLoad(VT, dl, Chain: InChain, Ptr: Result, PtrInfo: MachinePointerInfo());
3816}
3817
3818SDValue PPCTargetLowering::LowerVACOPY(SDValue Op, SelectionDAG &DAG) const {
3819 assert(!Subtarget.isPPC64() && "LowerVACOPY is PPC32 only");
3820
3821 // We have to copy the entire va_list struct:
3822 // 2*sizeof(char) + 2 Byte alignment + 2*sizeof(char*) = 12 Byte
3823 return DAG.getMemcpy(Chain: Op.getOperand(i: 0), dl: Op, Dst: Op.getOperand(i: 1), Src: Op.getOperand(i: 2),
3824 Size: DAG.getConstant(Val: 12, DL: SDLoc(Op), VT: MVT::i32), DstAlign: Align(8),
3825 SrcAlign: Align(8), isVol: false, AlwaysInline: true, /*CI=*/nullptr, OverrideTailCall: std::nullopt,
3826 DstPtrInfo: MachinePointerInfo(), SrcPtrInfo: MachinePointerInfo());
3827}
3828
3829SDValue PPCTargetLowering::LowerADJUST_TRAMPOLINE(SDValue Op,
3830 SelectionDAG &DAG) const {
3831 return Op.getOperand(i: 0);
3832}
3833
3834SDValue PPCTargetLowering::LowerINLINEASM(SDValue Op, SelectionDAG &DAG) const {
3835 MachineFunction &MF = DAG.getMachineFunction();
3836 PPCFunctionInfo &MFI = *MF.getInfo<PPCFunctionInfo>();
3837
3838 assert((Op.getOpcode() == ISD::INLINEASM ||
3839 Op.getOpcode() == ISD::INLINEASM_BR) &&
3840 "Expecting Inline ASM node.");
3841
3842 // If an LR store is already known to be required then there is not point in
3843 // checking this ASM as well.
3844 if (MFI.isLRStoreRequired())
3845 return Op;
3846
3847 // Inline ASM nodes have an optional last operand that is an incoming Flag of
3848 // type MVT::Glue. We want to ignore this last operand if that is the case.
3849 unsigned NumOps = Op.getNumOperands();
3850 if (Op.getOperand(i: NumOps - 1).getValueType() == MVT::Glue)
3851 --NumOps;
3852
3853 // Check all operands that may contain the LR.
3854 for (unsigned i = InlineAsm::Op_FirstOperand; i != NumOps;) {
3855 const InlineAsm::Flag Flags(Op.getConstantOperandVal(i));
3856 unsigned NumVals = Flags.getNumOperandRegisters();
3857 ++i; // Skip the ID value.
3858
3859 switch (Flags.getKind()) {
3860 default:
3861 llvm_unreachable("Bad flags!");
3862 case InlineAsm::Kind::RegUse:
3863 case InlineAsm::Kind::Imm:
3864 case InlineAsm::Kind::Mem:
3865 i += NumVals;
3866 break;
3867 case InlineAsm::Kind::Clobber:
3868 case InlineAsm::Kind::RegDef:
3869 case InlineAsm::Kind::RegDefEarlyClobber: {
3870 for (; NumVals; --NumVals, ++i) {
3871 Register Reg = cast<RegisterSDNode>(Val: Op.getOperand(i))->getReg();
3872 if (Reg != PPC::LR && Reg != PPC::LR8)
3873 continue;
3874 MFI.setLRStoreRequired();
3875 return Op;
3876 }
3877 break;
3878 }
3879 }
3880 }
3881
3882 return Op;
3883}
3884
3885SDValue PPCTargetLowering::LowerINIT_TRAMPOLINE(SDValue Op,
3886 SelectionDAG &DAG) const {
3887 SDValue Chain = Op.getOperand(i: 0);
3888 SDValue Trmp = Op.getOperand(i: 1); // trampoline
3889 SDValue FPtr = Op.getOperand(i: 2); // nested function
3890 SDValue Nest = Op.getOperand(i: 3); // 'nest' parameter value
3891 SDLoc dl(Op);
3892
3893 EVT PtrVT = getPointerTy(DL: DAG.getDataLayout());
3894
3895 if (Subtarget.isAIXABI()) {
3896 // On AIX we create a trampoline descriptor by combining the
3897 // entry point and TOC from the global descriptor (FPtr) with the
3898 // nest argument as the environment pointer.
3899 uint64_t PointerSize = Subtarget.isPPC64() ? 8 : 4;
3900 MaybeAlign PointerAlign(PointerSize);
3901 auto MMOFlags = Subtarget.hasInvariantFunctionDescriptors()
3902 ? (MachineMemOperand::MODereferenceable |
3903 MachineMemOperand::MOInvariant)
3904 : MachineMemOperand::MONone;
3905
3906 uint64_t TOCPointerOffset = 1 * PointerSize;
3907 uint64_t EnvPointerOffset = 2 * PointerSize;
3908 SDValue SDTOCPtrOffset = DAG.getConstant(Val: TOCPointerOffset, DL: dl, VT: PtrVT);
3909 SDValue SDEnvPtrOffset = DAG.getConstant(Val: EnvPointerOffset, DL: dl, VT: PtrVT);
3910
3911 const Value *TrampolineAddr =
3912 cast<SrcValueSDNode>(Val: Op.getOperand(i: 4))->getValue();
3913 const Function *Func =
3914 cast<Function>(Val: cast<SrcValueSDNode>(Val: Op.getOperand(i: 5))->getValue());
3915
3916 SDValue OutChains[3];
3917
3918 // Copy the entry point address from the global descriptor to the
3919 // trampoline buffer.
3920 SDValue LoadEntryPoint =
3921 DAG.getLoad(VT: PtrVT, dl, Chain, Ptr: FPtr, PtrInfo: MachinePointerInfo(Func, 0),
3922 Alignment: PointerAlign, MMOFlags);
3923 SDValue EPLoadChain = LoadEntryPoint.getValue(R: 1);
3924 OutChains[0] = DAG.getStore(Chain: EPLoadChain, dl, Val: LoadEntryPoint, Ptr: Trmp,
3925 PtrInfo: MachinePointerInfo(TrampolineAddr, 0));
3926
3927 // Copy the TOC pointer from the global descriptor to the trampoline
3928 // buffer.
3929 SDValue TOCFromDescriptorPtr =
3930 DAG.getNode(Opcode: ISD::ADD, DL: dl, VT: PtrVT, N1: FPtr, N2: SDTOCPtrOffset);
3931 SDValue TOCReg = DAG.getLoad(VT: PtrVT, dl, Chain, Ptr: TOCFromDescriptorPtr,
3932 PtrInfo: MachinePointerInfo(Func, TOCPointerOffset),
3933 Alignment: PointerAlign, MMOFlags);
3934 SDValue TrampolineTOCPointer =
3935 DAG.getNode(Opcode: ISD::ADD, DL: dl, VT: PtrVT, N1: Trmp, N2: SDTOCPtrOffset);
3936 SDValue TOCLoadChain = TOCReg.getValue(R: 1);
3937 OutChains[1] =
3938 DAG.getStore(Chain: TOCLoadChain, dl, Val: TOCReg, Ptr: TrampolineTOCPointer,
3939 PtrInfo: MachinePointerInfo(TrampolineAddr, TOCPointerOffset));
3940
3941 // Store the nest argument into the environment pointer in the trampoline
3942 // buffer.
3943 SDValue EnvPointer = DAG.getNode(Opcode: ISD::ADD, DL: dl, VT: PtrVT, N1: Trmp, N2: SDEnvPtrOffset);
3944 OutChains[2] =
3945 DAG.getStore(Chain, dl, Val: Nest, Ptr: EnvPointer,
3946 PtrInfo: MachinePointerInfo(TrampolineAddr, EnvPointerOffset));
3947
3948 SDValue TokenFactor =
3949 DAG.getNode(Opcode: ISD::TokenFactor, DL: dl, VT: MVT::Other, Ops: OutChains);
3950 return TokenFactor;
3951 }
3952
3953 bool isPPC64 = (PtrVT == MVT::i64);
3954 Type *IntPtrTy = DAG.getDataLayout().getIntPtrType(C&: *DAG.getContext());
3955
3956 TargetLowering::ArgListTy Args;
3957 Args.emplace_back(args&: Trmp, args&: IntPtrTy);
3958 // TrampSize == (isPPC64 ? 48 : 40);
3959 Args.emplace_back(
3960 args: DAG.getConstant(Val: isPPC64 ? 48 : 40, DL: dl, VT: Subtarget.getScalarIntVT()),
3961 args&: IntPtrTy);
3962 Args.emplace_back(args&: FPtr, args&: IntPtrTy);
3963 Args.emplace_back(args&: Nest, args&: IntPtrTy);
3964
3965 // Lower to a call to __trampoline_setup(Trmp, TrampSize, FPtr, ctx_reg)
3966 TargetLowering::CallLoweringInfo CLI(DAG);
3967 CLI.setDebugLoc(dl).setChain(Chain).setLibCallee(
3968 CC: CallingConv::C, ResultType: Type::getVoidTy(C&: *DAG.getContext()),
3969 Target: DAG.getExternalSymbol(Sym: "__trampoline_setup", VT: PtrVT), ArgsList: std::move(Args));
3970
3971 std::pair<SDValue, SDValue> CallResult = LowerCallTo(CLI);
3972 return CallResult.second;
3973}
3974
3975SDValue PPCTargetLowering::LowerVASTART(SDValue Op, SelectionDAG &DAG) const {
3976 MachineFunction &MF = DAG.getMachineFunction();
3977 PPCFunctionInfo *FuncInfo = MF.getInfo<PPCFunctionInfo>();
3978 EVT PtrVT = getPointerTy(DL: MF.getDataLayout());
3979
3980 SDLoc dl(Op);
3981
3982 if (Subtarget.isPPC64() || Subtarget.isAIXABI()) {
3983 // vastart just stores the address of the VarArgsFrameIndex slot into the
3984 // memory location argument.
3985 SDValue FR = DAG.getFrameIndex(FI: FuncInfo->getVarArgsFrameIndex(), VT: PtrVT);
3986 const Value *SV = cast<SrcValueSDNode>(Val: Op.getOperand(i: 2))->getValue();
3987 return DAG.getStore(Chain: Op.getOperand(i: 0), dl, Val: FR, Ptr: Op.getOperand(i: 1),
3988 PtrInfo: MachinePointerInfo(SV));
3989 }
3990
3991 // For the 32-bit SVR4 ABI we follow the layout of the va_list struct.
3992 // We suppose the given va_list is already allocated.
3993 //
3994 // typedef struct {
3995 // char gpr; /* index into the array of 8 GPRs
3996 // * stored in the register save area
3997 // * gpr=0 corresponds to r3,
3998 // * gpr=1 to r4, etc.
3999 // */
4000 // char fpr; /* index into the array of 8 FPRs
4001 // * stored in the register save area
4002 // * fpr=0 corresponds to f1,
4003 // * fpr=1 to f2, etc.
4004 // */
4005 // char *overflow_arg_area;
4006 // /* location on stack that holds
4007 // * the next overflow argument
4008 // */
4009 // char *reg_save_area;
4010 // /* where r3:r10 and f1:f8 (if saved)
4011 // * are stored
4012 // */
4013 // } va_list[1];
4014
4015 SDValue ArgGPR = DAG.getConstant(Val: FuncInfo->getVarArgsNumGPR(), DL: dl, VT: MVT::i32);
4016 SDValue ArgFPR = DAG.getConstant(Val: FuncInfo->getVarArgsNumFPR(), DL: dl, VT: MVT::i32);
4017 SDValue StackOffsetFI = DAG.getFrameIndex(FI: FuncInfo->getVarArgsStackOffset(),
4018 VT: PtrVT);
4019 SDValue FR = DAG.getFrameIndex(FI: FuncInfo->getVarArgsFrameIndex(),
4020 VT: PtrVT);
4021
4022 uint64_t FrameOffset = PtrVT.getSizeInBits()/8;
4023 SDValue ConstFrameOffset = DAG.getConstant(Val: FrameOffset, DL: dl, VT: PtrVT);
4024
4025 uint64_t StackOffset = PtrVT.getSizeInBits()/8 - 1;
4026 SDValue ConstStackOffset = DAG.getConstant(Val: StackOffset, DL: dl, VT: PtrVT);
4027
4028 uint64_t FPROffset = 1;
4029 SDValue ConstFPROffset = DAG.getConstant(Val: FPROffset, DL: dl, VT: PtrVT);
4030
4031 const Value *SV = cast<SrcValueSDNode>(Val: Op.getOperand(i: 2))->getValue();
4032
4033 // Store first byte : number of int regs
4034 SDValue firstStore =
4035 DAG.getTruncStore(Chain: Op.getOperand(i: 0), dl, Val: ArgGPR, Ptr: Op.getOperand(i: 1),
4036 PtrInfo: MachinePointerInfo(SV), SVT: MVT::i8);
4037 uint64_t nextOffset = FPROffset;
4038 SDValue nextPtr = DAG.getNode(Opcode: ISD::ADD, DL: dl, VT: PtrVT, N1: Op.getOperand(i: 1),
4039 N2: ConstFPROffset);
4040
4041 // Store second byte : number of float regs
4042 SDValue secondStore =
4043 DAG.getTruncStore(Chain: firstStore, dl, Val: ArgFPR, Ptr: nextPtr,
4044 PtrInfo: MachinePointerInfo(SV, nextOffset), SVT: MVT::i8);
4045 nextOffset += StackOffset;
4046 nextPtr = DAG.getNode(Opcode: ISD::ADD, DL: dl, VT: PtrVT, N1: nextPtr, N2: ConstStackOffset);
4047
4048 // Store second word : arguments given on stack
4049 SDValue thirdStore = DAG.getStore(Chain: secondStore, dl, Val: StackOffsetFI, Ptr: nextPtr,
4050 PtrInfo: MachinePointerInfo(SV, nextOffset));
4051 nextOffset += FrameOffset;
4052 nextPtr = DAG.getNode(Opcode: ISD::ADD, DL: dl, VT: PtrVT, N1: nextPtr, N2: ConstFrameOffset);
4053
4054 // Store third word : arguments given in registers
4055 return DAG.getStore(Chain: thirdStore, dl, Val: FR, Ptr: nextPtr,
4056 PtrInfo: MachinePointerInfo(SV, nextOffset));
4057}
4058
4059/// FPR - The set of FP registers that should be allocated for arguments
4060/// on Darwin and AIX.
4061static const MCPhysReg FPR[] = {PPC::F1, PPC::F2, PPC::F3, PPC::F4, PPC::F5,
4062 PPC::F6, PPC::F7, PPC::F8, PPC::F9, PPC::F10,
4063 PPC::F11, PPC::F12, PPC::F13};
4064
4065/// CalculateStackSlotSize - Calculates the size reserved for this argument on
4066/// the stack.
4067static unsigned CalculateStackSlotSize(EVT ArgVT, ISD::ArgFlagsTy Flags,
4068 unsigned PtrByteSize) {
4069 unsigned ArgSize = ArgVT.getStoreSize();
4070 if (Flags.isByVal())
4071 ArgSize = Flags.getByValSize();
4072
4073 // Round up to multiples of the pointer size, except for array members,
4074 // which are always packed.
4075 if (!Flags.isInConsecutiveRegs())
4076 ArgSize = ((ArgSize + PtrByteSize - 1)/PtrByteSize) * PtrByteSize;
4077
4078 return ArgSize;
4079}
4080
4081/// CalculateStackSlotAlignment - Calculates the alignment of this argument
4082/// on the stack.
4083static Align CalculateStackSlotAlignment(EVT ArgVT, EVT OrigVT,
4084 ISD::ArgFlagsTy Flags,
4085 unsigned PtrByteSize) {
4086 Align Alignment(PtrByteSize);
4087
4088 // Altivec parameters are padded to a 16 byte boundary.
4089 if (ArgVT == MVT::v4f32 || ArgVT == MVT::v4i32 ||
4090 ArgVT == MVT::v8i16 || ArgVT == MVT::v16i8 ||
4091 ArgVT == MVT::v2f64 || ArgVT == MVT::v2i64 ||
4092 ArgVT == MVT::v1i128 || ArgVT == MVT::f128)
4093 Alignment = Align(16);
4094
4095 // ByVal parameters are aligned as requested.
4096 if (Flags.isByVal()) {
4097 auto BVAlign = Flags.getNonZeroByValAlign();
4098 if (BVAlign > PtrByteSize) {
4099 if (BVAlign.value() % PtrByteSize != 0)
4100 llvm_unreachable(
4101 "ByVal alignment is not a multiple of the pointer size");
4102
4103 Alignment = BVAlign;
4104 }
4105 }
4106
4107 // Array members are always packed to their original alignment.
4108 if (Flags.isInConsecutiveRegs()) {
4109 // If the array member was split into multiple registers, the first
4110 // needs to be aligned to the size of the full type. (Except for
4111 // ppcf128, which is only aligned as its f64 components.)
4112 if (Flags.isSplit() && OrigVT != MVT::ppcf128)
4113 Alignment = Align(OrigVT.getStoreSize());
4114 else
4115 Alignment = Align(ArgVT.getStoreSize());
4116 }
4117
4118 return Alignment;
4119}
4120
4121/// CalculateStackSlotUsed - Return whether this argument will use its
4122/// stack slot (instead of being passed in registers). ArgOffset,
4123/// AvailableFPRs, and AvailableVRs must hold the current argument
4124/// position, and will be updated to account for this argument.
4125static bool CalculateStackSlotUsed(EVT ArgVT, EVT OrigVT, ISD::ArgFlagsTy Flags,
4126 unsigned PtrByteSize, unsigned LinkageSize,
4127 unsigned ParamAreaSize, unsigned &ArgOffset,
4128 unsigned &AvailableFPRs,
4129 unsigned &AvailableVRs) {
4130 bool UseMemory = false;
4131
4132 // Respect alignment of argument on the stack.
4133 Align Alignment =
4134 CalculateStackSlotAlignment(ArgVT, OrigVT, Flags, PtrByteSize);
4135 ArgOffset = alignTo(Size: ArgOffset, A: Alignment);
4136 // If there's no space left in the argument save area, we must
4137 // use memory (this check also catches zero-sized arguments).
4138 if (ArgOffset >= LinkageSize + ParamAreaSize)
4139 UseMemory = true;
4140
4141 // Allocate argument on the stack.
4142 ArgOffset += CalculateStackSlotSize(ArgVT, Flags, PtrByteSize);
4143 if (Flags.isInConsecutiveRegsLast())
4144 ArgOffset = ((ArgOffset + PtrByteSize - 1)/PtrByteSize) * PtrByteSize;
4145 // If we overran the argument save area, we must use memory
4146 // (this check catches arguments passed partially in memory)
4147 if (ArgOffset > LinkageSize + ParamAreaSize)
4148 UseMemory = true;
4149
4150 // However, if the argument is actually passed in an FPR or a VR,
4151 // we don't use memory after all.
4152 if (!Flags.isByVal()) {
4153 if (ArgVT == MVT::f32 || ArgVT == MVT::f64)
4154 if (AvailableFPRs > 0) {
4155 --AvailableFPRs;
4156 return false;
4157 }
4158 if (ArgVT == MVT::v4f32 || ArgVT == MVT::v4i32 ||
4159 ArgVT == MVT::v8i16 || ArgVT == MVT::v16i8 ||
4160 ArgVT == MVT::v2f64 || ArgVT == MVT::v2i64 ||
4161 ArgVT == MVT::v1i128 || ArgVT == MVT::f128)
4162 if (AvailableVRs > 0) {
4163 --AvailableVRs;
4164 return false;
4165 }
4166 }
4167
4168 return UseMemory;
4169}
4170
4171/// EnsureStackAlignment - Round stack frame size up from NumBytes to
4172/// ensure minimum alignment required for target.
4173static unsigned EnsureStackAlignment(const PPCFrameLowering *Lowering,
4174 unsigned NumBytes) {
4175 return alignTo(Size: NumBytes, A: Lowering->getStackAlign());
4176}
4177
4178SDValue PPCTargetLowering::LowerFormalArguments(
4179 SDValue Chain, CallingConv::ID CallConv, bool isVarArg,
4180 const SmallVectorImpl<ISD::InputArg> &Ins, const SDLoc &dl,
4181 SelectionDAG &DAG, SmallVectorImpl<SDValue> &InVals) const {
4182 if (Subtarget.isAIXABI())
4183 return LowerFormalArguments_AIX(Chain, CallConv, isVarArg, Ins, dl, DAG,
4184 InVals);
4185 if (Subtarget.is64BitELFABI())
4186 return LowerFormalArguments_64SVR4(Chain, CallConv, isVarArg, Ins, dl, DAG,
4187 InVals);
4188 assert(Subtarget.is32BitELFABI());
4189 return LowerFormalArguments_32SVR4(Chain, CallConv, isVarArg, Ins, dl, DAG,
4190 InVals);
4191}
4192
4193SDValue PPCTargetLowering::LowerFormalArguments_32SVR4(
4194 SDValue Chain, CallingConv::ID CallConv, bool isVarArg,
4195 const SmallVectorImpl<ISD::InputArg> &Ins, const SDLoc &dl,
4196 SelectionDAG &DAG, SmallVectorImpl<SDValue> &InVals) const {
4197
4198 // 32-bit SVR4 ABI Stack Frame Layout:
4199 // +-----------------------------------+
4200 // +--> | Back chain |
4201 // | +-----------------------------------+
4202 // | | Floating-point register save area |
4203 // | +-----------------------------------+
4204 // | | General register save area |
4205 // | +-----------------------------------+
4206 // | | CR save word |
4207 // | +-----------------------------------+
4208 // | | VRSAVE save word |
4209 // | +-----------------------------------+
4210 // | | Alignment padding |
4211 // | +-----------------------------------+
4212 // | | Vector register save area |
4213 // | +-----------------------------------+
4214 // | | Local variable space |
4215 // | +-----------------------------------+
4216 // | | Parameter list area |
4217 // | +-----------------------------------+
4218 // | | LR save word |
4219 // | +-----------------------------------+
4220 // SP--> +--- | Back chain |
4221 // +-----------------------------------+
4222 //
4223 // Specifications:
4224 // System V Application Binary Interface PowerPC Processor Supplement
4225 // AltiVec Technology Programming Interface Manual
4226
4227 MachineFunction &MF = DAG.getMachineFunction();
4228 MachineFrameInfo &MFI = MF.getFrameInfo();
4229 PPCFunctionInfo *FuncInfo = MF.getInfo<PPCFunctionInfo>();
4230
4231 EVT PtrVT = getPointerTy(DL: MF.getDataLayout());
4232 // Potential tail calls could cause overwriting of argument stack slots.
4233 bool isImmutable = !(getTargetMachine().Options.GuaranteedTailCallOpt &&
4234 (CallConv == CallingConv::Fast));
4235 const Align PtrAlign(4);
4236
4237 // Assign locations to all of the incoming arguments.
4238 SmallVector<CCValAssign, 16> ArgLocs;
4239 CCState CCInfo(CallConv, isVarArg, DAG.getMachineFunction(), ArgLocs,
4240 *DAG.getContext());
4241
4242 // Reserve space for the linkage area on the stack.
4243 unsigned LinkageSize = Subtarget.getFrameLowering()->getLinkageSize();
4244 CCInfo.AllocateStack(Size: LinkageSize, Alignment: PtrAlign);
4245 CCInfo.AnalyzeFormalArguments(Ins, Fn: CC_PPC32_SVR4);
4246
4247 for (unsigned i = 0, e = ArgLocs.size(); i != e; ++i) {
4248 CCValAssign &VA = ArgLocs[i];
4249
4250 // Arguments stored in registers.
4251 if (VA.isRegLoc()) {
4252 const TargetRegisterClass *RC;
4253 EVT ValVT = VA.getValVT();
4254
4255 switch (ValVT.getSimpleVT().SimpleTy) {
4256 default:
4257 llvm_unreachable("ValVT not supported by formal arguments Lowering");
4258 case MVT::i1:
4259 case MVT::i32:
4260 RC = &PPC::GPRCRegClass;
4261 break;
4262 case MVT::f32:
4263 if (Subtarget.hasP8Vector())
4264 RC = &PPC::VSSRCRegClass;
4265 else if (Subtarget.hasSPE())
4266 RC = &PPC::GPRCRegClass;
4267 else
4268 RC = &PPC::F4RCRegClass;
4269 break;
4270 case MVT::f64:
4271 if (Subtarget.hasVSX())
4272 RC = &PPC::VSFRCRegClass;
4273 else if (Subtarget.hasSPE())
4274 // SPE passes doubles in GPR pairs.
4275 RC = &PPC::GPRCRegClass;
4276 else
4277 RC = &PPC::F8RCRegClass;
4278 break;
4279 case MVT::v16i8:
4280 case MVT::v8i16:
4281 case MVT::v4i32:
4282 RC = &PPC::VRRCRegClass;
4283 break;
4284 case MVT::v4f32:
4285 RC = &PPC::VRRCRegClass;
4286 break;
4287 case MVT::v2f64:
4288 case MVT::v2i64:
4289 RC = &PPC::VRRCRegClass;
4290 break;
4291 }
4292
4293 SDValue ArgValue;
4294 // Transform the arguments stored in physical registers into
4295 // virtual ones.
4296 if (VA.getLocVT() == MVT::f64 && Subtarget.hasSPE()) {
4297 assert(i + 1 < e && "No second half of double precision argument");
4298 Register RegLo = MF.addLiveIn(PReg: VA.getLocReg(), RC);
4299 Register RegHi = MF.addLiveIn(PReg: ArgLocs[++i].getLocReg(), RC);
4300 SDValue ArgValueLo = DAG.getCopyFromReg(Chain, dl, Reg: RegLo, VT: MVT::i32);
4301 SDValue ArgValueHi = DAG.getCopyFromReg(Chain, dl, Reg: RegHi, VT: MVT::i32);
4302 if (!Subtarget.isLittleEndian())
4303 std::swap (a&: ArgValueLo, b&: ArgValueHi);
4304 ArgValue = DAG.getNode(Opcode: PPCISD::BUILD_SPE64, DL: dl, VT: MVT::f64, N1: ArgValueLo,
4305 N2: ArgValueHi);
4306 } else {
4307 Register Reg = MF.addLiveIn(PReg: VA.getLocReg(), RC);
4308 ArgValue = DAG.getCopyFromReg(Chain, dl, Reg,
4309 VT: ValVT == MVT::i1 ? MVT::i32 : ValVT);
4310 if (ValVT == MVT::i1)
4311 ArgValue = DAG.getNode(Opcode: ISD::TRUNCATE, DL: dl, VT: MVT::i1, Operand: ArgValue);
4312 }
4313
4314 InVals.push_back(Elt: ArgValue);
4315 } else {
4316 // Argument stored in memory.
4317 assert(VA.isMemLoc());
4318
4319 // Get the extended size of the argument type in stack
4320 unsigned ArgSize = VA.getLocVT().getStoreSize();
4321 // Get the actual size of the argument type
4322 unsigned ObjSize = VA.getValVT().getStoreSize();
4323 unsigned ArgOffset = VA.getLocMemOffset();
4324 // Stack objects in PPC32 are right justified.
4325 ArgOffset += ArgSize - ObjSize;
4326 int FI = MFI.CreateFixedObject(Size: ArgSize, SPOffset: ArgOffset, IsImmutable: isImmutable);
4327
4328 // Create load nodes to retrieve arguments from the stack.
4329 SDValue FIN = DAG.getFrameIndex(FI, VT: PtrVT);
4330 InVals.push_back(
4331 Elt: DAG.getLoad(VT: VA.getValVT(), dl, Chain, Ptr: FIN, PtrInfo: MachinePointerInfo()));
4332 }
4333 }
4334
4335 // Assign locations to all of the incoming aggregate by value arguments.
4336 // Aggregates passed by value are stored in the local variable space of the
4337 // caller's stack frame, right above the parameter list area.
4338 SmallVector<CCValAssign, 16> ByValArgLocs;
4339 CCState CCByValInfo(CallConv, isVarArg, DAG.getMachineFunction(),
4340 ByValArgLocs, *DAG.getContext());
4341
4342 // Reserve stack space for the allocations in CCInfo.
4343 CCByValInfo.AllocateStack(Size: CCInfo.getStackSize(), Alignment: PtrAlign);
4344
4345 CCByValInfo.AnalyzeFormalArguments(Ins, Fn: CC_PPC32_SVR4_ByVal);
4346
4347 // Area that is at least reserved in the caller of this function.
4348 unsigned MinReservedArea = CCByValInfo.getStackSize();
4349 MinReservedArea = std::max(a: MinReservedArea, b: LinkageSize);
4350
4351 // Set the size that is at least reserved in caller of this function. Tail
4352 // call optimized function's reserved stack space needs to be aligned so that
4353 // taking the difference between two stack areas will result in an aligned
4354 // stack.
4355 MinReservedArea =
4356 EnsureStackAlignment(Lowering: Subtarget.getFrameLowering(), NumBytes: MinReservedArea);
4357 FuncInfo->setMinReservedArea(MinReservedArea);
4358
4359 SmallVector<SDValue, 8> MemOps;
4360
4361 // If the function takes variable number of arguments, make a frame index for
4362 // the start of the first vararg value... for expansion of llvm.va_start.
4363 if (isVarArg) {
4364 static const MCPhysReg GPArgRegs[] = {
4365 PPC::R3, PPC::R4, PPC::R5, PPC::R6,
4366 PPC::R7, PPC::R8, PPC::R9, PPC::R10,
4367 };
4368 const unsigned NumGPArgRegs = std::size(GPArgRegs);
4369
4370 static const MCPhysReg FPArgRegs[] = {
4371 PPC::F1, PPC::F2, PPC::F3, PPC::F4, PPC::F5, PPC::F6, PPC::F7,
4372 PPC::F8
4373 };
4374 unsigned NumFPArgRegs = std::size(FPArgRegs);
4375
4376 if (useSoftFloat() || hasSPE())
4377 NumFPArgRegs = 0;
4378
4379 FuncInfo->setVarArgsNumGPR(CCInfo.getFirstUnallocated(Regs: GPArgRegs));
4380 FuncInfo->setVarArgsNumFPR(CCInfo.getFirstUnallocated(Regs: FPArgRegs));
4381
4382 // Make room for NumGPArgRegs and NumFPArgRegs.
4383 int Depth = NumGPArgRegs * PtrVT.getSizeInBits()/8 +
4384 NumFPArgRegs * MVT(MVT::f64).getSizeInBits()/8;
4385
4386 FuncInfo->setVarArgsStackOffset(MFI.CreateFixedObject(
4387 Size: PtrVT.getSizeInBits() / 8, SPOffset: CCInfo.getStackSize(), IsImmutable: true));
4388
4389 FuncInfo->setVarArgsFrameIndex(
4390 MFI.CreateStackObject(Size: Depth, Alignment: Align(8), isSpillSlot: false));
4391 SDValue FIN = DAG.getFrameIndex(FI: FuncInfo->getVarArgsFrameIndex(), VT: PtrVT);
4392
4393 // The fixed integer arguments of a variadic function are stored to the
4394 // VarArgsFrameIndex on the stack so that they may be loaded by
4395 // dereferencing the result of va_next.
4396 for (MCPhysReg GPArgReg : GPArgRegs) {
4397 // Get an existing live-in vreg, or add a new one.
4398 Register VReg = MF.getRegInfo().getLiveInVirtReg(PReg: GPArgReg);
4399 if (!VReg)
4400 VReg = MF.addLiveIn(PReg: GPArgReg, RC: &PPC::GPRCRegClass);
4401
4402 SDValue Val = DAG.getCopyFromReg(Chain, dl, Reg: VReg, VT: PtrVT);
4403 SDValue Store =
4404 DAG.getStore(Chain: Val.getValue(R: 1), dl, Val, Ptr: FIN, PtrInfo: MachinePointerInfo());
4405 MemOps.push_back(Elt: Store);
4406 // Increment the address by four for the next argument to store
4407 SDValue PtrOff = DAG.getConstant(Val: PtrVT.getSizeInBits()/8, DL: dl, VT: PtrVT);
4408 FIN = DAG.getNode(Opcode: ISD::ADD, DL: dl, VT: PtrOff.getValueType(), N1: FIN, N2: PtrOff);
4409 }
4410
4411 // FIXME 32-bit SVR4: We only need to save FP argument registers if CR bit 6
4412 // is set.
4413 // The double arguments are stored to the VarArgsFrameIndex
4414 // on the stack.
4415 for (unsigned FPRIndex = 0; FPRIndex != NumFPArgRegs; ++FPRIndex) {
4416 // Get an existing live-in vreg, or add a new one.
4417 Register VReg = MF.getRegInfo().getLiveInVirtReg(PReg: FPArgRegs[FPRIndex]);
4418 if (!VReg)
4419 VReg = MF.addLiveIn(PReg: FPArgRegs[FPRIndex], RC: &PPC::F8RCRegClass);
4420
4421 SDValue Val = DAG.getCopyFromReg(Chain, dl, Reg: VReg, VT: MVT::f64);
4422 SDValue Store =
4423 DAG.getStore(Chain: Val.getValue(R: 1), dl, Val, Ptr: FIN, PtrInfo: MachinePointerInfo());
4424 MemOps.push_back(Elt: Store);
4425 // Increment the address by eight for the next argument to store
4426 SDValue PtrOff = DAG.getConstant(Val: MVT(MVT::f64).getSizeInBits()/8, DL: dl,
4427 VT: PtrVT);
4428 FIN = DAG.getNode(Opcode: ISD::ADD, DL: dl, VT: PtrOff.getValueType(), N1: FIN, N2: PtrOff);
4429 }
4430 }
4431
4432 if (!MemOps.empty())
4433 Chain = DAG.getNode(Opcode: ISD::TokenFactor, DL: dl, VT: MVT::Other, Ops: MemOps);
4434
4435 return Chain;
4436}
4437
4438// PPC64 passes i8, i16, and i32 values in i64 registers. Promote
4439// value to MVT::i64 and then truncate to the correct register size.
4440SDValue PPCTargetLowering::extendArgForPPC64(ISD::ArgFlagsTy Flags,
4441 EVT ObjectVT, SelectionDAG &DAG,
4442 SDValue ArgVal,
4443 const SDLoc &dl) const {
4444 if (Flags.isSExt())
4445 ArgVal = DAG.getNode(Opcode: ISD::AssertSext, DL: dl, VT: MVT::i64, N1: ArgVal,
4446 N2: DAG.getValueType(ObjectVT));
4447 else if (Flags.isZExt())
4448 ArgVal = DAG.getNode(Opcode: ISD::AssertZext, DL: dl, VT: MVT::i64, N1: ArgVal,
4449 N2: DAG.getValueType(ObjectVT));
4450
4451 return DAG.getNode(Opcode: ISD::TRUNCATE, DL: dl, VT: ObjectVT, Operand: ArgVal);
4452}
4453
4454SDValue PPCTargetLowering::LowerFormalArguments_64SVR4(
4455 SDValue Chain, CallingConv::ID CallConv, bool isVarArg,
4456 const SmallVectorImpl<ISD::InputArg> &Ins, const SDLoc &dl,
4457 SelectionDAG &DAG, SmallVectorImpl<SDValue> &InVals) const {
4458 // TODO: add description of PPC stack frame format, or at least some docs.
4459 //
4460 bool isELFv2ABI = Subtarget.isELFv2ABI();
4461 bool isLittleEndian = Subtarget.isLittleEndian();
4462 MachineFunction &MF = DAG.getMachineFunction();
4463 MachineFrameInfo &MFI = MF.getFrameInfo();
4464 PPCFunctionInfo *FuncInfo = MF.getInfo<PPCFunctionInfo>();
4465
4466 assert(!(CallConv == CallingConv::Fast && isVarArg) &&
4467 "fastcc not supported on varargs functions");
4468
4469 EVT PtrVT = getPointerTy(DL: MF.getDataLayout());
4470 // Potential tail calls could cause overwriting of argument stack slots.
4471 bool isImmutable = !(getTargetMachine().Options.GuaranteedTailCallOpt &&
4472 (CallConv == CallingConv::Fast));
4473 unsigned PtrByteSize = 8;
4474 unsigned LinkageSize = Subtarget.getFrameLowering()->getLinkageSize();
4475
4476 static const MCPhysReg GPR[] = {
4477 PPC::X3, PPC::X4, PPC::X5, PPC::X6,
4478 PPC::X7, PPC::X8, PPC::X9, PPC::X10,
4479 };
4480 static const MCPhysReg VR[] = {
4481 PPC::V2, PPC::V3, PPC::V4, PPC::V5, PPC::V6, PPC::V7, PPC::V8,
4482 PPC::V9, PPC::V10, PPC::V11, PPC::V12, PPC::V13
4483 };
4484
4485 const unsigned Num_GPR_Regs = std::size(GPR);
4486 const unsigned Num_FPR_Regs = useSoftFloat() ? 0 : 13;
4487 const unsigned Num_VR_Regs = std::size(VR);
4488
4489 // Do a first pass over the arguments to determine whether the ABI
4490 // guarantees that our caller has allocated the parameter save area
4491 // on its stack frame. In the ELFv1 ABI, this is always the case;
4492 // in the ELFv2 ABI, it is true if this is a vararg function or if
4493 // any parameter is located in a stack slot.
4494
4495 bool HasParameterArea = !isELFv2ABI || isVarArg;
4496 unsigned ParamAreaSize = Num_GPR_Regs * PtrByteSize;
4497 unsigned NumBytes = LinkageSize;
4498 unsigned AvailableFPRs = Num_FPR_Regs;
4499 unsigned AvailableVRs = Num_VR_Regs;
4500 for (const ISD::InputArg &In : Ins) {
4501 if (In.Flags.isNest())
4502 continue;
4503
4504 if (CalculateStackSlotUsed(ArgVT: In.VT, OrigVT: In.ArgVT, Flags: In.Flags, PtrByteSize,
4505 LinkageSize, ParamAreaSize, ArgOffset&: NumBytes,
4506 AvailableFPRs, AvailableVRs))
4507 HasParameterArea = true;
4508 }
4509
4510 // Add DAG nodes to load the arguments or copy them out of registers. On
4511 // entry to a function on PPC, the arguments start after the linkage area,
4512 // although the first ones are often in registers.
4513
4514 unsigned ArgOffset = LinkageSize;
4515 unsigned GPR_idx = 0, FPR_idx = 0, VR_idx = 0;
4516 SmallVector<SDValue, 8> MemOps;
4517 Function::const_arg_iterator FuncArg = MF.getFunction().arg_begin();
4518 unsigned CurArgIdx = 0;
4519 for (unsigned ArgNo = 0, e = Ins.size(); ArgNo != e; ++ArgNo) {
4520 SDValue ArgVal;
4521 bool needsLoad = false;
4522 EVT ObjectVT = Ins[ArgNo].VT;
4523 EVT OrigVT = Ins[ArgNo].ArgVT;
4524 unsigned ObjSize = ObjectVT.getStoreSize();
4525 unsigned ArgSize = ObjSize;
4526 ISD::ArgFlagsTy Flags = Ins[ArgNo].Flags;
4527 if (Ins[ArgNo].isOrigArg()) {
4528 std::advance(i&: FuncArg, n: Ins[ArgNo].getOrigArgIndex() - CurArgIdx);
4529 CurArgIdx = Ins[ArgNo].getOrigArgIndex();
4530 }
4531 // We re-align the argument offset for each argument, except when using the
4532 // fast calling convention, when we need to make sure we do that only when
4533 // we'll actually use a stack slot.
4534 unsigned CurArgOffset;
4535 Align Alignment;
4536 auto ComputeArgOffset = [&]() {
4537 /* Respect alignment of argument on the stack. */
4538 Alignment =
4539 CalculateStackSlotAlignment(ArgVT: ObjectVT, OrigVT, Flags, PtrByteSize);
4540 ArgOffset = alignTo(Size: ArgOffset, A: Alignment);
4541 CurArgOffset = ArgOffset;
4542 };
4543
4544 if (CallConv != CallingConv::Fast) {
4545 ComputeArgOffset();
4546
4547 /* Compute GPR index associated with argument offset. */
4548 GPR_idx = (ArgOffset - LinkageSize) / PtrByteSize;
4549 GPR_idx = std::min(a: GPR_idx, b: Num_GPR_Regs);
4550 }
4551
4552 // FIXME the codegen can be much improved in some cases.
4553 // We do not have to keep everything in memory.
4554 if (Flags.isByVal()) {
4555 assert(Ins[ArgNo].isOrigArg() && "Byval arguments cannot be implicit");
4556
4557 if (CallConv == CallingConv::Fast)
4558 ComputeArgOffset();
4559
4560 // ObjSize is the true size, ArgSize rounded up to multiple of registers.
4561 ObjSize = Flags.getByValSize();
4562 ArgSize = ((ObjSize + PtrByteSize - 1)/PtrByteSize) * PtrByteSize;
4563 // Empty aggregate parameters do not take up registers. Examples:
4564 // struct { } a;
4565 // union { } b;
4566 // int c[0];
4567 // etc. However, we have to provide a place-holder in InVals, so
4568 // pretend we have an 8-byte item at the current address for that
4569 // purpose.
4570 if (!ObjSize) {
4571 int FI = MFI.CreateFixedObject(Size: PtrByteSize, SPOffset: ArgOffset, IsImmutable: true);
4572 SDValue FIN = DAG.getFrameIndex(FI, VT: PtrVT);
4573 InVals.push_back(Elt: FIN);
4574 continue;
4575 }
4576
4577 // Create a stack object covering all stack doublewords occupied
4578 // by the argument. If the argument is (fully or partially) on
4579 // the stack, or if the argument is fully in registers but the
4580 // caller has allocated the parameter save anyway, we can refer
4581 // directly to the caller's stack frame. Otherwise, create a
4582 // local copy in our own frame.
4583 int FI;
4584 if (HasParameterArea ||
4585 ArgSize + ArgOffset > LinkageSize + Num_GPR_Regs * PtrByteSize)
4586 FI = MFI.CreateFixedObject(Size: ArgSize, SPOffset: ArgOffset, IsImmutable: false, isAliased: true);
4587 else
4588 FI = MFI.CreateStackObject(Size: ArgSize, Alignment, isSpillSlot: false);
4589 SDValue FIN = DAG.getFrameIndex(FI, VT: PtrVT);
4590
4591 // Handle aggregates smaller than 8 bytes.
4592 if (ObjSize < PtrByteSize) {
4593 // The value of the object is its address, which differs from the
4594 // address of the enclosing doubleword on big-endian systems.
4595 SDValue Arg = FIN;
4596 if (!isLittleEndian) {
4597 SDValue ArgOff = DAG.getConstant(Val: PtrByteSize - ObjSize, DL: dl, VT: PtrVT);
4598 Arg = DAG.getNode(Opcode: ISD::ADD, DL: dl, VT: ArgOff.getValueType(), N1: Arg, N2: ArgOff);
4599 }
4600 InVals.push_back(Elt: Arg);
4601
4602 if (GPR_idx != Num_GPR_Regs) {
4603 Register VReg = MF.addLiveIn(PReg: GPR[GPR_idx++], RC: &PPC::G8RCRegClass);
4604 FuncInfo->addLiveInAttr(VReg, Flags);
4605 SDValue Val = DAG.getCopyFromReg(Chain, dl, Reg: VReg, VT: PtrVT);
4606 EVT ObjType = EVT::getIntegerVT(Context&: *DAG.getContext(), BitWidth: ObjSize * 8);
4607 SDValue Store =
4608 DAG.getTruncStore(Chain: Val.getValue(R: 1), dl, Val, Ptr: Arg,
4609 PtrInfo: MachinePointerInfo(&*FuncArg), SVT: ObjType);
4610 MemOps.push_back(Elt: Store);
4611 }
4612 // Whether we copied from a register or not, advance the offset
4613 // into the parameter save area by a full doubleword.
4614 ArgOffset += PtrByteSize;
4615 continue;
4616 }
4617
4618 // The value of the object is its address, which is the address of
4619 // its first stack doubleword.
4620 InVals.push_back(Elt: FIN);
4621
4622 // Store whatever pieces of the object are in registers to memory.
4623 for (unsigned j = 0; j < ArgSize; j += PtrByteSize) {
4624 if (GPR_idx == Num_GPR_Regs)
4625 break;
4626
4627 Register VReg = MF.addLiveIn(PReg: GPR[GPR_idx], RC: &PPC::G8RCRegClass);
4628 FuncInfo->addLiveInAttr(VReg, Flags);
4629 SDValue Val = DAG.getCopyFromReg(Chain, dl, Reg: VReg, VT: PtrVT);
4630 SDValue Addr = FIN;
4631 if (j) {
4632 SDValue Off = DAG.getConstant(Val: j, DL: dl, VT: PtrVT);
4633 Addr = DAG.getNode(Opcode: ISD::ADD, DL: dl, VT: Off.getValueType(), N1: Addr, N2: Off);
4634 }
4635 unsigned StoreSizeInBits = std::min(a: PtrByteSize, b: (ObjSize - j)) * 8;
4636 EVT ObjType = EVT::getIntegerVT(Context&: *DAG.getContext(), BitWidth: StoreSizeInBits);
4637 SDValue Store =
4638 DAG.getTruncStore(Chain: Val.getValue(R: 1), dl, Val, Ptr: Addr,
4639 PtrInfo: MachinePointerInfo(&*FuncArg, j), SVT: ObjType);
4640 MemOps.push_back(Elt: Store);
4641 ++GPR_idx;
4642 }
4643 ArgOffset += ArgSize;
4644 continue;
4645 }
4646
4647 switch (ObjectVT.getSimpleVT().SimpleTy) {
4648 default: llvm_unreachable("Unhandled argument type!");
4649 case MVT::i1:
4650 case MVT::i32:
4651 case MVT::i64:
4652 if (Flags.isNest()) {
4653 // The 'nest' parameter, if any, is passed in R11.
4654 Register VReg = MF.addLiveIn(PReg: PPC::X11, RC: &PPC::G8RCRegClass);
4655 ArgVal = DAG.getCopyFromReg(Chain, dl, Reg: VReg, VT: MVT::i64);
4656
4657 if (ObjectVT == MVT::i32 || ObjectVT == MVT::i1)
4658 ArgVal = extendArgForPPC64(Flags, ObjectVT, DAG, ArgVal, dl);
4659
4660 break;
4661 }
4662
4663 // These can be scalar arguments or elements of an integer array type
4664 // passed directly. Clang may use those instead of "byval" aggregate
4665 // types to avoid forcing arguments to memory unnecessarily.
4666 if (GPR_idx != Num_GPR_Regs) {
4667 Register VReg = MF.addLiveIn(PReg: GPR[GPR_idx++], RC: &PPC::G8RCRegClass);
4668 FuncInfo->addLiveInAttr(VReg, Flags);
4669 ArgVal = DAG.getCopyFromReg(Chain, dl, Reg: VReg, VT: MVT::i64);
4670
4671 if (ObjectVT == MVT::i32 || ObjectVT == MVT::i1)
4672 // PPC64 passes i8, i16, and i32 values in i64 registers. Promote
4673 // value to MVT::i64 and then truncate to the correct register size.
4674 ArgVal = extendArgForPPC64(Flags, ObjectVT, DAG, ArgVal, dl);
4675 } else {
4676 if (CallConv == CallingConv::Fast)
4677 ComputeArgOffset();
4678
4679 needsLoad = true;
4680 ArgSize = PtrByteSize;
4681 }
4682 if (CallConv != CallingConv::Fast || needsLoad)
4683 ArgOffset += 8;
4684 break;
4685
4686 case MVT::f32:
4687 case MVT::f64:
4688 // These can be scalar arguments or elements of a float array type
4689 // passed directly. The latter are used to implement ELFv2 homogenous
4690 // float aggregates.
4691 if (FPR_idx != Num_FPR_Regs) {
4692 unsigned VReg;
4693
4694 if (ObjectVT == MVT::f32)
4695 VReg = MF.addLiveIn(PReg: FPR[FPR_idx],
4696 RC: Subtarget.hasP8Vector()
4697 ? &PPC::VSSRCRegClass
4698 : &PPC::F4RCRegClass);
4699 else
4700 VReg = MF.addLiveIn(PReg: FPR[FPR_idx], RC: Subtarget.hasVSX()
4701 ? &PPC::VSFRCRegClass
4702 : &PPC::F8RCRegClass);
4703
4704 ArgVal = DAG.getCopyFromReg(Chain, dl, Reg: VReg, VT: ObjectVT);
4705 ++FPR_idx;
4706 } else if (GPR_idx != Num_GPR_Regs && CallConv != CallingConv::Fast) {
4707 // FIXME: We may want to re-enable this for CallingConv::Fast on the P8
4708 // once we support fp <-> gpr moves.
4709
4710 // This can only ever happen in the presence of f32 array types,
4711 // since otherwise we never run out of FPRs before running out
4712 // of GPRs.
4713 Register VReg = MF.addLiveIn(PReg: GPR[GPR_idx++], RC: &PPC::G8RCRegClass);
4714 FuncInfo->addLiveInAttr(VReg, Flags);
4715 ArgVal = DAG.getCopyFromReg(Chain, dl, Reg: VReg, VT: MVT::i64);
4716
4717 if (ObjectVT == MVT::f32) {
4718 if ((ArgOffset % PtrByteSize) == (isLittleEndian ? 4 : 0))
4719 ArgVal = DAG.getNode(Opcode: ISD::SRL, DL: dl, VT: MVT::i64, N1: ArgVal,
4720 N2: DAG.getConstant(Val: 32, DL: dl, VT: MVT::i32));
4721 ArgVal = DAG.getNode(Opcode: ISD::TRUNCATE, DL: dl, VT: MVT::i32, Operand: ArgVal);
4722 }
4723
4724 ArgVal = DAG.getNode(Opcode: ISD::BITCAST, DL: dl, VT: ObjectVT, Operand: ArgVal);
4725 } else {
4726 if (CallConv == CallingConv::Fast)
4727 ComputeArgOffset();
4728
4729 needsLoad = true;
4730 }
4731
4732 // When passing an array of floats, the array occupies consecutive
4733 // space in the argument area; only round up to the next doubleword
4734 // at the end of the array. Otherwise, each float takes 8 bytes.
4735 if (CallConv != CallingConv::Fast || needsLoad) {
4736 ArgSize = Flags.isInConsecutiveRegs() ? ObjSize : PtrByteSize;
4737 ArgOffset += ArgSize;
4738 if (Flags.isInConsecutiveRegsLast())
4739 ArgOffset = ((ArgOffset + PtrByteSize - 1)/PtrByteSize) * PtrByteSize;
4740 }
4741 break;
4742 case MVT::v4f32:
4743 case MVT::v4i32:
4744 case MVT::v8i16:
4745 case MVT::v16i8:
4746 case MVT::v2f64:
4747 case MVT::v2i64:
4748 case MVT::v1i128:
4749 case MVT::f128:
4750 // These can be scalar arguments or elements of a vector array type
4751 // passed directly. The latter are used to implement ELFv2 homogenous
4752 // vector aggregates.
4753 if (VR_idx != Num_VR_Regs) {
4754 Register VReg = MF.addLiveIn(PReg: VR[VR_idx], RC: &PPC::VRRCRegClass);
4755 ArgVal = DAG.getCopyFromReg(Chain, dl, Reg: VReg, VT: ObjectVT);
4756 ++VR_idx;
4757 } else {
4758 if (CallConv == CallingConv::Fast)
4759 ComputeArgOffset();
4760 needsLoad = true;
4761 }
4762 if (CallConv != CallingConv::Fast || needsLoad)
4763 ArgOffset += 16;
4764 break;
4765 }
4766
4767 // We need to load the argument to a virtual register if we determined
4768 // above that we ran out of physical registers of the appropriate type.
4769 if (needsLoad) {
4770 if (ObjSize < ArgSize && !isLittleEndian)
4771 CurArgOffset += ArgSize - ObjSize;
4772 int FI = MFI.CreateFixedObject(Size: ObjSize, SPOffset: CurArgOffset, IsImmutable: isImmutable);
4773 SDValue FIN = DAG.getFrameIndex(FI, VT: PtrVT);
4774 ArgVal = DAG.getLoad(VT: ObjectVT, dl, Chain, Ptr: FIN, PtrInfo: MachinePointerInfo());
4775 }
4776
4777 InVals.push_back(Elt: ArgVal);
4778 }
4779
4780 // Area that is at least reserved in the caller of this function.
4781 unsigned MinReservedArea;
4782 if (HasParameterArea)
4783 MinReservedArea = std::max(a: ArgOffset, b: LinkageSize + 8 * PtrByteSize);
4784 else
4785 MinReservedArea = LinkageSize;
4786
4787 // Set the size that is at least reserved in caller of this function. Tail
4788 // call optimized functions' reserved stack space needs to be aligned so that
4789 // taking the difference between two stack areas will result in an aligned
4790 // stack.
4791 MinReservedArea =
4792 EnsureStackAlignment(Lowering: Subtarget.getFrameLowering(), NumBytes: MinReservedArea);
4793 FuncInfo->setMinReservedArea(MinReservedArea);
4794
4795 // If the function takes variable number of arguments, make a frame index for
4796 // the start of the first vararg value... for expansion of llvm.va_start.
4797 // On ELFv2ABI spec, it writes:
4798 // C programs that are intended to be *portable* across different compilers
4799 // and architectures must use the header file <stdarg.h> to deal with variable
4800 // argument lists.
4801 if (isVarArg && MFI.hasVAStart()) {
4802 int Depth = ArgOffset;
4803
4804 FuncInfo->setVarArgsFrameIndex(
4805 MFI.CreateFixedObject(Size: PtrByteSize, SPOffset: Depth, IsImmutable: true));
4806 SDValue FIN = DAG.getFrameIndex(FI: FuncInfo->getVarArgsFrameIndex(), VT: PtrVT);
4807
4808 // If this function is vararg, store any remaining integer argument regs
4809 // to their spots on the stack so that they may be loaded by dereferencing
4810 // the result of va_next.
4811 for (GPR_idx = (ArgOffset - LinkageSize) / PtrByteSize;
4812 GPR_idx < Num_GPR_Regs; ++GPR_idx) {
4813 Register VReg = MF.addLiveIn(PReg: GPR[GPR_idx], RC: &PPC::G8RCRegClass);
4814 SDValue Val = DAG.getCopyFromReg(Chain, dl, Reg: VReg, VT: PtrVT);
4815 SDValue Store =
4816 DAG.getStore(Chain: Val.getValue(R: 1), dl, Val, Ptr: FIN, PtrInfo: MachinePointerInfo());
4817 MemOps.push_back(Elt: Store);
4818 // Increment the address by four for the next argument to store
4819 SDValue PtrOff = DAG.getConstant(Val: PtrByteSize, DL: dl, VT: PtrVT);
4820 FIN = DAG.getNode(Opcode: ISD::ADD, DL: dl, VT: PtrOff.getValueType(), N1: FIN, N2: PtrOff);
4821 }
4822 }
4823
4824 if (!MemOps.empty())
4825 Chain = DAG.getNode(Opcode: ISD::TokenFactor, DL: dl, VT: MVT::Other, Ops: MemOps);
4826
4827 return Chain;
4828}
4829
4830/// CalculateTailCallSPDiff - Get the amount the stack pointer has to be
4831/// adjusted to accommodate the arguments for the tailcall.
4832static int CalculateTailCallSPDiff(SelectionDAG& DAG, bool isTailCall,
4833 unsigned ParamSize) {
4834
4835 if (!isTailCall) return 0;
4836
4837 PPCFunctionInfo *FI = DAG.getMachineFunction().getInfo<PPCFunctionInfo>();
4838 unsigned CallerMinReservedArea = FI->getMinReservedArea();
4839 int SPDiff = (int)CallerMinReservedArea - (int)ParamSize;
4840 // Remember only if the new adjustment is bigger.
4841 if (SPDiff < FI->getTailCallSPDelta())
4842 FI->setTailCallSPDelta(SPDiff);
4843
4844 return SPDiff;
4845}
4846
4847static bool isFunctionGlobalAddress(const GlobalValue *CalleeGV);
4848
4849static bool callsShareTOCBase(const Function *Caller,
4850 const GlobalValue *CalleeGV,
4851 const TargetMachine &TM) {
4852 // It does not make sense to call callsShareTOCBase() with a caller that
4853 // is PC Relative since PC Relative callers do not have a TOC.
4854#ifndef NDEBUG
4855 const PPCSubtarget *STICaller = &TM.getSubtarget<PPCSubtarget>(*Caller);
4856 assert(!STICaller->isUsingPCRelativeCalls() &&
4857 "PC Relative callers do not have a TOC and cannot share a TOC Base");
4858#endif
4859
4860 // Callee is either a GlobalAddress or an ExternalSymbol. ExternalSymbols
4861 // don't have enough information to determine if the caller and callee share
4862 // the same TOC base, so we have to pessimistically assume they don't for
4863 // correctness.
4864 if (!CalleeGV)
4865 return false;
4866
4867 // If the callee is preemptable, then the static linker will use a plt-stub
4868 // which saves the toc to the stack, and needs a nop after the call
4869 // instruction to convert to a toc-restore.
4870 if (!TM.shouldAssumeDSOLocal(GV: CalleeGV))
4871 return false;
4872
4873 // Functions with PC Relative enabled may clobber the TOC in the same DSO.
4874 // We may need a TOC restore in the situation where the caller requires a
4875 // valid TOC but the callee is PC Relative and does not.
4876 const Function *F = dyn_cast<Function>(Val: CalleeGV);
4877 const GlobalAlias *Alias = dyn_cast<GlobalAlias>(Val: CalleeGV);
4878
4879 // If we have an Alias we can try to get the function from there.
4880 if (Alias) {
4881 const GlobalObject *GlobalObj = Alias->getAliaseeObject();
4882 F = dyn_cast<Function>(Val: GlobalObj);
4883 }
4884
4885 // If we still have no valid function pointer we do not have enough
4886 // information to determine if the callee uses PC Relative calls so we must
4887 // assume that it does.
4888 if (!F)
4889 return false;
4890
4891 // If the callee uses PC Relative we cannot guarantee that the callee won't
4892 // clobber the TOC of the caller and so we must assume that the two
4893 // functions do not share a TOC base.
4894 const PPCSubtarget *STICallee = &TM.getSubtarget<PPCSubtarget>(F: *F);
4895 if (STICallee->isUsingPCRelativeCalls())
4896 return false;
4897
4898 // If the GV is not a strong definition then we need to assume it can be
4899 // replaced by another function at link time. The function that replaces
4900 // it may not share the same TOC as the caller since the callee may be
4901 // replaced by a PC Relative version of the same function.
4902 if (!CalleeGV->isStrongDefinitionForLinker())
4903 return false;
4904
4905 // The medium and large code models are expected to provide a sufficiently
4906 // large TOC to provide all data addressing needs of a module with a
4907 // single TOC.
4908 if (CodeModel::Medium == TM.getCodeModel() ||
4909 CodeModel::Large == TM.getCodeModel())
4910 return true;
4911
4912 // Any explicitly-specified sections and section prefixes must also match.
4913 // Also, if we're using -ffunction-sections, then each function is always in
4914 // a different section (the same is true for COMDAT functions).
4915 if (TM.getFunctionSections() || CalleeGV->hasComdat() ||
4916 Caller->hasComdat() || CalleeGV->getSection() != Caller->getSection())
4917 return false;
4918 if (const auto *F = dyn_cast<Function>(Val: CalleeGV)) {
4919 if (F->getSectionPrefix() != Caller->getSectionPrefix())
4920 return false;
4921 }
4922
4923 return true;
4924}
4925
4926static bool
4927needStackSlotPassParameters(const PPCSubtarget &Subtarget,
4928 const SmallVectorImpl<ISD::OutputArg> &Outs) {
4929 assert(Subtarget.is64BitELFABI());
4930
4931 const unsigned PtrByteSize = 8;
4932 const unsigned LinkageSize = Subtarget.getFrameLowering()->getLinkageSize();
4933
4934 static const MCPhysReg GPR[] = {
4935 PPC::X3, PPC::X4, PPC::X5, PPC::X6,
4936 PPC::X7, PPC::X8, PPC::X9, PPC::X10,
4937 };
4938 static const MCPhysReg VR[] = {
4939 PPC::V2, PPC::V3, PPC::V4, PPC::V5, PPC::V6, PPC::V7, PPC::V8,
4940 PPC::V9, PPC::V10, PPC::V11, PPC::V12, PPC::V13
4941 };
4942
4943 const unsigned NumGPRs = std::size(GPR);
4944 const unsigned NumFPRs = 13;
4945 const unsigned NumVRs = std::size(VR);
4946 const unsigned ParamAreaSize = NumGPRs * PtrByteSize;
4947
4948 unsigned NumBytes = LinkageSize;
4949 unsigned AvailableFPRs = NumFPRs;
4950 unsigned AvailableVRs = NumVRs;
4951
4952 for (const ISD::OutputArg& Param : Outs) {
4953 if (Param.Flags.isNest()) continue;
4954
4955 if (CalculateStackSlotUsed(ArgVT: Param.VT, OrigVT: Param.ArgVT, Flags: Param.Flags, PtrByteSize,
4956 LinkageSize, ParamAreaSize, ArgOffset&: NumBytes,
4957 AvailableFPRs, AvailableVRs))
4958 return true;
4959 }
4960 return false;
4961}
4962
4963static bool hasSameArgumentList(const Function *CallerFn, const CallBase &CB) {
4964 if (CB.arg_size() != CallerFn->arg_size())
4965 return false;
4966
4967 auto CalleeArgIter = CB.arg_begin();
4968 auto CalleeArgEnd = CB.arg_end();
4969 Function::const_arg_iterator CallerArgIter = CallerFn->arg_begin();
4970
4971 for (; CalleeArgIter != CalleeArgEnd; ++CalleeArgIter, ++CallerArgIter) {
4972 const Value* CalleeArg = *CalleeArgIter;
4973 const Value* CallerArg = &(*CallerArgIter);
4974 if (CalleeArg == CallerArg)
4975 continue;
4976
4977 // e.g. @caller([4 x i64] %a, [4 x i64] %b) {
4978 // tail call @callee([4 x i64] undef, [4 x i64] %b)
4979 // }
4980 // 1st argument of callee is undef and has the same type as caller.
4981 if (CalleeArg->getType() == CallerArg->getType() &&
4982 isa<UndefValue>(Val: CalleeArg))
4983 continue;
4984
4985 return false;
4986 }
4987
4988 return true;
4989}
4990
4991// Returns true if TCO is possible between the callers and callees
4992// calling conventions.
4993static bool
4994areCallingConvEligibleForTCO_64SVR4(CallingConv::ID CallerCC,
4995 CallingConv::ID CalleeCC) {
4996 // Tail calls are possible with fastcc and ccc.
4997 auto isTailCallableCC = [] (CallingConv::ID CC){
4998 return CC == CallingConv::C || CC == CallingConv::Fast;
4999 };
5000 if (!isTailCallableCC(CallerCC) || !isTailCallableCC(CalleeCC))
5001 return false;
5002
5003 // We can safely tail call both fastcc and ccc callees from a c calling
5004 // convention caller. If the caller is fastcc, we may have less stack space
5005 // than a non-fastcc caller with the same signature so disable tail-calls in
5006 // that case.
5007 return CallerCC == CallingConv::C || CallerCC == CalleeCC;
5008}
5009
5010bool PPCTargetLowering::IsEligibleForTailCallOptimization_64SVR4(
5011 const GlobalValue *CalleeGV, CallingConv::ID CalleeCC,
5012 CallingConv::ID CallerCC, const CallBase *CB, bool isVarArg,
5013 const SmallVectorImpl<ISD::OutputArg> &Outs,
5014 const SmallVectorImpl<ISD::InputArg> &Ins, const Function *CallerFunc,
5015 bool isCalleeExternalSymbol) const {
5016 bool TailCallOpt = getTargetMachine().Options.GuaranteedTailCallOpt;
5017
5018 if (DisableSCO && !TailCallOpt) return false;
5019
5020 // Variadic argument functions are not supported.
5021 if (isVarArg) return false;
5022
5023 // Check that the calling conventions are compatible for tco.
5024 if (!areCallingConvEligibleForTCO_64SVR4(CallerCC, CalleeCC))
5025 return false;
5026
5027 // Caller contains any byval parameter is not supported.
5028 if (any_of(Range: Ins, P: [](const ISD::InputArg &IA) { return IA.Flags.isByVal(); }))
5029 return false;
5030
5031 // Callee contains any byval parameter is not supported, too.
5032 // Note: This is a quick work around, because in some cases, e.g.
5033 // caller's stack size > callee's stack size, we are still able to apply
5034 // sibling call optimization. For example, gcc is able to do SCO for caller1
5035 // in the following example, but not for caller2.
5036 // struct test {
5037 // long int a;
5038 // char ary[56];
5039 // } gTest;
5040 // __attribute__((noinline)) int callee(struct test v, struct test *b) {
5041 // b->a = v.a;
5042 // return 0;
5043 // }
5044 // void caller1(struct test a, struct test c, struct test *b) {
5045 // callee(gTest, b); }
5046 // void caller2(struct test *b) { callee(gTest, b); }
5047 if (any_of(Range: Outs, P: [](const ISD::OutputArg& OA) { return OA.Flags.isByVal(); }))
5048 return false;
5049
5050 // If callee and caller use different calling conventions, we cannot pass
5051 // parameters on stack since offsets for the parameter area may be different.
5052 if (CallerCC != CalleeCC && needStackSlotPassParameters(Subtarget, Outs))
5053 return false;
5054
5055 // All variants of 64-bit ELF ABIs without PC-Relative addressing require that
5056 // the caller and callee share the same TOC for TCO/SCO. If the caller and
5057 // callee potentially have different TOC bases then we cannot tail call since
5058 // we need to restore the TOC pointer after the call.
5059 // ref: https://bugzilla.mozilla.org/show_bug.cgi?id=973977
5060 // We cannot guarantee this for indirect calls or calls to external functions.
5061 // When PC-Relative addressing is used, the concept of the TOC is no longer
5062 // applicable so this check is not required.
5063 // Check first for indirect calls.
5064 if (!Subtarget.isUsingPCRelativeCalls() &&
5065 !isFunctionGlobalAddress(CalleeGV) && !isCalleeExternalSymbol)
5066 return false;
5067
5068 // Check if we share the TOC base.
5069 if (!Subtarget.isUsingPCRelativeCalls() &&
5070 !callsShareTOCBase(Caller: CallerFunc, CalleeGV, TM: getTargetMachine()))
5071 return false;
5072
5073 // TCO allows altering callee ABI, so we don't have to check further.
5074 if (CalleeCC == CallingConv::Fast && TailCallOpt)
5075 return true;
5076
5077 if (DisableSCO) return false;
5078
5079 // If callee use the same argument list that caller is using, then we can
5080 // apply SCO on this case. If it is not, then we need to check if callee needs
5081 // stack for passing arguments.
5082 // PC Relative tail calls may not have a CallBase.
5083 // If there is no CallBase we cannot verify if we have the same argument
5084 // list so assume that we don't have the same argument list.
5085 if (CB && !hasSameArgumentList(CallerFn: CallerFunc, CB: *CB) &&
5086 needStackSlotPassParameters(Subtarget, Outs))
5087 return false;
5088 else if (!CB && needStackSlotPassParameters(Subtarget, Outs))
5089 return false;
5090
5091 return true;
5092}
5093
5094/// IsEligibleForTailCallOptimization - Check whether the call is eligible
5095/// for tail call optimization. Targets which want to do tail call
5096/// optimization should implement this function.
5097bool PPCTargetLowering::IsEligibleForTailCallOptimization(
5098 const GlobalValue *CalleeGV, CallingConv::ID CalleeCC,
5099 CallingConv::ID CallerCC, bool isVarArg,
5100 const SmallVectorImpl<ISD::InputArg> &Ins) const {
5101 if (!getTargetMachine().Options.GuaranteedTailCallOpt)
5102 return false;
5103
5104 // Variable argument functions are not supported.
5105 if (isVarArg)
5106 return false;
5107
5108 if (CalleeCC == CallingConv::Fast && CallerCC == CalleeCC) {
5109 // Functions containing by val parameters are not supported.
5110 if (any_of(Range: Ins, P: [](const ISD::InputArg &IA) { return IA.Flags.isByVal(); }))
5111 return false;
5112
5113 // Non-PIC/GOT tail calls are supported.
5114 if (getTargetMachine().getRelocationModel() != Reloc::PIC_)
5115 return true;
5116
5117 // At the moment we can only do local tail calls (in same module, hidden
5118 // or protected) if we are generating PIC.
5119 if (CalleeGV)
5120 return CalleeGV->hasHiddenVisibility() ||
5121 CalleeGV->hasProtectedVisibility();
5122 }
5123
5124 return false;
5125}
5126
5127/// isCallCompatibleAddress - Return the immediate to use if the specified
5128/// 32-bit value is representable in the immediate field of a BxA instruction.
5129static SDNode *isBLACompatibleAddress(SDValue Op, SelectionDAG &DAG) {
5130 ConstantSDNode *C = dyn_cast<ConstantSDNode>(Val&: Op);
5131 if (!C) return nullptr;
5132
5133 int Addr = C->getZExtValue();
5134 if ((Addr & 3) != 0 || // Low 2 bits are implicitly zero.
5135 SignExtend32<26>(X: Addr) != Addr)
5136 return nullptr; // Top 6 bits have to be sext of immediate.
5137
5138 return DAG
5139 .getSignedConstant(
5140 Val: (int)C->getZExtValue() >> 2, DL: SDLoc(Op),
5141 VT: DAG.getTargetLoweringInfo().getPointerTy(DL: DAG.getDataLayout()))
5142 .getNode();
5143}
5144
namespace {

/// Record of one tail call argument that is stored to a fixed stack object.
/// Filled in by CalculateTailCallArgDest and consumed by
/// StoreTailCallArgumentsToStackSlot.
struct TailCallArgumentInfo {
  // The value that gets stored.
  SDValue Arg;
  // Frame index node used as the address of the store.
  SDValue FrameIdxOp;
  // Frame index of the fixed stack object backing the store.
  int FrameIdx = 0;

  TailCallArgumentInfo() = default;
};

} // end anonymous namespace
5156
5157/// StoreTailCallArgumentsToStackSlot - Stores arguments to their stack slot.
5158static void StoreTailCallArgumentsToStackSlot(
5159 SelectionDAG &DAG, SDValue Chain,
5160 const SmallVectorImpl<TailCallArgumentInfo> &TailCallArgs,
5161 SmallVectorImpl<SDValue> &MemOpChains, const SDLoc &dl) {
5162 for (unsigned i = 0, e = TailCallArgs.size(); i != e; ++i) {
5163 SDValue Arg = TailCallArgs[i].Arg;
5164 SDValue FIN = TailCallArgs[i].FrameIdxOp;
5165 int FI = TailCallArgs[i].FrameIdx;
5166 // Store relative to framepointer.
5167 MemOpChains.push_back(Elt: DAG.getStore(
5168 Chain, dl, Val: Arg, Ptr: FIN,
5169 PtrInfo: MachinePointerInfo::getFixedStack(MF&: DAG.getMachineFunction(), FI)));
5170 }
5171}
5172
5173/// EmitTailCallStoreFPAndRetAddr - Move the frame pointer and return address to
5174/// the appropriate stack slot for the tail call optimized function call.
5175static SDValue EmitTailCallStoreFPAndRetAddr(SelectionDAG &DAG, SDValue Chain,
5176 SDValue OldRetAddr, SDValue OldFP,
5177 int SPDiff, const SDLoc &dl) {
5178 if (SPDiff) {
5179 // Calculate the new stack slot for the return address.
5180 MachineFunction &MF = DAG.getMachineFunction();
5181 const PPCSubtarget &Subtarget = MF.getSubtarget<PPCSubtarget>();
5182 const PPCFrameLowering *FL = Subtarget.getFrameLowering();
5183 int SlotSize = Subtarget.isPPC64() ? 8 : 4;
5184 int NewRetAddrLoc = SPDiff + FL->getReturnSaveOffset();
5185 int NewRetAddr = MF.getFrameInfo().CreateFixedObject(Size: SlotSize,
5186 SPOffset: NewRetAddrLoc, IsImmutable: true);
5187 SDValue NewRetAddrFrIdx =
5188 DAG.getFrameIndex(FI: NewRetAddr, VT: Subtarget.getScalarIntVT());
5189 Chain = DAG.getStore(Chain, dl, Val: OldRetAddr, Ptr: NewRetAddrFrIdx,
5190 PtrInfo: MachinePointerInfo::getFixedStack(MF, FI: NewRetAddr));
5191 }
5192 return Chain;
5193}
5194
5195/// CalculateTailCallArgDest - Remember Argument for later processing. Calculate
5196/// the position of the argument.
5197static void CalculateTailCallArgDest(
5198 SelectionDAG &DAG, MachineFunction &MF, bool IsPPC64, SDValue Arg,
5199 int SPDiff, unsigned ArgOffset,
5200 SmallVectorImpl<TailCallArgumentInfo> &TailCallArguments) {
5201 int Offset = ArgOffset + SPDiff;
5202 uint32_t OpSize = (Arg.getValueSizeInBits() + 7) / 8;
5203 int FI = MF.getFrameInfo().CreateFixedObject(Size: OpSize, SPOffset: Offset, IsImmutable: true);
5204 EVT VT = IsPPC64 ? MVT::i64 : MVT::i32;
5205 SDValue FIN = DAG.getFrameIndex(FI, VT);
5206 TailCallArgumentInfo Info;
5207 Info.Arg = Arg;
5208 Info.FrameIdxOp = FIN;
5209 Info.FrameIdx = FI;
5210 TailCallArguments.push_back(Elt: Info);
5211}
5212
5213/// EmitTCFPAndRetAddrLoad - Emit load from frame pointer and return address
5214/// stack slot. Returns the chain as result and the loaded frame pointers in
5215/// LROpOut/FPOpout. Used when tail calling.
5216SDValue PPCTargetLowering::EmitTailCallLoadFPAndRetAddr(
5217 SelectionDAG &DAG, int SPDiff, SDValue Chain, SDValue &LROpOut,
5218 SDValue &FPOpOut, const SDLoc &dl) const {
5219 if (SPDiff) {
5220 // Load the LR and FP stack slot for later adjusting.
5221 LROpOut = getReturnAddrFrameIndex(DAG);
5222 LROpOut = DAG.getLoad(VT: Subtarget.getScalarIntVT(), dl, Chain, Ptr: LROpOut,
5223 PtrInfo: MachinePointerInfo());
5224 Chain = SDValue(LROpOut.getNode(), 1);
5225 }
5226 return Chain;
5227}
5228
5229/// CreateCopyOfByValArgument - Make a copy of an aggregate at address specified
5230/// by "Src" to address "Dst" of size "Size". Alignment information is
5231/// specified by the specific parameter attribute. The copy will be passed as
5232/// a byval function parameter.
5233/// Sometimes what we are copying is the end of a larger object, the part that
5234/// does not fit in registers.
5235static SDValue CreateCopyOfByValArgument(SDValue Src, SDValue Dst,
5236 SDValue Chain, ISD::ArgFlagsTy Flags,
5237 SelectionDAG &DAG, const SDLoc &dl) {
5238 SDValue SizeNode = DAG.getConstant(Val: Flags.getByValSize(), DL: dl, VT: MVT::i32);
5239 Align Alignment = Flags.getNonZeroByValAlign();
5240 return DAG.getMemcpy(
5241 Chain, dl, Dst, Src, Size: SizeNode, DstAlign: Alignment, SrcAlign: Alignment, isVol: false, AlwaysInline: false,
5242 /*CI=*/nullptr, OverrideTailCall: std::nullopt, DstPtrInfo: MachinePointerInfo(), SrcPtrInfo: MachinePointerInfo());
5243}
5244
5245/// LowerMemOpCallTo - Store the argument to the stack or remember it in case of
5246/// tail calls.
5247static void LowerMemOpCallTo(
5248 SelectionDAG &DAG, MachineFunction &MF, SDValue Chain, SDValue Arg,
5249 SDValue PtrOff, int SPDiff, unsigned ArgOffset, bool isPPC64,
5250 bool isTailCall, bool isVector, SmallVectorImpl<SDValue> &MemOpChains,
5251 SmallVectorImpl<TailCallArgumentInfo> &TailCallArguments, const SDLoc &dl) {
5252 EVT PtrVT = DAG.getTargetLoweringInfo().getPointerTy(DL: DAG.getDataLayout());
5253 if (!isTailCall) {
5254 if (isVector) {
5255 SDValue StackPtr;
5256 if (isPPC64)
5257 StackPtr = DAG.getRegister(Reg: PPC::X1, VT: MVT::i64);
5258 else
5259 StackPtr = DAG.getRegister(Reg: PPC::R1, VT: MVT::i32);
5260 PtrOff = DAG.getNode(Opcode: ISD::ADD, DL: dl, VT: PtrVT, N1: StackPtr,
5261 N2: DAG.getConstant(Val: ArgOffset, DL: dl, VT: PtrVT));
5262 }
5263 MemOpChains.push_back(
5264 Elt: DAG.getStore(Chain, dl, Val: Arg, Ptr: PtrOff, PtrInfo: MachinePointerInfo()));
5265 // Calculate and remember argument location.
5266 } else
5267 CalculateTailCallArgDest(DAG, MF, IsPPC64: isPPC64, Arg, SPDiff, ArgOffset,
5268 TailCallArguments);
5269}
5270
5271static void
5272PrepareTailCall(SelectionDAG &DAG, SDValue &InGlue, SDValue &Chain,
5273 const SDLoc &dl, int SPDiff, unsigned NumBytes, SDValue LROp,
5274 SDValue FPOp,
5275 SmallVectorImpl<TailCallArgumentInfo> &TailCallArguments) {
5276 // Emit a sequence of copyto/copyfrom virtual registers for arguments that
5277 // might overwrite each other in case of tail call optimization.
5278 SmallVector<SDValue, 8> MemOpChains2;
5279 // Do not flag preceding copytoreg stuff together with the following stuff.
5280 InGlue = SDValue();
5281 StoreTailCallArgumentsToStackSlot(DAG, Chain, TailCallArgs: TailCallArguments,
5282 MemOpChains&: MemOpChains2, dl);
5283 if (!MemOpChains2.empty())
5284 Chain = DAG.getNode(Opcode: ISD::TokenFactor, DL: dl, VT: MVT::Other, Ops: MemOpChains2);
5285
5286 // Store the return address to the appropriate stack slot.
5287 Chain = EmitTailCallStoreFPAndRetAddr(DAG, Chain, OldRetAddr: LROp, OldFP: FPOp, SPDiff, dl);
5288
5289 // Emit callseq_end just before tailcall node.
5290 Chain = DAG.getCALLSEQ_END(Chain, Size1: NumBytes, Size2: 0, Glue: InGlue, DL: dl);
5291 InGlue = Chain.getValue(R: 1);
5292}
5293
5294// Is this global address that of a function that can be called by name? (as
5295// opposed to something that must hold a descriptor for an indirect call).
5296static bool isFunctionGlobalAddress(const GlobalValue *GV) {
5297 if (GV) {
5298 if (GV->isThreadLocal())
5299 return false;
5300
5301 return GV->getValueType()->isFunctionTy();
5302 }
5303
5304 return false;
5305}
5306
5307SDValue PPCTargetLowering::LowerCallResult(
5308 SDValue Chain, SDValue InGlue, CallingConv::ID CallConv, bool isVarArg,
5309 const SmallVectorImpl<ISD::InputArg> &Ins, const SDLoc &dl,
5310 SelectionDAG &DAG, SmallVectorImpl<SDValue> &InVals) const {
5311 SmallVector<CCValAssign, 16> RVLocs;
5312 CCState CCRetInfo(CallConv, isVarArg, DAG.getMachineFunction(), RVLocs,
5313 *DAG.getContext());
5314
5315 CCRetInfo.AnalyzeCallResult(
5316 Ins, Fn: (Subtarget.isSVR4ABI() && CallConv == CallingConv::Cold)
5317 ? RetCC_PPC_Cold
5318 : RetCC_PPC);
5319
5320 // Copy all of the result registers out of their specified physreg.
5321 for (unsigned i = 0, e = RVLocs.size(); i != e; ++i) {
5322 CCValAssign &VA = RVLocs[i];
5323 assert(VA.isRegLoc() && "Can only return in registers!");
5324
5325 SDValue Val;
5326
5327 if (Subtarget.hasSPE() && VA.getLocVT() == MVT::f64) {
5328 SDValue Lo = DAG.getCopyFromReg(Chain, dl, Reg: VA.getLocReg(), VT: MVT::i32,
5329 Glue: InGlue);
5330 Chain = Lo.getValue(R: 1);
5331 InGlue = Lo.getValue(R: 2);
5332 VA = RVLocs[++i]; // skip ahead to next loc
5333 SDValue Hi = DAG.getCopyFromReg(Chain, dl, Reg: VA.getLocReg(), VT: MVT::i32,
5334 Glue: InGlue);
5335 Chain = Hi.getValue(R: 1);
5336 InGlue = Hi.getValue(R: 2);
5337 if (!Subtarget.isLittleEndian())
5338 std::swap (a&: Lo, b&: Hi);
5339 Val = DAG.getNode(Opcode: PPCISD::BUILD_SPE64, DL: dl, VT: MVT::f64, N1: Lo, N2: Hi);
5340 } else {
5341 Val = DAG.getCopyFromReg(Chain, dl,
5342 Reg: VA.getLocReg(), VT: VA.getLocVT(), Glue: InGlue);
5343 Chain = Val.getValue(R: 1);
5344 InGlue = Val.getValue(R: 2);
5345 }
5346
5347 switch (VA.getLocInfo()) {
5348 default: llvm_unreachable("Unknown loc info!");
5349 case CCValAssign::Full: break;
5350 case CCValAssign::AExt:
5351 Val = DAG.getNode(Opcode: ISD::TRUNCATE, DL: dl, VT: VA.getValVT(), Operand: Val);
5352 break;
5353 case CCValAssign::ZExt:
5354 Val = DAG.getNode(Opcode: ISD::AssertZext, DL: dl, VT: VA.getLocVT(), N1: Val,
5355 N2: DAG.getValueType(VA.getValVT()));
5356 Val = DAG.getNode(Opcode: ISD::TRUNCATE, DL: dl, VT: VA.getValVT(), Operand: Val);
5357 break;
5358 case CCValAssign::SExt:
5359 Val = DAG.getNode(Opcode: ISD::AssertSext, DL: dl, VT: VA.getLocVT(), N1: Val,
5360 N2: DAG.getValueType(VA.getValVT()));
5361 Val = DAG.getNode(Opcode: ISD::TRUNCATE, DL: dl, VT: VA.getValVT(), Operand: Val);
5362 break;
5363 }
5364
5365 InVals.push_back(Elt: Val);
5366 }
5367
5368 return Chain;
5369}
5370
5371static bool isIndirectCall(const SDValue &Callee, SelectionDAG &DAG,
5372 const PPCSubtarget &Subtarget, bool isPatchPoint) {
5373 auto *G = dyn_cast<GlobalAddressSDNode>(Val: Callee);
5374 const GlobalValue *GV = G ? G->getGlobal() : nullptr;
5375
5376 // PatchPoint calls are not indirect.
5377 if (isPatchPoint)
5378 return false;
5379
5380 if (isFunctionGlobalAddress(GV) || isa<ExternalSymbolSDNode>(Val: Callee))
5381 return false;
5382
5383 // Darwin, and 32-bit ELF can use a BLA. The descriptor based ABIs can not
5384 // becuase the immediate function pointer points to a descriptor instead of
5385 // a function entry point. The ELFv2 ABI cannot use a BLA because the function
5386 // pointer immediate points to the global entry point, while the BLA would
5387 // need to jump to the local entry point (see rL211174).
5388 if (!Subtarget.usesFunctionDescriptors() && !Subtarget.isELFv2ABI() &&
5389 isBLACompatibleAddress(Op: Callee, DAG))
5390 return false;
5391
5392 return true;
5393}
5394
5395// AIX and 64-bit ELF ABIs w/o PCRel require a TOC save/restore around calls.
5396static inline bool isTOCSaveRestoreRequired(const PPCSubtarget &Subtarget) {
5397 return Subtarget.isAIXABI() ||
5398 (Subtarget.is64BitELFABI() && !Subtarget.isUsingPCRelativeCalls());
5399}
5400
5401static unsigned getCallOpcode(PPCTargetLowering::CallFlags CFlags,
5402 const Function &Caller, const SDValue &Callee,
5403 const PPCSubtarget &Subtarget,
5404 const TargetMachine &TM,
5405 bool IsStrictFPCall = false) {
5406 if (CFlags.IsTailCall)
5407 return PPCISD::TC_RETURN;
5408
5409 unsigned RetOpc = 0;
5410 // This is a call through a function pointer.
5411 if (CFlags.IsIndirect) {
5412 // AIX and the 64-bit ELF ABIs need to maintain the TOC pointer accross
5413 // indirect calls. The save of the caller's TOC pointer to the stack will be
5414 // inserted into the DAG as part of call lowering. The restore of the TOC
5415 // pointer is modeled by using a pseudo instruction for the call opcode that
5416 // represents the 2 instruction sequence of an indirect branch and link,
5417 // immediately followed by a load of the TOC pointer from the stack save
5418 // slot into gpr2. For 64-bit ELFv2 ABI with PCRel, do not restore the TOC
5419 // as it is not saved or used.
5420 if (Subtarget.usePointerGlueHelper())
5421 RetOpc = PPCISD::BL_LOAD_TOC;
5422 else
5423 RetOpc = isTOCSaveRestoreRequired(Subtarget) ? PPCISD::BCTRL_LOAD_TOC
5424 : PPCISD::BCTRL;
5425 } else if (Subtarget.isUsingPCRelativeCalls()) {
5426 assert(Subtarget.is64BitELFABI() && "PC Relative is only on ELF ABI.");
5427 RetOpc = PPCISD::CALL_NOTOC;
5428 } else if (Subtarget.isAIXABI() || Subtarget.is64BitELFABI()) {
5429 // The ABIs that maintain a TOC pointer accross calls need to have a nop
5430 // immediately following the call instruction if the caller and callee may
5431 // have different TOC bases. At link time if the linker determines the calls
5432 // may not share a TOC base, the call is redirected to a trampoline inserted
5433 // by the linker. The trampoline will (among other things) save the callers
5434 // TOC pointer at an ABI designated offset in the linkage area and the
5435 // linker will rewrite the nop to be a load of the TOC pointer from the
5436 // linkage area into gpr2.
5437 auto *G = dyn_cast<GlobalAddressSDNode>(Val: Callee);
5438 const GlobalValue *GV = G ? G->getGlobal() : nullptr;
5439 RetOpc =
5440 callsShareTOCBase(Caller: &Caller, CalleeGV: GV, TM) ? PPCISD::CALL : PPCISD::CALL_NOP;
5441 } else
5442 RetOpc = PPCISD::CALL;
5443 if (IsStrictFPCall) {
5444 switch (RetOpc) {
5445 default:
5446 llvm_unreachable("Unknown call opcode");
5447 case PPCISD::BCTRL_LOAD_TOC:
5448 RetOpc = PPCISD::BCTRL_LOAD_TOC_RM;
5449 break;
5450 case PPCISD::BCTRL:
5451 RetOpc = PPCISD::BCTRL_RM;
5452 break;
5453 case PPCISD::BL_LOAD_TOC:
5454 RetOpc = PPCISD::BL_LOAD_TOC_RM;
5455 break;
5456 case PPCISD::CALL_NOTOC:
5457 RetOpc = PPCISD::CALL_NOTOC_RM;
5458 break;
5459 case PPCISD::CALL:
5460 RetOpc = PPCISD::CALL_RM;
5461 break;
5462 case PPCISD::CALL_NOP:
5463 RetOpc = PPCISD::CALL_NOP_RM;
5464 break;
5465 }
5466 }
5467 return RetOpc;
5468}
5469
5470static SDValue transformCallee(const SDValue &Callee, SelectionDAG &DAG,
5471 const SDLoc &dl, const PPCSubtarget &Subtarget) {
5472 if (!Subtarget.usesFunctionDescriptors() && !Subtarget.isELFv2ABI())
5473 if (SDNode *Dest = isBLACompatibleAddress(Op: Callee, DAG))
5474 return SDValue(Dest, 0);
5475
5476 // Returns true if the callee is local, and false otherwise.
5477 auto isLocalCallee = [&]() {
5478 const GlobalAddressSDNode *G = dyn_cast<GlobalAddressSDNode>(Val: Callee);
5479 const GlobalValue *GV = G ? G->getGlobal() : nullptr;
5480
5481 return DAG.getTarget().shouldAssumeDSOLocal(GV) &&
5482 !isa_and_nonnull<GlobalIFunc>(Val: GV);
5483 };
5484
5485 // The PLT is only used in 32-bit ELF PIC mode. Attempting to use the PLT in
5486 // a static relocation model causes some versions of GNU LD (2.17.50, at
5487 // least) to force BSS-PLT, instead of secure-PLT, even if all objects are
5488 // built with secure-PLT.
5489 bool UsePlt =
5490 Subtarget.is32BitELFABI() && !isLocalCallee() &&
5491 Subtarget.getTargetMachine().getRelocationModel() == Reloc::PIC_;
5492
5493 const auto getAIXFuncEntryPointSymbolSDNode = [&](const GlobalValue *GV) {
5494 const TargetMachine &TM = Subtarget.getTargetMachine();
5495 const TargetLoweringObjectFile *TLOF = TM.getObjFileLowering();
5496 auto *S =
5497 static_cast<MCSymbolXCOFF *>(TLOF->getFunctionEntryPointSymbol(Func: GV, TM));
5498
5499 MVT PtrVT = DAG.getTargetLoweringInfo().getPointerTy(DL: DAG.getDataLayout());
5500 return DAG.getMCSymbol(Sym: S, VT: PtrVT);
5501 };
5502
5503 auto *G = dyn_cast<GlobalAddressSDNode>(Val: Callee);
5504 const GlobalValue *GV = G ? G->getGlobal() : nullptr;
5505 if (isFunctionGlobalAddress(GV)) {
5506 const GlobalValue *GV = cast<GlobalAddressSDNode>(Val: Callee)->getGlobal();
5507
5508 if (Subtarget.isAIXABI()) {
5509 return getAIXFuncEntryPointSymbolSDNode(GV);
5510 }
5511 return DAG.getTargetGlobalAddress(GV, DL: dl, VT: Callee.getValueType(), offset: 0,
5512 TargetFlags: UsePlt ? PPCII::MO_PLT : 0);
5513 }
5514
5515 if (ExternalSymbolSDNode *S = dyn_cast<ExternalSymbolSDNode>(Val: Callee)) {
5516 const char *SymName = S->getSymbol();
5517 if (Subtarget.isAIXABI()) {
5518 // If there exists a user-declared function whose name is the same as the
5519 // ExternalSymbol's, then we pick up the user-declared version.
5520 const Module *Mod = DAG.getMachineFunction().getFunction().getParent();
5521 if (const Function *F =
5522 dyn_cast_or_null<Function>(Val: Mod->getNamedValue(Name: SymName)))
5523 return getAIXFuncEntryPointSymbolSDNode(F);
5524
5525 // On AIX, direct function calls reference the symbol for the function's
5526 // entry point, which is named by prepending a "." before the function's
5527 // C-linkage name. A Qualname is returned here because an external
5528 // function entry point is a csect with XTY_ER property.
5529 const auto getExternalFunctionEntryPointSymbol = [&](StringRef SymName) {
5530 auto &Context = DAG.getMachineFunction().getContext();
5531 MCSectionXCOFF *Sec = Context.getXCOFFSection(
5532 Section: (Twine(".") + Twine(SymName)).str(), K: SectionKind::getMetadata(),
5533 CsectProp: XCOFF::CsectProperties(XCOFF::XMC_PR, XCOFF::XTY_ER));
5534 return Sec->getQualNameSymbol();
5535 };
5536
5537 SymName = getExternalFunctionEntryPointSymbol(SymName)->getName().data();
5538 }
5539 return DAG.getTargetExternalSymbol(Sym: SymName, VT: Callee.getValueType(),
5540 TargetFlags: UsePlt ? PPCII::MO_PLT : 0);
5541 }
5542
5543 // No transformation needed.
5544 assert(Callee.getNode() && "What no callee?");
5545 return Callee;
5546}
5547
5548static SDValue getOutputChainFromCallSeq(SDValue CallSeqStart) {
5549 assert(CallSeqStart.getOpcode() == ISD::CALLSEQ_START &&
5550 "Expected a CALLSEQ_STARTSDNode.");
5551
5552 // The last operand is the chain, except when the node has glue. If the node
5553 // has glue, then the last operand is the glue, and the chain is the second
5554 // last operand.
5555 SDValue LastValue = CallSeqStart.getValue(R: CallSeqStart->getNumValues() - 1);
5556 if (LastValue.getValueType() != MVT::Glue)
5557 return LastValue;
5558
5559 return CallSeqStart.getValue(R: CallSeqStart->getNumValues() - 2);
5560}
5561
5562// Creates the node that moves a functions address into the count register
5563// to prepare for an indirect call instruction.
5564static void prepareIndirectCall(SelectionDAG &DAG, SDValue &Callee,
5565 SDValue &Glue, SDValue &Chain,
5566 const SDLoc &dl) {
5567 SDValue MTCTROps[] = {Chain, Callee, Glue};
5568 EVT ReturnTypes[] = {MVT::Other, MVT::Glue};
5569 Chain = DAG.getNode(Opcode: PPCISD::MTCTR, DL: dl, ResultTys: ReturnTypes,
5570 Ops: ArrayRef(MTCTROps, Glue.getNode() ? 3 : 2));
5571 // The glue is the second value produced.
5572 Glue = Chain.getValue(R: 1);
5573}
5574
5575static void prepareDescriptorIndirectCall(SelectionDAG &DAG, SDValue &Callee,
5576 SDValue &Glue, SDValue &Chain,
5577 SDValue CallSeqStart,
5578 const CallBase *CB, const SDLoc &dl,
5579 bool hasNest,
5580 const PPCSubtarget &Subtarget) {
5581 // Function pointers in the 64-bit SVR4 ABI do not point to the function
5582 // entry point, but to the function descriptor (the function entry point
5583 // address is part of the function descriptor though).
5584 // The function descriptor is a three doubleword structure with the
5585 // following fields: function entry point, TOC base address and
5586 // environment pointer.
5587 // Thus for a call through a function pointer, the following actions need
5588 // to be performed:
5589 // 1. Save the TOC of the caller in the TOC save area of its stack
5590 // frame (this is done in LowerCall_Darwin() or LowerCall_64SVR4()).
5591 // 2. Load the address of the function entry point from the function
5592 // descriptor.
5593 // 3. Load the TOC of the callee from the function descriptor into r2.
5594 // 4. Load the environment pointer from the function descriptor into
5595 // r11.
5596 // 5. Branch to the function entry point address.
5597 // 6. On return of the callee, the TOC of the caller needs to be
5598 // restored (this is done in FinishCall()).
5599 //
5600 // The loads are scheduled at the beginning of the call sequence, and the
5601 // register copies are flagged together to ensure that no other
5602 // operations can be scheduled in between. E.g. without flagging the
5603 // copies together, a TOC access in the caller could be scheduled between
5604 // the assignment of the callee TOC and the branch to the callee, which leads
5605 // to incorrect code.
5606
5607 // Start by loading the function address from the descriptor.
5608 SDValue LDChain = getOutputChainFromCallSeq(CallSeqStart);
5609 auto MMOFlags = Subtarget.hasInvariantFunctionDescriptors()
5610 ? (MachineMemOperand::MODereferenceable |
5611 MachineMemOperand::MOInvariant)
5612 : MachineMemOperand::MONone;
5613
5614 MachinePointerInfo MPI(CB ? CB->getCalledOperand() : nullptr);
5615
5616 // Registers used in building the DAG.
5617 const MCRegister EnvPtrReg = Subtarget.getEnvironmentPointerRegister();
5618 const MCRegister TOCReg = Subtarget.getTOCPointerRegister();
5619
5620 // Offsets of descriptor members.
5621 const unsigned TOCAnchorOffset = Subtarget.descriptorTOCAnchorOffset();
5622 const unsigned EnvPtrOffset = Subtarget.descriptorEnvironmentPointerOffset();
5623
5624 const MVT RegVT = Subtarget.getScalarIntVT();
5625 const Align Alignment = Subtarget.isPPC64() ? Align(8) : Align(4);
5626
5627 // One load for the functions entry point address.
5628 SDValue LoadFuncPtr = DAG.getLoad(VT: RegVT, dl, Chain: LDChain, Ptr: Callee, PtrInfo: MPI,
5629 Alignment, MMOFlags);
5630
5631 // One for loading the TOC anchor for the module that contains the called
5632 // function.
5633 SDValue TOCOff = DAG.getIntPtrConstant(Val: TOCAnchorOffset, DL: dl);
5634 SDValue AddTOC = DAG.getNode(Opcode: ISD::ADD, DL: dl, VT: RegVT, N1: Callee, N2: TOCOff);
5635 SDValue TOCPtr =
5636 DAG.getLoad(VT: RegVT, dl, Chain: LDChain, Ptr: AddTOC,
5637 PtrInfo: MPI.getWithOffset(O: TOCAnchorOffset), Alignment, MMOFlags);
5638
5639 // One for loading the environment pointer.
5640 SDValue PtrOff = DAG.getIntPtrConstant(Val: EnvPtrOffset, DL: dl);
5641 SDValue AddPtr = DAG.getNode(Opcode: ISD::ADD, DL: dl, VT: RegVT, N1: Callee, N2: PtrOff);
5642 SDValue LoadEnvPtr =
5643 DAG.getLoad(VT: RegVT, dl, Chain: LDChain, Ptr: AddPtr,
5644 PtrInfo: MPI.getWithOffset(O: EnvPtrOffset), Alignment, MMOFlags);
5645
5646
5647 // Then copy the newly loaded TOC anchor to the TOC pointer.
5648 SDValue TOCVal = DAG.getCopyToReg(Chain, dl, Reg: TOCReg, N: TOCPtr, Glue);
5649 Chain = TOCVal.getValue(R: 0);
5650 Glue = TOCVal.getValue(R: 1);
5651
5652 // If the function call has an explicit 'nest' parameter, it takes the
5653 // place of the environment pointer.
5654 assert((!hasNest || !Subtarget.isAIXABI()) &&
5655 "Nest parameter is not supported on AIX.");
5656 if (!hasNest) {
5657 SDValue EnvVal = DAG.getCopyToReg(Chain, dl, Reg: EnvPtrReg, N: LoadEnvPtr, Glue);
5658 Chain = EnvVal.getValue(R: 0);
5659 Glue = EnvVal.getValue(R: 1);
5660 }
5661
5662 // The rest of the indirect call sequence is the same as the non-descriptor
5663 // DAG.
5664 prepareIndirectCall(DAG, Callee&: LoadFuncPtr, Glue, Chain, dl);
5665}
5666
5667static void prepareOutOfLineGlueCall(SelectionDAG &DAG, SDValue &Callee,
5668 SDValue &Glue, SDValue &Chain,
5669 SDValue CallSeqStart, const CallBase *CB,
5670 const SDLoc &dl, bool hasNest,
5671 const PPCSubtarget &Subtarget) {
5672 // On AIX there is a feature ("out of line glue code") which uses a special
5673 // trampoline function ._ptrgl to do the indirect call. If this option is
5674 // enabled we instead simply load the address of the descriptor into gpr11,
5675 // with the arguments in the 'normal' registers and branch to the ._ptrgl
5676 // stub.
5677 const MCRegister PtrGlueReg = Subtarget.getGlueCodeDescriptorRegister();
5678 SDValue MoveToPhysicalReg =
5679 DAG.getCopyToReg(Chain, dl, Reg: PtrGlueReg, N: Callee, Glue);
5680 Chain = MoveToPhysicalReg.getValue(R: 0);
5681 Glue = MoveToPhysicalReg.getValue(R: 1);
5682}
5683
5684static void
5685buildCallOperands(SmallVectorImpl<SDValue> &Ops,
5686 PPCTargetLowering::CallFlags CFlags, const SDLoc &dl,
5687 SelectionDAG &DAG,
5688 SmallVector<std::pair<unsigned, SDValue>, 8> &RegsToPass,
5689 SDValue Glue, SDValue Chain, SDValue &Callee, int SPDiff,
5690 const PPCSubtarget &Subtarget) {
5691 const bool IsPPC64 = Subtarget.isPPC64();
5692 // MVT for a general purpose register.
5693 const MVT RegVT = Subtarget.getScalarIntVT();
5694
5695 // First operand is always the chain.
5696 Ops.push_back(Elt: Chain);
5697
5698 // If it's a direct call pass the callee as the second operand.
5699 if (!CFlags.IsIndirect)
5700 Ops.push_back(Elt: Callee);
5701 else if (Subtarget.usePointerGlueHelper()) {
5702 Ops.push_back(Elt: Callee);
5703 // Add the register used to pass the descriptor address.
5704 Ops.push_back(
5705 Elt: DAG.getRegister(Reg: Subtarget.getGlueCodeDescriptorRegister(), VT: RegVT));
5706 } else {
5707 assert(!CFlags.IsPatchPoint && "Patch point calls are not indirect.");
5708
5709 // For the TOC based ABIs, we have saved the TOC pointer to the linkage area
5710 // on the stack (this would have been done in `LowerCall_64SVR4` or
5711 // `LowerCall_AIX`). The call instruction is a pseudo instruction that
5712 // represents both the indirect branch and a load that restores the TOC
5713 // pointer from the linkage area. The operand for the TOC restore is an add
5714 // of the TOC save offset to the stack pointer. This must be the second
5715 // operand: after the chain input but before any other variadic arguments.
5716 // For 64-bit ELFv2 ABI with PCRel, do not restore the TOC as it is not
5717 // saved or used.
5718 if (isTOCSaveRestoreRequired(Subtarget)) {
5719 const MCRegister StackPtrReg = Subtarget.getStackPointerRegister();
5720
5721 SDValue StackPtr = DAG.getRegister(Reg: StackPtrReg, VT: RegVT);
5722 unsigned TOCSaveOffset = Subtarget.getFrameLowering()->getTOCSaveOffset();
5723 SDValue TOCOff = DAG.getIntPtrConstant(Val: TOCSaveOffset, DL: dl);
5724 SDValue AddTOC = DAG.getNode(Opcode: ISD::ADD, DL: dl, VT: RegVT, N1: StackPtr, N2: TOCOff);
5725 Ops.push_back(Elt: AddTOC);
5726 }
5727
5728 // Add the register used for the environment pointer.
5729 if (Subtarget.usesFunctionDescriptors() && !CFlags.HasNest)
5730 Ops.push_back(Elt: DAG.getRegister(Reg: Subtarget.getEnvironmentPointerRegister(),
5731 VT: RegVT));
5732
5733
5734 // Add CTR register as callee so a bctr can be emitted later.
5735 if (CFlags.IsTailCall)
5736 Ops.push_back(Elt: DAG.getRegister(Reg: IsPPC64 ? PPC::CTR8 : PPC::CTR, VT: RegVT));
5737 }
5738
5739 // If this is a tail call add stack pointer delta.
5740 if (CFlags.IsTailCall)
5741 Ops.push_back(Elt: DAG.getConstant(Val: SPDiff, DL: dl, VT: MVT::i32));
5742
5743 // Add argument registers to the end of the list so that they are known live
5744 // into the call.
5745 for (const auto &[Reg, N] : RegsToPass)
5746 Ops.push_back(Elt: DAG.getRegister(Reg, VT: N.getValueType()));
5747
5748 // We cannot add R2/X2 as an operand here for PATCHPOINT, because there is
5749 // no way to mark dependencies as implicit here.
5750 // We will add the R2/X2 dependency in EmitInstrWithCustomInserter.
5751 if ((Subtarget.is64BitELFABI() || Subtarget.isAIXABI()) &&
5752 !CFlags.IsPatchPoint && !Subtarget.isUsingPCRelativeCalls())
5753 Ops.push_back(Elt: DAG.getRegister(Reg: Subtarget.getTOCPointerRegister(), VT: RegVT));
5754
5755 // Add implicit use of CR bit 6 for 32-bit SVR4 vararg calls
5756 if (CFlags.IsVarArg && Subtarget.is32BitELFABI())
5757 Ops.push_back(Elt: DAG.getRegister(Reg: PPC::CR1EQ, VT: MVT::i32));
5758
5759 // Add a register mask operand representing the call-preserved registers.
5760 const TargetRegisterInfo *TRI = Subtarget.getRegisterInfo();
5761 const uint32_t *Mask =
5762 TRI->getCallPreservedMask(MF: DAG.getMachineFunction(), CFlags.CallConv);
5763 assert(Mask && "Missing call preserved mask for calling convention");
5764 Ops.push_back(Elt: DAG.getRegisterMask(RegMask: Mask));
5765
5766 // If the glue is valid, it is the last operand.
5767 if (Glue.getNode())
5768 Ops.push_back(Elt: Glue);
5769}
5770
5771SDValue PPCTargetLowering::FinishCall(
5772 CallFlags CFlags, const SDLoc &dl, SelectionDAG &DAG,
5773 SmallVector<std::pair<unsigned, SDValue>, 8> &RegsToPass, SDValue Glue,
5774 SDValue Chain, SDValue CallSeqStart, SDValue &Callee, int SPDiff,
5775 unsigned NumBytes, const SmallVectorImpl<ISD::InputArg> &Ins,
5776 SmallVectorImpl<SDValue> &InVals, const CallBase *CB) const {
5777
5778 if ((Subtarget.is64BitELFABI() && !Subtarget.isUsingPCRelativeCalls()) ||
5779 Subtarget.isAIXABI())
5780 setUsesTOCBasePtr(DAG);
5781
5782 unsigned CallOpc =
5783 getCallOpcode(CFlags, Caller: DAG.getMachineFunction().getFunction(), Callee,
5784 Subtarget, TM: DAG.getTarget(), IsStrictFPCall: CB ? CB->isStrictFP() : false);
5785
5786 if (!CFlags.IsIndirect)
5787 Callee = transformCallee(Callee, DAG, dl, Subtarget);
5788 else if (Subtarget.usesFunctionDescriptors()) {
5789 if (Subtarget.usePointerGlueHelper()) {
5790 prepareOutOfLineGlueCall(DAG, Callee, Glue, Chain, CallSeqStart, CB, dl,
5791 hasNest: CFlags.HasNest, Subtarget);
5792 SDValue PtrGlueCallee =
5793 DAG.getExternalSymbol(Sym: "_ptrgl", VT: getPointerTy(DL: DAG.getDataLayout()));
5794 Callee = transformCallee(Callee: PtrGlueCallee, DAG, dl, Subtarget);
5795 } else {
5796 prepareDescriptorIndirectCall(DAG, Callee, Glue, Chain, CallSeqStart, CB,
5797 dl, hasNest: CFlags.HasNest, Subtarget);
5798 }
5799 } else {
5800 prepareIndirectCall(DAG, Callee, Glue, Chain, dl);
5801 }
5802
5803 // Build the operand list for the call instruction.
5804 SmallVector<SDValue, 8> Ops;
5805 buildCallOperands(Ops, CFlags, dl, DAG, RegsToPass, Glue, Chain, Callee,
5806 SPDiff, Subtarget);
5807
5808 // Emit tail call.
5809 if (CFlags.IsTailCall) {
5810 // Indirect tail call when using PC Relative calls do not have the same
5811 // constraints.
5812 assert(((Callee.getOpcode() == ISD::Register &&
5813 cast<RegisterSDNode>(Callee)->getReg() == PPC::CTR) ||
5814 Callee.getOpcode() == ISD::TargetExternalSymbol ||
5815 Callee.getOpcode() == ISD::TargetGlobalAddress ||
5816 isa<ConstantSDNode>(Callee) ||
5817 (CFlags.IsIndirect && Subtarget.isUsingPCRelativeCalls())) &&
5818 "Expecting a global address, external symbol, absolute value, "
5819 "register or an indirect tail call when PC Relative calls are "
5820 "used.");
5821 // PC Relative calls also use TC_RETURN as the way to mark tail calls.
5822 assert(CallOpc == PPCISD::TC_RETURN &&
5823 "Unexpected call opcode for a tail call.");
5824 DAG.getMachineFunction().getFrameInfo().setHasTailCall();
5825 SDValue Ret = DAG.getNode(Opcode: CallOpc, DL: dl, VT: MVT::Other, Ops);
5826 DAG.addNoMergeSiteInfo(Node: Ret.getNode(), NoMerge: CFlags.NoMerge);
5827 return Ret;
5828 }
5829
5830 std::array<EVT, 2> ReturnTypes = {._M_elems: {MVT::Other, MVT::Glue}};
5831 Chain = DAG.getNode(Opcode: CallOpc, DL: dl, ResultTys: ReturnTypes, Ops);
5832 DAG.addNoMergeSiteInfo(Node: Chain.getNode(), NoMerge: CFlags.NoMerge);
5833 Glue = Chain.getValue(R: 1);
5834
5835 // When performing tail call optimization the callee pops its arguments off
5836 // the stack. Account for this here so these bytes can be pushed back on in
5837 // PPCFrameLowering::eliminateCallFramePseudoInstr.
5838 int BytesCalleePops = (CFlags.CallConv == CallingConv::Fast &&
5839 getTargetMachine().Options.GuaranteedTailCallOpt)
5840 ? NumBytes
5841 : 0;
5842
5843 Chain = DAG.getCALLSEQ_END(Chain, Size1: NumBytes, Size2: BytesCalleePops, Glue, DL: dl);
5844 Glue = Chain.getValue(R: 1);
5845
5846 return LowerCallResult(Chain, InGlue: Glue, CallConv: CFlags.CallConv, isVarArg: CFlags.IsVarArg, Ins, dl,
5847 DAG, InVals);
5848}
5849
5850bool PPCTargetLowering::supportsTailCallFor(const CallBase *CB) const {
5851 CallingConv::ID CalleeCC = CB->getCallingConv();
5852 const Function *CallerFunc = CB->getCaller();
5853 CallingConv::ID CallerCC = CallerFunc->getCallingConv();
5854 const Function *CalleeFunc = CB->getCalledFunction();
5855 if (!CalleeFunc)
5856 return false;
5857 const GlobalValue *CalleeGV = dyn_cast<GlobalValue>(Val: CalleeFunc);
5858
5859 SmallVector<ISD::OutputArg, 2> Outs;
5860 SmallVector<ISD::InputArg, 2> Ins;
5861
5862 GetReturnInfo(CC: CalleeCC, ReturnType: CalleeFunc->getReturnType(),
5863 attr: CalleeFunc->getAttributes(), Outs, TLI: *this,
5864 DL: CalleeFunc->getDataLayout());
5865
5866 return isEligibleForTCO(CalleeGV, CalleeCC, CallerCC, CB,
5867 isVarArg: CalleeFunc->isVarArg(), Outs, Ins, CallerFunc,
5868 isCalleeExternalSymbol: false /*isCalleeExternalSymbol*/);
5869}
5870
5871bool PPCTargetLowering::isEligibleForTCO(
5872 const GlobalValue *CalleeGV, CallingConv::ID CalleeCC,
5873 CallingConv::ID CallerCC, const CallBase *CB, bool isVarArg,
5874 const SmallVectorImpl<ISD::OutputArg> &Outs,
5875 const SmallVectorImpl<ISD::InputArg> &Ins, const Function *CallerFunc,
5876 bool isCalleeExternalSymbol) const {
5877 if (Subtarget.useLongCalls() && !(CB && CB->isMustTailCall()))
5878 return false;
5879
5880 if (Subtarget.isSVR4ABI() && Subtarget.isPPC64())
5881 return IsEligibleForTailCallOptimization_64SVR4(
5882 CalleeGV, CalleeCC, CallerCC, CB, isVarArg, Outs, Ins, CallerFunc,
5883 isCalleeExternalSymbol);
5884 else
5885 return IsEligibleForTailCallOptimization(CalleeGV, CalleeCC, CallerCC,
5886 isVarArg, Ins);
5887}
5888
5889SDValue
5890PPCTargetLowering::LowerCall(TargetLowering::CallLoweringInfo &CLI,
5891 SmallVectorImpl<SDValue> &InVals) const {
5892 SelectionDAG &DAG = CLI.DAG;
5893 SDLoc &dl = CLI.DL;
5894 SmallVectorImpl<ISD::OutputArg> &Outs = CLI.Outs;
5895 SmallVectorImpl<SDValue> &OutVals = CLI.OutVals;
5896 SmallVectorImpl<ISD::InputArg> &Ins = CLI.Ins;
5897 SDValue Chain = CLI.Chain;
5898 SDValue Callee = CLI.Callee;
5899 bool &isTailCall = CLI.IsTailCall;
5900 CallingConv::ID CallConv = CLI.CallConv;
5901 bool isVarArg = CLI.IsVarArg;
5902 bool isPatchPoint = CLI.IsPatchPoint;
5903 const CallBase *CB = CLI.CB;
5904
5905 if (isTailCall) {
5906 MachineFunction &MF = DAG.getMachineFunction();
5907 CallingConv::ID CallerCC = MF.getFunction().getCallingConv();
5908 auto *G = dyn_cast<GlobalAddressSDNode>(Val&: Callee);
5909 const GlobalValue *GV = G ? G->getGlobal() : nullptr;
5910 bool IsCalleeExternalSymbol = isa<ExternalSymbolSDNode>(Val: Callee);
5911
5912 isTailCall =
5913 isEligibleForTCO(CalleeGV: GV, CalleeCC: CallConv, CallerCC, CB, isVarArg, Outs, Ins,
5914 CallerFunc: &(MF.getFunction()), isCalleeExternalSymbol: IsCalleeExternalSymbol);
5915 if (isTailCall) {
5916 ++NumTailCalls;
5917 if (!getTargetMachine().Options.GuaranteedTailCallOpt)
5918 ++NumSiblingCalls;
5919
5920 // PC Relative calls no longer guarantee that the callee is a Global
5921 // Address Node. The callee could be an indirect tail call in which
5922 // case the SDValue for the callee could be a load (to load the address
5923 // of a function pointer) or it may be a register copy (to move the
5924 // address of the callee from a function parameter into a virtual
5925 // register). It may also be an ExternalSymbolSDNode (ex memcopy).
5926 assert((Subtarget.isUsingPCRelativeCalls() ||
5927 isa<GlobalAddressSDNode>(Callee)) &&
5928 "Callee should be an llvm::Function object.");
5929
5930 LLVM_DEBUG(dbgs() << "TCO caller: " << DAG.getMachineFunction().getName()
5931 << "\nTCO callee: ");
5932 LLVM_DEBUG(Callee.dump());
5933 }
5934 }
5935
5936 if (!isTailCall && CB && CB->isMustTailCall())
5937 report_fatal_error(reason: "failed to perform tail call elimination on a call "
5938 "site marked musttail");
5939
5940 // When long calls (i.e. indirect calls) are always used, calls are always
5941 // made via function pointer. If we have a function name, first translate it
5942 // into a pointer.
5943 if (Subtarget.useLongCalls() && isa<GlobalAddressSDNode>(Val: Callee) &&
5944 !isTailCall)
5945 Callee = LowerGlobalAddress(Op: Callee, DAG);
5946
5947 CallFlags CFlags(
5948 CallConv, isTailCall, isVarArg, isPatchPoint,
5949 isIndirectCall(Callee, DAG, Subtarget, isPatchPoint),
5950 // hasNest
5951 Subtarget.is64BitELFABI() &&
5952 any_of(Range&: Outs, P: [](ISD::OutputArg Arg) { return Arg.Flags.isNest(); }),
5953 CLI.NoMerge);
5954
5955 if (Subtarget.isAIXABI())
5956 return LowerCall_AIX(Chain, Callee, CFlags, Outs, OutVals, Ins, dl, DAG,
5957 InVals, CB);
5958
5959 assert(Subtarget.isSVR4ABI());
5960 if (Subtarget.isPPC64())
5961 return LowerCall_64SVR4(Chain, Callee, CFlags, Outs, OutVals, Ins, dl, DAG,
5962 InVals, CB);
5963 return LowerCall_32SVR4(Chain, Callee, CFlags, Outs, OutVals, Ins, dl, DAG,
5964 InVals, CB);
5965}
5966
5967SDValue PPCTargetLowering::LowerCall_32SVR4(
5968 SDValue Chain, SDValue Callee, CallFlags CFlags,
5969 const SmallVectorImpl<ISD::OutputArg> &Outs,
5970 const SmallVectorImpl<SDValue> &OutVals,
5971 const SmallVectorImpl<ISD::InputArg> &Ins, const SDLoc &dl,
5972 SelectionDAG &DAG, SmallVectorImpl<SDValue> &InVals,
5973 const CallBase *CB) const {
5974 // See PPCTargetLowering::LowerFormalArguments_32SVR4() for a description
5975 // of the 32-bit SVR4 ABI stack frame layout.
5976
5977 const CallingConv::ID CallConv = CFlags.CallConv;
5978 const bool IsVarArg = CFlags.IsVarArg;
5979 const bool IsTailCall = CFlags.IsTailCall;
5980
5981 assert((CallConv == CallingConv::C ||
5982 CallConv == CallingConv::Cold ||
5983 CallConv == CallingConv::Fast) && "Unknown calling convention!");
5984
5985 const Align PtrAlign(4);
5986
5987 MachineFunction &MF = DAG.getMachineFunction();
5988
5989 // Mark this function as potentially containing a function that contains a
5990 // tail call. As a consequence the frame pointer will be used for dynamicalloc
5991 // and restoring the callers stack pointer in this functions epilog. This is
5992 // done because by tail calling the called function might overwrite the value
5993 // in this function's (MF) stack pointer stack slot 0(SP).
5994 if (getTargetMachine().Options.GuaranteedTailCallOpt &&
5995 CallConv == CallingConv::Fast)
5996 MF.getInfo<PPCFunctionInfo>()->setHasFastCall();
5997
5998 // Count how many bytes are to be pushed on the stack, including the linkage
5999 // area, parameter list area and the part of the local variable space which
6000 // contains copies of aggregates which are passed by value.
6001
6002 // Assign locations to all of the outgoing arguments.
6003 SmallVector<CCValAssign, 16> ArgLocs;
6004 CCState CCInfo(CallConv, IsVarArg, MF, ArgLocs, *DAG.getContext());
6005
6006 // Reserve space for the linkage area on the stack.
6007 CCInfo.AllocateStack(Size: Subtarget.getFrameLowering()->getLinkageSize(),
6008 Alignment: PtrAlign);
6009
6010 if (IsVarArg) {
6011 // Handle fixed and variable vector arguments differently.
6012 // Fixed vector arguments go into registers as long as registers are
6013 // available. Variable vector arguments always go into memory.
6014 unsigned NumArgs = Outs.size();
6015
6016 for (unsigned i = 0; i != NumArgs; ++i) {
6017 MVT ArgVT = Outs[i].VT;
6018 ISD::ArgFlagsTy ArgFlags = Outs[i].Flags;
6019 bool Result;
6020
6021 if (!ArgFlags.isVarArg()) {
6022 Result = CC_PPC32_SVR4(ValNo: i, ValVT: ArgVT, LocVT: ArgVT, LocInfo: CCValAssign::Full, ArgFlags,
6023 OrigTy: Outs[i].OrigTy, State&: CCInfo);
6024 } else {
6025 Result = CC_PPC32_SVR4_VarArg(ValNo: i, ValVT: ArgVT, LocVT: ArgVT, LocInfo: CCValAssign::Full,
6026 ArgFlags, OrigTy: Outs[i].OrigTy, State&: CCInfo);
6027 }
6028
6029 if (Result) {
6030#ifndef NDEBUG
6031 errs() << "Call operand #" << i << " has unhandled type "
6032 << ArgVT << "\n";
6033#endif
6034 llvm_unreachable(nullptr);
6035 }
6036 }
6037 } else {
6038 // All arguments are treated the same.
6039 CCInfo.AnalyzeCallOperands(Outs, Fn: CC_PPC32_SVR4);
6040 }
6041
6042 // Assign locations to all of the outgoing aggregate by value arguments.
6043 SmallVector<CCValAssign, 16> ByValArgLocs;
6044 CCState CCByValInfo(CallConv, IsVarArg, MF, ByValArgLocs, *DAG.getContext());
6045
6046 // Reserve stack space for the allocations in CCInfo.
6047 CCByValInfo.AllocateStack(Size: CCInfo.getStackSize(), Alignment: PtrAlign);
6048
6049 CCByValInfo.AnalyzeCallOperands(Outs, Fn: CC_PPC32_SVR4_ByVal);
6050
6051 // Size of the linkage area, parameter list area and the part of the local
6052 // space variable where copies of aggregates which are passed by value are
6053 // stored.
6054 unsigned NumBytes = CCByValInfo.getStackSize();
6055
6056 // Calculate by how many bytes the stack has to be adjusted in case of tail
6057 // call optimization.
6058 int SPDiff = CalculateTailCallSPDiff(DAG, isTailCall: IsTailCall, ParamSize: NumBytes);
6059
6060 // Adjust the stack pointer for the new arguments...
6061 // These operations are automatically eliminated by the prolog/epilog pass
6062 Chain = DAG.getCALLSEQ_START(Chain, InSize: NumBytes, OutSize: 0, DL: dl);
6063 SDValue CallSeqStart = Chain;
6064
6065 // Load the return address and frame pointer so it can be moved somewhere else
6066 // later.
6067 SDValue LROp, FPOp;
6068 Chain = EmitTailCallLoadFPAndRetAddr(DAG, SPDiff, Chain, LROpOut&: LROp, FPOpOut&: FPOp, dl);
6069
6070 // Set up a copy of the stack pointer for use loading and storing any
6071 // arguments that may not fit in the registers available for argument
6072 // passing.
6073 SDValue StackPtr = DAG.getRegister(Reg: PPC::R1, VT: MVT::i32);
6074
6075 SmallVector<std::pair<unsigned, SDValue>, 8> RegsToPass;
6076 SmallVector<TailCallArgumentInfo, 8> TailCallArguments;
6077 SmallVector<SDValue, 8> MemOpChains;
6078
6079 bool seenFloatArg = false;
6080 // Walk the register/memloc assignments, inserting copies/loads.
6081 // i - Tracks the index into the list of registers allocated for the call
6082 // RealArgIdx - Tracks the index into the list of actual function arguments
6083 // j - Tracks the index into the list of byval arguments
6084 for (unsigned i = 0, RealArgIdx = 0, j = 0, e = ArgLocs.size();
6085 i != e;
6086 ++i, ++RealArgIdx) {
6087 CCValAssign &VA = ArgLocs[i];
6088 SDValue Arg = OutVals[RealArgIdx];
6089 ISD::ArgFlagsTy Flags = Outs[RealArgIdx].Flags;
6090
6091 if (Flags.isByVal()) {
6092 // Argument is an aggregate which is passed by value, thus we need to
6093 // create a copy of it in the local variable space of the current stack
6094 // frame (which is the stack frame of the caller) and pass the address of
6095 // this copy to the callee.
6096 assert((j < ByValArgLocs.size()) && "Index out of bounds!");
6097 CCValAssign &ByValVA = ByValArgLocs[j++];
6098 assert((VA.getValNo() == ByValVA.getValNo()) && "ValNo mismatch!");
6099
6100 // Memory reserved in the local variable space of the callers stack frame.
6101 unsigned LocMemOffset = ByValVA.getLocMemOffset();
6102
6103 SDValue PtrOff = DAG.getIntPtrConstant(Val: LocMemOffset, DL: dl);
6104 PtrOff = DAG.getNode(Opcode: ISD::ADD, DL: dl, VT: getPointerTy(DL: MF.getDataLayout()),
6105 N1: StackPtr, N2: PtrOff);
6106
6107 // Create a copy of the argument in the local area of the current
6108 // stack frame.
6109 SDValue MemcpyCall =
6110 CreateCopyOfByValArgument(Src: Arg, Dst: PtrOff,
6111 Chain: CallSeqStart.getNode()->getOperand(Num: 0),
6112 Flags, DAG, dl);
6113
6114 // This must go outside the CALLSEQ_START..END.
6115 SDValue NewCallSeqStart = DAG.getCALLSEQ_START(Chain: MemcpyCall, InSize: NumBytes, OutSize: 0,
6116 DL: SDLoc(MemcpyCall));
6117 DAG.ReplaceAllUsesWith(From: CallSeqStart.getNode(),
6118 To: NewCallSeqStart.getNode());
6119 Chain = CallSeqStart = NewCallSeqStart;
6120
6121 // Pass the address of the aggregate copy on the stack either in a
6122 // physical register or in the parameter list area of the current stack
6123 // frame to the callee.
6124 Arg = PtrOff;
6125 }
6126
6127 // When useCRBits() is true, there can be i1 arguments.
6128 // It is because getRegisterType(MVT::i1) => MVT::i1,
6129 // and for other integer types getRegisterType() => MVT::i32.
6130 // Extend i1 and ensure callee will get i32.
6131 if (Arg.getValueType() == MVT::i1)
6132 Arg = DAG.getNode(Opcode: Flags.isSExt() ? ISD::SIGN_EXTEND : ISD::ZERO_EXTEND,
6133 DL: dl, VT: MVT::i32, Operand: Arg);
6134
6135 if (VA.isRegLoc()) {
6136 seenFloatArg |= VA.getLocVT().isFloatingPoint();
6137 // Put argument in a physical register.
6138 if (Subtarget.hasSPE() && Arg.getValueType() == MVT::f64) {
6139 bool IsLE = Subtarget.isLittleEndian();
6140 SDValue SVal = DAG.getNode(Opcode: PPCISD::EXTRACT_SPE, DL: dl, VT: MVT::i32, N1: Arg,
6141 N2: DAG.getIntPtrConstant(Val: IsLE ? 0 : 1, DL: dl));
6142 RegsToPass.push_back(Elt: std::make_pair(x: VA.getLocReg(), y: SVal.getValue(R: 0)));
6143 SVal = DAG.getNode(Opcode: PPCISD::EXTRACT_SPE, DL: dl, VT: MVT::i32, N1: Arg,
6144 N2: DAG.getIntPtrConstant(Val: IsLE ? 1 : 0, DL: dl));
6145 RegsToPass.push_back(Elt: std::make_pair(x: ArgLocs[++i].getLocReg(),
6146 y: SVal.getValue(R: 0)));
6147 } else
6148 RegsToPass.push_back(Elt: std::make_pair(x: VA.getLocReg(), y&: Arg));
6149 } else {
6150 // Put argument in the parameter list area of the current stack frame.
6151 assert(VA.isMemLoc());
6152 unsigned LocMemOffset = VA.getLocMemOffset();
6153
6154 if (!IsTailCall) {
6155 SDValue PtrOff = DAG.getIntPtrConstant(Val: LocMemOffset, DL: dl);
6156 PtrOff = DAG.getNode(Opcode: ISD::ADD, DL: dl, VT: getPointerTy(DL: MF.getDataLayout()),
6157 N1: StackPtr, N2: PtrOff);
6158
6159 MemOpChains.push_back(
6160 Elt: DAG.getStore(Chain, dl, Val: Arg, Ptr: PtrOff, PtrInfo: MachinePointerInfo()));
6161 } else {
6162 // Calculate and remember argument location.
6163 CalculateTailCallArgDest(DAG, MF, IsPPC64: false, Arg, SPDiff, ArgOffset: LocMemOffset,
6164 TailCallArguments);
6165 }
6166 }
6167 }
6168
6169 if (!MemOpChains.empty())
6170 Chain = DAG.getNode(Opcode: ISD::TokenFactor, DL: dl, VT: MVT::Other, Ops: MemOpChains);
6171
6172 // Build a sequence of copy-to-reg nodes chained together with token chain
6173 // and flag operands which copy the outgoing args into the appropriate regs.
6174 SDValue InGlue;
6175 for (const auto &[Reg, N] : RegsToPass) {
6176 Chain = DAG.getCopyToReg(Chain, dl, Reg, N, Glue: InGlue);
6177 InGlue = Chain.getValue(R: 1);
6178 }
6179
6180 // Set CR bit 6 to true if this is a vararg call with floating args passed in
6181 // registers.
6182 if (IsVarArg) {
6183 SDVTList VTs = DAG.getVTList(VT1: MVT::Other, VT2: MVT::Glue);
6184 SDValue Ops[] = { Chain, InGlue };
6185
6186 Chain = DAG.getNode(Opcode: seenFloatArg ? PPCISD::CR6SET : PPCISD::CR6UNSET, DL: dl,
6187 VTList: VTs, Ops: ArrayRef(Ops, InGlue.getNode() ? 2 : 1));
6188
6189 InGlue = Chain.getValue(R: 1);
6190 }
6191
6192 if (IsTailCall)
6193 PrepareTailCall(DAG, InGlue, Chain, dl, SPDiff, NumBytes, LROp, FPOp,
6194 TailCallArguments);
6195
6196 return FinishCall(CFlags, dl, DAG, RegsToPass, Glue: InGlue, Chain, CallSeqStart,
6197 Callee, SPDiff, NumBytes, Ins, InVals, CB);
6198}
6199
6200// Copy an argument into memory, being careful to do this outside the
6201// call sequence for the call to which the argument belongs.
6202SDValue PPCTargetLowering::createMemcpyOutsideCallSeq(
6203 SDValue Arg, SDValue PtrOff, SDValue CallSeqStart, ISD::ArgFlagsTy Flags,
6204 SelectionDAG &DAG, const SDLoc &dl) const {
6205 SDValue MemcpyCall = CreateCopyOfByValArgument(Src: Arg, Dst: PtrOff,
6206 Chain: CallSeqStart.getNode()->getOperand(Num: 0),
6207 Flags, DAG, dl);
6208 // The MEMCPY must go outside the CALLSEQ_START..END.
6209 int64_t FrameSize = CallSeqStart.getConstantOperandVal(i: 1);
6210 SDValue NewCallSeqStart = DAG.getCALLSEQ_START(Chain: MemcpyCall, InSize: FrameSize, OutSize: 0,
6211 DL: SDLoc(MemcpyCall));
6212 DAG.ReplaceAllUsesWith(From: CallSeqStart.getNode(),
6213 To: NewCallSeqStart.getNode());
6214 return NewCallSeqStart;
6215}
6216
6217SDValue PPCTargetLowering::LowerCall_64SVR4(
6218 SDValue Chain, SDValue Callee, CallFlags CFlags,
6219 const SmallVectorImpl<ISD::OutputArg> &Outs,
6220 const SmallVectorImpl<SDValue> &OutVals,
6221 const SmallVectorImpl<ISD::InputArg> &Ins, const SDLoc &dl,
6222 SelectionDAG &DAG, SmallVectorImpl<SDValue> &InVals,
6223 const CallBase *CB) const {
6224 bool isELFv2ABI = Subtarget.isELFv2ABI();
6225 bool isLittleEndian = Subtarget.isLittleEndian();
6226 unsigned NumOps = Outs.size();
6227 bool IsSibCall = false;
6228 bool IsFastCall = CFlags.CallConv == CallingConv::Fast;
6229
6230 EVT PtrVT = getPointerTy(DL: DAG.getDataLayout());
6231 unsigned PtrByteSize = 8;
6232
6233 MachineFunction &MF = DAG.getMachineFunction();
6234
6235 if (CFlags.IsTailCall && !getTargetMachine().Options.GuaranteedTailCallOpt)
6236 IsSibCall = true;
6237
6238 // Mark this function as potentially containing a function that contains a
6239 // tail call. As a consequence the frame pointer will be used for dynamicalloc
6240 // and restoring the callers stack pointer in this functions epilog. This is
6241 // done because by tail calling the called function might overwrite the value
6242 // in this function's (MF) stack pointer stack slot 0(SP).
6243 if (getTargetMachine().Options.GuaranteedTailCallOpt && IsFastCall)
6244 MF.getInfo<PPCFunctionInfo>()->setHasFastCall();
6245
6246 assert(!(IsFastCall && CFlags.IsVarArg) &&
6247 "fastcc not supported on varargs functions");
6248
6249 // Count how many bytes are to be pushed on the stack, including the linkage
6250 // area, and parameter passing area. On ELFv1, the linkage area is 48 bytes
6251 // reserved space for [SP][CR][LR][2 x unused][TOC]; on ELFv2, the linkage
6252 // area is 32 bytes reserved space for [SP][CR][LR][TOC].
6253 unsigned LinkageSize = Subtarget.getFrameLowering()->getLinkageSize();
6254 unsigned NumBytes = LinkageSize;
6255 unsigned GPR_idx = 0, FPR_idx = 0, VR_idx = 0;
6256
6257 static const MCPhysReg GPR[] = {
6258 PPC::X3, PPC::X4, PPC::X5, PPC::X6,
6259 PPC::X7, PPC::X8, PPC::X9, PPC::X10,
6260 };
6261 static const MCPhysReg VR[] = {
6262 PPC::V2, PPC::V3, PPC::V4, PPC::V5, PPC::V6, PPC::V7, PPC::V8,
6263 PPC::V9, PPC::V10, PPC::V11, PPC::V12, PPC::V13
6264 };
6265
6266 const unsigned NumGPRs = std::size(GPR);
6267 const unsigned NumFPRs = useSoftFloat() ? 0 : 13;
6268 const unsigned NumVRs = std::size(VR);
6269
6270 // On ELFv2, we can avoid allocating the parameter area if all the arguments
6271 // can be passed to the callee in registers.
6272 // For the fast calling convention, there is another check below.
6273 // Note: We should keep consistent with LowerFormalArguments_64SVR4()
6274 bool HasParameterArea = !isELFv2ABI || CFlags.IsVarArg || IsFastCall;
6275 if (!HasParameterArea) {
6276 unsigned ParamAreaSize = NumGPRs * PtrByteSize;
6277 unsigned AvailableFPRs = NumFPRs;
6278 unsigned AvailableVRs = NumVRs;
6279 unsigned NumBytesTmp = NumBytes;
6280 for (unsigned i = 0; i != NumOps; ++i) {
6281 if (Outs[i].Flags.isNest()) continue;
6282 if (CalculateStackSlotUsed(ArgVT: Outs[i].VT, OrigVT: Outs[i].ArgVT, Flags: Outs[i].Flags,
6283 PtrByteSize, LinkageSize, ParamAreaSize,
6284 ArgOffset&: NumBytesTmp, AvailableFPRs, AvailableVRs))
6285 HasParameterArea = true;
6286 }
6287 }
6288
6289 // When using the fast calling convention, we don't provide backing for
6290 // arguments that will be in registers.
6291 unsigned NumGPRsUsed = 0, NumFPRsUsed = 0, NumVRsUsed = 0;
6292
6293 // Avoid allocating parameter area for fastcc functions if all the arguments
6294 // can be passed in the registers.
6295 if (IsFastCall)
6296 HasParameterArea = false;
6297
6298 // Add up all the space actually used.
6299 for (unsigned i = 0; i != NumOps; ++i) {
6300 ISD::ArgFlagsTy Flags = Outs[i].Flags;
6301 EVT ArgVT = Outs[i].VT;
6302 EVT OrigVT = Outs[i].ArgVT;
6303
6304 if (Flags.isNest())
6305 continue;
6306
6307 if (IsFastCall) {
6308 if (Flags.isByVal()) {
6309 NumGPRsUsed += (Flags.getByValSize()+7)/8;
6310 if (NumGPRsUsed > NumGPRs)
6311 HasParameterArea = true;
6312 } else {
6313 switch (ArgVT.getSimpleVT().SimpleTy) {
6314 default: llvm_unreachable("Unexpected ValueType for argument!");
6315 case MVT::i1:
6316 case MVT::i32:
6317 case MVT::i64:
6318 if (++NumGPRsUsed <= NumGPRs)
6319 continue;
6320 break;
6321 case MVT::v4i32:
6322 case MVT::v8i16:
6323 case MVT::v16i8:
6324 case MVT::v2f64:
6325 case MVT::v2i64:
6326 case MVT::v1i128:
6327 case MVT::f128:
6328 if (++NumVRsUsed <= NumVRs)
6329 continue;
6330 break;
6331 case MVT::v4f32:
6332 if (++NumVRsUsed <= NumVRs)
6333 continue;
6334 break;
6335 case MVT::f32:
6336 case MVT::f64:
6337 if (++NumFPRsUsed <= NumFPRs)
6338 continue;
6339 break;
6340 }
6341 HasParameterArea = true;
6342 }
6343 }
6344
6345 /* Respect alignment of argument on the stack. */
6346 auto Alignement =
6347 CalculateStackSlotAlignment(ArgVT, OrigVT, Flags, PtrByteSize);
6348 NumBytes = alignTo(Size: NumBytes, A: Alignement);
6349
6350 NumBytes += CalculateStackSlotSize(ArgVT, Flags, PtrByteSize);
6351 if (Flags.isInConsecutiveRegsLast())
6352 NumBytes = ((NumBytes + PtrByteSize - 1)/PtrByteSize) * PtrByteSize;
6353 }
6354
6355 unsigned NumBytesActuallyUsed = NumBytes;
6356
6357 // In the old ELFv1 ABI,
6358 // the prolog code of the callee may store up to 8 GPR argument registers to
6359 // the stack, allowing va_start to index over them in memory if its varargs.
6360 // Because we cannot tell if this is needed on the caller side, we have to
6361 // conservatively assume that it is needed. As such, make sure we have at
6362 // least enough stack space for the caller to store the 8 GPRs.
6363 // In the ELFv2 ABI, we allocate the parameter area iff a callee
6364 // really requires memory operands, e.g. a vararg function.
6365 if (HasParameterArea)
6366 NumBytes = std::max(a: NumBytes, b: LinkageSize + 8 * PtrByteSize);
6367 else
6368 NumBytes = LinkageSize;
6369
6370 // Tail call needs the stack to be aligned.
6371 if (getTargetMachine().Options.GuaranteedTailCallOpt && IsFastCall)
6372 NumBytes = EnsureStackAlignment(Lowering: Subtarget.getFrameLowering(), NumBytes);
6373
6374 int SPDiff = 0;
6375
6376 // Calculate by how many bytes the stack has to be adjusted in case of tail
6377 // call optimization.
6378 if (!IsSibCall)
6379 SPDiff = CalculateTailCallSPDiff(DAG, isTailCall: CFlags.IsTailCall, ParamSize: NumBytes);
6380
6381 // To protect arguments on the stack from being clobbered in a tail call,
6382 // force all the loads to happen before doing any other lowering.
6383 if (CFlags.IsTailCall)
6384 Chain = DAG.getStackArgumentTokenFactor(Chain);
6385
6386 // Adjust the stack pointer for the new arguments...
6387 // These operations are automatically eliminated by the prolog/epilog pass
6388 if (!IsSibCall)
6389 Chain = DAG.getCALLSEQ_START(Chain, InSize: NumBytes, OutSize: 0, DL: dl);
6390 SDValue CallSeqStart = Chain;
6391
6392 // Load the return address and frame pointer so it can be move somewhere else
6393 // later.
6394 SDValue LROp, FPOp;
6395 Chain = EmitTailCallLoadFPAndRetAddr(DAG, SPDiff, Chain, LROpOut&: LROp, FPOpOut&: FPOp, dl);
6396
6397 // Set up a copy of the stack pointer for use loading and storing any
6398 // arguments that may not fit in the registers available for argument
6399 // passing.
6400 SDValue StackPtr = DAG.getRegister(Reg: PPC::X1, VT: MVT::i64);
6401
6402 // Figure out which arguments are going to go in registers, and which in
6403 // memory. Also, if this is a vararg function, floating point operations
6404 // must be stored to our stack, and loaded into integer regs as well, if
6405 // any integer regs are available for argument passing.
6406 unsigned ArgOffset = LinkageSize;
6407
6408 SmallVector<std::pair<unsigned, SDValue>, 8> RegsToPass;
6409 SmallVector<TailCallArgumentInfo, 8> TailCallArguments;
6410
6411 SmallVector<SDValue, 8> MemOpChains;
6412 for (unsigned i = 0; i != NumOps; ++i) {
6413 SDValue Arg = OutVals[i];
6414 ISD::ArgFlagsTy Flags = Outs[i].Flags;
6415 EVT ArgVT = Outs[i].VT;
6416 EVT OrigVT = Outs[i].ArgVT;
6417
6418 // PtrOff will be used to store the current argument to the stack if a
6419 // register cannot be found for it.
6420 SDValue PtrOff;
6421
6422 // We re-align the argument offset for each argument, except when using the
6423 // fast calling convention, when we need to make sure we do that only when
6424 // we'll actually use a stack slot.
6425 auto ComputePtrOff = [&]() {
6426 /* Respect alignment of argument on the stack. */
6427 auto Alignment =
6428 CalculateStackSlotAlignment(ArgVT, OrigVT, Flags, PtrByteSize);
6429 ArgOffset = alignTo(Size: ArgOffset, A: Alignment);
6430
6431 PtrOff = DAG.getConstant(Val: ArgOffset, DL: dl, VT: StackPtr.getValueType());
6432
6433 PtrOff = DAG.getNode(Opcode: ISD::ADD, DL: dl, VT: PtrVT, N1: StackPtr, N2: PtrOff);
6434 };
6435
6436 if (!IsFastCall) {
6437 ComputePtrOff();
6438
6439 /* Compute GPR index associated with argument offset. */
6440 GPR_idx = (ArgOffset - LinkageSize) / PtrByteSize;
6441 GPR_idx = std::min(a: GPR_idx, b: NumGPRs);
6442 }
6443
6444 // Promote integers to 64-bit values.
6445 if (Arg.getValueType() == MVT::i32 || Arg.getValueType() == MVT::i1) {
6446 // FIXME: Should this use ANY_EXTEND if neither sext nor zext?
6447 unsigned ExtOp = Flags.isSExt() ? ISD::SIGN_EXTEND : ISD::ZERO_EXTEND;
6448 Arg = DAG.getNode(Opcode: ExtOp, DL: dl, VT: MVT::i64, Operand: Arg);
6449 }
6450
6451 // FIXME memcpy is used way more than necessary. Correctness first.
6452 // Note: "by value" is code for passing a structure by value, not
6453 // basic types.
6454 if (Flags.isByVal()) {
6455 // Note: Size includes alignment padding, so
6456 // struct x { short a; char b; }
6457 // will have Size = 4. With #pragma pack(1), it will have Size = 3.
6458 // These are the proper values we need for right-justifying the
6459 // aggregate in a parameter register.
6460 unsigned Size = Flags.getByValSize();
6461
6462 // An empty aggregate parameter takes up no storage and no
6463 // registers.
6464 if (Size == 0)
6465 continue;
6466
6467 if (IsFastCall)
6468 ComputePtrOff();
6469
6470 // All aggregates smaller than 8 bytes must be passed right-justified.
6471 if (Size==1 || Size==2 || Size==4) {
6472 EVT VT = (Size==1) ? MVT::i8 : ((Size==2) ? MVT::i16 : MVT::i32);
6473 if (GPR_idx != NumGPRs) {
6474 SDValue Load = DAG.getExtLoad(ExtType: ISD::EXTLOAD, dl, VT: PtrVT, Chain, Ptr: Arg,
6475 PtrInfo: MachinePointerInfo(), MemVT: VT);
6476 MemOpChains.push_back(Elt: Load.getValue(R: 1));
6477 RegsToPass.push_back(Elt: std::make_pair(x: GPR[GPR_idx++], y&: Load));
6478
6479 ArgOffset += PtrByteSize;
6480 continue;
6481 }
6482 }
6483
6484 if (GPR_idx == NumGPRs && Size < 8) {
6485 SDValue AddPtr = PtrOff;
6486 if (!isLittleEndian) {
6487 SDValue Const = DAG.getConstant(Val: PtrByteSize - Size, DL: dl,
6488 VT: PtrOff.getValueType());
6489 AddPtr = DAG.getNode(Opcode: ISD::ADD, DL: dl, VT: PtrVT, N1: PtrOff, N2: Const);
6490 }
6491 Chain = CallSeqStart = createMemcpyOutsideCallSeq(Arg, PtrOff: AddPtr,
6492 CallSeqStart,
6493 Flags, DAG, dl);
6494 ArgOffset += PtrByteSize;
6495 continue;
6496 }
6497 // Copy the object to parameter save area if it can not be entirely passed
6498 // by registers.
6499 // FIXME: we only need to copy the parts which need to be passed in
6500 // parameter save area. For the parts passed by registers, we don't need
6501 // to copy them to the stack although we need to allocate space for them
6502 // in parameter save area.
6503 if ((NumGPRs - GPR_idx) * PtrByteSize < Size)
6504 Chain = CallSeqStart = createMemcpyOutsideCallSeq(Arg, PtrOff,
6505 CallSeqStart,
6506 Flags, DAG, dl);
6507
6508 // When a register is available, pass a small aggregate right-justified.
6509 if (Size < 8 && GPR_idx != NumGPRs) {
6510 // The easiest way to get this right-justified in a register
6511 // is to copy the structure into the rightmost portion of a
6512 // local variable slot, then load the whole slot into the
6513 // register.
6514 // FIXME: The memcpy seems to produce pretty awful code for
6515 // small aggregates, particularly for packed ones.
6516 // FIXME: It would be preferable to use the slot in the
6517 // parameter save area instead of a new local variable.
6518 SDValue AddPtr = PtrOff;
6519 if (!isLittleEndian) {
6520 SDValue Const = DAG.getConstant(Val: 8 - Size, DL: dl, VT: PtrOff.getValueType());
6521 AddPtr = DAG.getNode(Opcode: ISD::ADD, DL: dl, VT: PtrVT, N1: PtrOff, N2: Const);
6522 }
6523 Chain = CallSeqStart = createMemcpyOutsideCallSeq(Arg, PtrOff: AddPtr,
6524 CallSeqStart,
6525 Flags, DAG, dl);
6526
6527 // Load the slot into the register.
6528 SDValue Load =
6529 DAG.getLoad(VT: PtrVT, dl, Chain, Ptr: PtrOff, PtrInfo: MachinePointerInfo());
6530 MemOpChains.push_back(Elt: Load.getValue(R: 1));
6531 RegsToPass.push_back(Elt: std::make_pair(x: GPR[GPR_idx++], y&: Load));
6532
6533 // Done with this argument.
6534 ArgOffset += PtrByteSize;
6535 continue;
6536 }
6537
6538 // For aggregates larger than PtrByteSize, copy the pieces of the
6539 // object that fit into registers from the parameter save area.
6540 for (unsigned j=0; j<Size; j+=PtrByteSize) {
6541 SDValue Const = DAG.getConstant(Val: j, DL: dl, VT: PtrOff.getValueType());
6542 SDValue AddArg = DAG.getNode(Opcode: ISD::ADD, DL: dl, VT: PtrVT, N1: Arg, N2: Const);
6543 if (GPR_idx != NumGPRs) {
6544 unsigned LoadSizeInBits = std::min(a: PtrByteSize, b: (Size - j)) * 8;
6545 EVT ObjType = EVT::getIntegerVT(Context&: *DAG.getContext(), BitWidth: LoadSizeInBits);
6546 SDValue Load = DAG.getExtLoad(ExtType: ISD::EXTLOAD, dl, VT: PtrVT, Chain, Ptr: AddArg,
6547 PtrInfo: MachinePointerInfo(), MemVT: ObjType);
6548
6549 MemOpChains.push_back(Elt: Load.getValue(R: 1));
6550 RegsToPass.push_back(Elt: std::make_pair(x: GPR[GPR_idx++], y&: Load));
6551 ArgOffset += PtrByteSize;
6552 } else {
6553 ArgOffset += ((Size - j + PtrByteSize-1)/PtrByteSize)*PtrByteSize;
6554 break;
6555 }
6556 }
6557 continue;
6558 }
6559
6560 switch (Arg.getSimpleValueType().SimpleTy) {
6561 default: llvm_unreachable("Unexpected ValueType for argument!");
6562 case MVT::i1:
6563 case MVT::i32:
6564 case MVT::i64:
6565 if (Flags.isNest()) {
6566 // The 'nest' parameter, if any, is passed in R11.
6567 RegsToPass.push_back(Elt: std::make_pair(x: PPC::X11, y&: Arg));
6568 break;
6569 }
6570
6571 // These can be scalar arguments or elements of an integer array type
6572 // passed directly. Clang may use those instead of "byval" aggregate
6573 // types to avoid forcing arguments to memory unnecessarily.
6574 if (GPR_idx != NumGPRs) {
6575 RegsToPass.push_back(Elt: std::make_pair(x: GPR[GPR_idx++], y&: Arg));
6576 } else {
6577 if (IsFastCall)
6578 ComputePtrOff();
6579
6580 assert(HasParameterArea &&
6581 "Parameter area must exist to pass an argument in memory.");
6582 LowerMemOpCallTo(DAG, MF, Chain, Arg, PtrOff, SPDiff, ArgOffset,
6583 isPPC64: true, isTailCall: CFlags.IsTailCall, isVector: false, MemOpChains,
6584 TailCallArguments, dl);
6585 if (IsFastCall)
6586 ArgOffset += PtrByteSize;
6587 }
6588 if (!IsFastCall)
6589 ArgOffset += PtrByteSize;
6590 break;
6591 case MVT::f32:
6592 case MVT::f64: {
6593 // These can be scalar arguments or elements of a float array type
6594 // passed directly. The latter are used to implement ELFv2 homogenous
6595 // float aggregates.
6596
6597 // Named arguments go into FPRs first, and once they overflow, the
6598 // remaining arguments go into GPRs and then the parameter save area.
6599 // Unnamed arguments for vararg functions always go to GPRs and
6600 // then the parameter save area. For now, put all arguments to vararg
6601 // routines always in both locations (FPR *and* GPR or stack slot).
6602 bool NeedGPROrStack = CFlags.IsVarArg || FPR_idx == NumFPRs;
6603 bool NeededLoad = false;
6604
6605 // First load the argument into the next available FPR.
6606 if (FPR_idx != NumFPRs)
6607 RegsToPass.push_back(Elt: std::make_pair(x: FPR[FPR_idx++], y&: Arg));
6608
6609 // Next, load the argument into GPR or stack slot if needed.
6610 if (!NeedGPROrStack)
6611 ;
6612 else if (GPR_idx != NumGPRs && !IsFastCall) {
6613 // FIXME: We may want to re-enable this for CallingConv::Fast on the P8
6614 // once we support fp <-> gpr moves.
6615
6616 // In the non-vararg case, this can only ever happen in the
6617 // presence of f32 array types, since otherwise we never run
6618 // out of FPRs before running out of GPRs.
6619 SDValue ArgVal;
6620
6621 // Double values are always passed in a single GPR.
6622 if (Arg.getValueType() != MVT::f32) {
6623 ArgVal = DAG.getNode(Opcode: ISD::BITCAST, DL: dl, VT: MVT::i64, Operand: Arg);
6624
6625 // Non-array float values are extended and passed in a GPR.
6626 } else if (!Flags.isInConsecutiveRegs()) {
6627 ArgVal = DAG.getNode(Opcode: ISD::BITCAST, DL: dl, VT: MVT::i32, Operand: Arg);
6628 ArgVal = DAG.getNode(Opcode: ISD::ANY_EXTEND, DL: dl, VT: MVT::i64, Operand: ArgVal);
6629
6630 // If we have an array of floats, we collect every odd element
6631 // together with its predecessor into one GPR.
6632 } else if (ArgOffset % PtrByteSize != 0) {
6633 SDValue Lo, Hi;
6634 Lo = DAG.getNode(Opcode: ISD::BITCAST, DL: dl, VT: MVT::i32, Operand: OutVals[i - 1]);
6635 Hi = DAG.getNode(Opcode: ISD::BITCAST, DL: dl, VT: MVT::i32, Operand: Arg);
6636 if (!isLittleEndian)
6637 std::swap(a&: Lo, b&: Hi);
6638 ArgVal = DAG.getNode(Opcode: ISD::BUILD_PAIR, DL: dl, VT: MVT::i64, N1: Lo, N2: Hi);
6639
6640 // The final element, if even, goes into the first half of a GPR.
6641 } else if (Flags.isInConsecutiveRegsLast()) {
6642 ArgVal = DAG.getNode(Opcode: ISD::BITCAST, DL: dl, VT: MVT::i32, Operand: Arg);
6643 ArgVal = DAG.getNode(Opcode: ISD::ANY_EXTEND, DL: dl, VT: MVT::i64, Operand: ArgVal);
6644 if (!isLittleEndian)
6645 ArgVal = DAG.getNode(Opcode: ISD::SHL, DL: dl, VT: MVT::i64, N1: ArgVal,
6646 N2: DAG.getConstant(Val: 32, DL: dl, VT: MVT::i32));
6647
6648 // Non-final even elements are skipped; they will be handled
6649 // together the with subsequent argument on the next go-around.
6650 } else
6651 ArgVal = SDValue();
6652
6653 if (ArgVal.getNode())
6654 RegsToPass.push_back(Elt: std::make_pair(x: GPR[GPR_idx++], y&: ArgVal));
6655 } else {
6656 if (IsFastCall)
6657 ComputePtrOff();
6658
6659 // Single-precision floating-point values are mapped to the
6660 // second (rightmost) word of the stack doubleword.
6661 if (Arg.getValueType() == MVT::f32 &&
6662 !isLittleEndian && !Flags.isInConsecutiveRegs()) {
6663 SDValue ConstFour = DAG.getConstant(Val: 4, DL: dl, VT: PtrOff.getValueType());
6664 PtrOff = DAG.getNode(Opcode: ISD::ADD, DL: dl, VT: PtrVT, N1: PtrOff, N2: ConstFour);
6665 }
6666
6667 assert(HasParameterArea &&
6668 "Parameter area must exist to pass an argument in memory.");
6669 LowerMemOpCallTo(DAG, MF, Chain, Arg, PtrOff, SPDiff, ArgOffset,
6670 isPPC64: true, isTailCall: CFlags.IsTailCall, isVector: false, MemOpChains,
6671 TailCallArguments, dl);
6672
6673 NeededLoad = true;
6674 }
6675 // When passing an array of floats, the array occupies consecutive
6676 // space in the argument area; only round up to the next doubleword
6677 // at the end of the array. Otherwise, each float takes 8 bytes.
6678 if (!IsFastCall || NeededLoad) {
6679 ArgOffset += (Arg.getValueType() == MVT::f32 &&
6680 Flags.isInConsecutiveRegs()) ? 4 : 8;
6681 if (Flags.isInConsecutiveRegsLast())
6682 ArgOffset = ((ArgOffset + PtrByteSize - 1)/PtrByteSize) * PtrByteSize;
6683 }
6684 break;
6685 }
6686 case MVT::v4f32:
6687 case MVT::v4i32:
6688 case MVT::v8i16:
6689 case MVT::v16i8:
6690 case MVT::v2f64:
6691 case MVT::v2i64:
6692 case MVT::v1i128:
6693 case MVT::f128:
6694 // These can be scalar arguments or elements of a vector array type
6695 // passed directly. The latter are used to implement ELFv2 homogenous
6696 // vector aggregates.
6697
6698 // For a varargs call, named arguments go into VRs or on the stack as
6699 // usual; unnamed arguments always go to the stack or the corresponding
6700 // GPRs when within range. For now, we always put the value in both
6701 // locations (or even all three).
6702 if (CFlags.IsVarArg) {
6703 assert(HasParameterArea &&
6704 "Parameter area must exist if we have a varargs call.");
6705 // We could elide this store in the case where the object fits
6706 // entirely in R registers. Maybe later.
6707 SDValue Store =
6708 DAG.getStore(Chain, dl, Val: Arg, Ptr: PtrOff, PtrInfo: MachinePointerInfo());
6709 MemOpChains.push_back(Elt: Store);
6710 if (VR_idx != NumVRs) {
6711 SDValue Load =
6712 DAG.getLoad(VT: MVT::v4f32, dl, Chain: Store, Ptr: PtrOff, PtrInfo: MachinePointerInfo());
6713 MemOpChains.push_back(Elt: Load.getValue(R: 1));
6714 RegsToPass.push_back(Elt: std::make_pair(x: VR[VR_idx++], y&: Load));
6715 }
6716 ArgOffset += 16;
6717 for (unsigned i=0; i<16; i+=PtrByteSize) {
6718 if (GPR_idx == NumGPRs)
6719 break;
6720 SDValue Ix = DAG.getNode(Opcode: ISD::ADD, DL: dl, VT: PtrVT, N1: PtrOff,
6721 N2: DAG.getConstant(Val: i, DL: dl, VT: PtrVT));
6722 SDValue Load =
6723 DAG.getLoad(VT: PtrVT, dl, Chain: Store, Ptr: Ix, PtrInfo: MachinePointerInfo());
6724 MemOpChains.push_back(Elt: Load.getValue(R: 1));
6725 RegsToPass.push_back(Elt: std::make_pair(x: GPR[GPR_idx++], y&: Load));
6726 }
6727 break;
6728 }
6729
6730 // Non-varargs Altivec params go into VRs or on the stack.
6731 if (VR_idx != NumVRs) {
6732 RegsToPass.push_back(Elt: std::make_pair(x: VR[VR_idx++], y&: Arg));
6733 } else {
6734 if (IsFastCall)
6735 ComputePtrOff();
6736
6737 assert(HasParameterArea &&
6738 "Parameter area must exist to pass an argument in memory.");
6739 LowerMemOpCallTo(DAG, MF, Chain, Arg, PtrOff, SPDiff, ArgOffset,
6740 isPPC64: true, isTailCall: CFlags.IsTailCall, isVector: true, MemOpChains,
6741 TailCallArguments, dl);
6742 if (IsFastCall)
6743 ArgOffset += 16;
6744 }
6745
6746 if (!IsFastCall)
6747 ArgOffset += 16;
6748 break;
6749 }
6750 }
6751
6752 assert((!HasParameterArea || NumBytesActuallyUsed == ArgOffset) &&
6753 "mismatch in size of parameter area");
6754 (void)NumBytesActuallyUsed;
6755
6756 if (!MemOpChains.empty())
6757 Chain = DAG.getNode(Opcode: ISD::TokenFactor, DL: dl, VT: MVT::Other, Ops: MemOpChains);
6758
6759 // Check if this is an indirect call (MTCTR/BCTRL).
6760 // See prepareDescriptorIndirectCall and buildCallOperands for more
6761 // information about calls through function pointers in the 64-bit SVR4 ABI.
6762 if (CFlags.IsIndirect) {
6763 // For 64-bit ELFv2 ABI with PCRel, do not save the TOC of the
6764 // caller in the TOC save area.
6765 if (isTOCSaveRestoreRequired(Subtarget)) {
6766 assert(!CFlags.IsTailCall && "Indirect tails calls not supported");
6767 // Load r2 into a virtual register and store it to the TOC save area.
6768 setUsesTOCBasePtr(DAG);
6769 SDValue Val = DAG.getCopyFromReg(Chain, dl, Reg: PPC::X2, VT: MVT::i64);
6770 // TOC save area offset.
6771 unsigned TOCSaveOffset = Subtarget.getFrameLowering()->getTOCSaveOffset();
6772 SDValue PtrOff = DAG.getIntPtrConstant(Val: TOCSaveOffset, DL: dl);
6773 SDValue AddPtr = DAG.getNode(Opcode: ISD::ADD, DL: dl, VT: PtrVT, N1: StackPtr, N2: PtrOff);
6774 Chain = DAG.getStore(Chain: Val.getValue(R: 1), dl, Val, Ptr: AddPtr,
6775 PtrInfo: MachinePointerInfo::getStack(
6776 MF&: DAG.getMachineFunction(), Offset: TOCSaveOffset));
6777 }
6778 // In the ELFv2 ABI, R12 must contain the address of an indirect callee.
6779 // This does not mean the MTCTR instruction must use R12; it's easier
6780 // to model this as an extra parameter, so do that.
6781 if (isELFv2ABI && !CFlags.IsPatchPoint)
6782 RegsToPass.push_back(Elt: std::make_pair(x: (unsigned)PPC::X12, y&: Callee));
6783 }
6784
6785 // Build a sequence of copy-to-reg nodes chained together with token chain
6786 // and flag operands which copy the outgoing args into the appropriate regs.
6787 SDValue InGlue;
6788 for (const auto &[Reg, N] : RegsToPass) {
6789 Chain = DAG.getCopyToReg(Chain, dl, Reg, N, Glue: InGlue);
6790 InGlue = Chain.getValue(R: 1);
6791 }
6792
6793 if (CFlags.IsTailCall && !IsSibCall)
6794 PrepareTailCall(DAG, InGlue, Chain, dl, SPDiff, NumBytes, LROp, FPOp,
6795 TailCallArguments);
6796
6797 return FinishCall(CFlags, dl, DAG, RegsToPass, Glue: InGlue, Chain, CallSeqStart,
6798 Callee, SPDiff, NumBytes, Ins, InVals, CB);
6799}
6800
6801// Returns true when the shadow of a general purpose argument register
6802// in the parameter save area is aligned to at least 'RequiredAlign'.
6803static bool isGPRShadowAligned(MCPhysReg Reg, Align RequiredAlign) {
6804 assert(RequiredAlign.value() <= 16 &&
6805 "Required alignment greater than stack alignment.");
6806 switch (Reg) {
6807 default:
6808 report_fatal_error(reason: "called on invalid register.");
6809 case PPC::R5:
6810 case PPC::R9:
6811 case PPC::X3:
6812 case PPC::X5:
6813 case PPC::X7:
6814 case PPC::X9:
6815 // These registers are 16 byte aligned which is the most strict aligment
6816 // we can support.
6817 return true;
6818 case PPC::R3:
6819 case PPC::R7:
6820 case PPC::X4:
6821 case PPC::X6:
6822 case PPC::X8:
6823 case PPC::X10:
6824 // The shadow of these registers in the PSA is 8 byte aligned.
6825 return RequiredAlign <= 8;
6826 case PPC::R4:
6827 case PPC::R6:
6828 case PPC::R8:
6829 case PPC::R10:
6830 return RequiredAlign <= 4;
6831 }
6832}
6833
6834static bool CC_AIX(unsigned ValNo, MVT ValVT, MVT LocVT,
6835 CCValAssign::LocInfo LocInfo, ISD::ArgFlagsTy ArgFlags,
6836 Type *OrigTy, CCState &State) {
6837 const PPCSubtarget &Subtarget = static_cast<const PPCSubtarget &>(
6838 State.getMachineFunction().getSubtarget());
6839 const bool IsPPC64 = Subtarget.isPPC64();
6840 const unsigned PtrSize = IsPPC64 ? 8 : 4;
6841 const Align PtrAlign(PtrSize);
6842 const Align StackAlign(16);
6843 const MVT RegVT = Subtarget.getScalarIntVT();
6844
6845 if (ValVT == MVT::f128)
6846 report_fatal_error(reason: "f128 is unimplemented on AIX.");
6847
6848 static const MCPhysReg GPR_32[] = {// 32-bit registers.
6849 PPC::R3, PPC::R4, PPC::R5, PPC::R6,
6850 PPC::R7, PPC::R8, PPC::R9, PPC::R10};
6851 static const MCPhysReg GPR_64[] = {// 64-bit registers.
6852 PPC::X3, PPC::X4, PPC::X5, PPC::X6,
6853 PPC::X7, PPC::X8, PPC::X9, PPC::X10};
6854
6855 static const MCPhysReg VR[] = {// Vector registers.
6856 PPC::V2, PPC::V3, PPC::V4, PPC::V5,
6857 PPC::V6, PPC::V7, PPC::V8, PPC::V9,
6858 PPC::V10, PPC::V11, PPC::V12, PPC::V13};
6859
6860 const ArrayRef<MCPhysReg> GPRs = IsPPC64 ? GPR_64 : GPR_32;
6861
6862 if (ArgFlags.isNest()) {
6863 MCRegister EnvReg = State.AllocateReg(Reg: IsPPC64 ? PPC::X11 : PPC::R11);
6864 if (!EnvReg)
6865 report_fatal_error(reason: "More then one nest argument.");
6866 State.addLoc(V: CCValAssign::getReg(ValNo, ValVT, Reg: EnvReg, LocVT: RegVT, HTP: LocInfo));
6867 return false;
6868 }
6869
6870 if (ArgFlags.isByVal()) {
6871 const Align ByValAlign(ArgFlags.getNonZeroByValAlign());
6872 if (ByValAlign > StackAlign)
6873 report_fatal_error(reason: "Pass-by-value arguments with alignment greater than "
6874 "16 are not supported.");
6875
6876 const unsigned ByValSize = ArgFlags.getByValSize();
6877 const Align ObjAlign = ByValAlign > PtrAlign ? ByValAlign : PtrAlign;
6878
6879 // An empty aggregate parameter takes up no storage and no registers,
6880 // but needs a MemLoc for a stack slot for the formal arguments side.
6881 if (ByValSize == 0) {
6882 State.addLoc(V: CCValAssign::getMem(ValNo, ValVT: MVT::INVALID_SIMPLE_VALUE_TYPE,
6883 Offset: State.getStackSize(), LocVT: RegVT, HTP: LocInfo));
6884 return false;
6885 }
6886
6887 // Shadow allocate any registers that are not properly aligned.
6888 unsigned NextReg = State.getFirstUnallocated(Regs: GPRs);
6889 while (NextReg != GPRs.size() &&
6890 !isGPRShadowAligned(Reg: GPRs[NextReg], RequiredAlign: ObjAlign)) {
6891 // Shadow allocate next registers since its aligment is not strict enough.
6892 MCRegister Reg = State.AllocateReg(Regs: GPRs);
6893 // Allocate the stack space shadowed by said register.
6894 State.AllocateStack(Size: PtrSize, Alignment: PtrAlign);
6895 assert(Reg && "Alocating register unexpectedly failed.");
6896 (void)Reg;
6897 NextReg = State.getFirstUnallocated(Regs: GPRs);
6898 }
6899
6900 const unsigned StackSize = alignTo(Size: ByValSize, A: ObjAlign);
6901 unsigned Offset = State.AllocateStack(Size: StackSize, Alignment: ObjAlign);
6902 for (const unsigned E = Offset + StackSize; Offset < E; Offset += PtrSize) {
6903 if (MCRegister Reg = State.AllocateReg(Regs: GPRs))
6904 State.addLoc(V: CCValAssign::getReg(ValNo, ValVT, Reg, LocVT: RegVT, HTP: LocInfo));
6905 else {
6906 State.addLoc(V: CCValAssign::getMem(ValNo, ValVT: MVT::INVALID_SIMPLE_VALUE_TYPE,
6907 Offset, LocVT: MVT::INVALID_SIMPLE_VALUE_TYPE,
6908 HTP: LocInfo));
6909 break;
6910 }
6911 }
6912 return false;
6913 }
6914
6915 // Arguments always reserve parameter save area.
6916 switch (ValVT.SimpleTy) {
6917 default:
6918 report_fatal_error(reason: "Unhandled value type for argument.");
6919 case MVT::i64:
6920 // i64 arguments should have been split to i32 for PPC32.
6921 assert(IsPPC64 && "PPC32 should have split i64 values.");
6922 [[fallthrough]];
6923 case MVT::i1:
6924 case MVT::i32: {
6925 const unsigned Offset = State.AllocateStack(Size: PtrSize, Alignment: PtrAlign);
6926 // AIX integer arguments are always passed in register width.
6927 if (ValVT.getFixedSizeInBits() < RegVT.getFixedSizeInBits())
6928 LocInfo = ArgFlags.isSExt() ? CCValAssign::LocInfo::SExt
6929 : CCValAssign::LocInfo::ZExt;
6930 if (MCRegister Reg = State.AllocateReg(Regs: GPRs))
6931 State.addLoc(V: CCValAssign::getReg(ValNo, ValVT, Reg, LocVT: RegVT, HTP: LocInfo));
6932 else
6933 State.addLoc(V: CCValAssign::getMem(ValNo, ValVT, Offset, LocVT: RegVT, HTP: LocInfo));
6934
6935 return false;
6936 }
6937 case MVT::f32:
6938 case MVT::f64: {
6939 // Parameter save area (PSA) is reserved even if the float passes in fpr.
6940 const unsigned StoreSize = LocVT.getStoreSize();
6941 // Floats are always 4-byte aligned in the PSA on AIX.
6942 // This includes f64 in 64-bit mode for ABI compatibility.
6943 const unsigned Offset =
6944 State.AllocateStack(Size: IsPPC64 ? 8 : StoreSize, Alignment: Align(4));
6945 MCRegister FReg = State.AllocateReg(Regs: FPR);
6946 if (FReg)
6947 State.addLoc(V: CCValAssign::getReg(ValNo, ValVT, Reg: FReg, LocVT, HTP: LocInfo));
6948
6949 // Reserve and initialize GPRs or initialize the PSA as required.
6950 for (unsigned I = 0; I < StoreSize; I += PtrSize) {
6951 if (MCRegister Reg = State.AllocateReg(Regs: GPRs)) {
6952 assert(FReg && "An FPR should be available when a GPR is reserved.");
6953 if (State.isVarArg()) {
6954 // Successfully reserved GPRs are only initialized for vararg calls.
6955 // Custom handling is required for:
6956 // f64 in PPC32 needs to be split into 2 GPRs.
6957 // f32 in PPC64 needs to occupy only lower 32 bits of 64-bit GPR.
6958 State.addLoc(
6959 V: CCValAssign::getCustomReg(ValNo, ValVT, Reg, LocVT: RegVT, HTP: LocInfo));
6960 }
6961 } else {
6962 // If there are insufficient GPRs, the PSA needs to be initialized.
6963 // Initialization occurs even if an FPR was initialized for
6964 // compatibility with the AIX XL compiler. The full memory for the
6965 // argument will be initialized even if a prior word is saved in GPR.
6966 // A custom memLoc is used when the argument also passes in FPR so
6967 // that the callee handling can skip over it easily.
6968 State.addLoc(
6969 V: FReg ? CCValAssign::getCustomMem(ValNo, ValVT, Offset, LocVT,
6970 HTP: LocInfo)
6971 : CCValAssign::getMem(ValNo, ValVT, Offset, LocVT, HTP: LocInfo));
6972 break;
6973 }
6974 }
6975
6976 return false;
6977 }
6978 case MVT::v4f32:
6979 case MVT::v4i32:
6980 case MVT::v8i16:
6981 case MVT::v16i8:
6982 case MVT::v2i64:
6983 case MVT::v2f64:
6984 case MVT::v1i128: {
6985 const unsigned VecSize = 16;
6986 const Align VecAlign(VecSize);
6987
6988 if (!State.isVarArg()) {
6989 // If there are vector registers remaining we don't consume any stack
6990 // space.
6991 if (MCRegister VReg = State.AllocateReg(Regs: VR)) {
6992 State.addLoc(V: CCValAssign::getReg(ValNo, ValVT, Reg: VReg, LocVT, HTP: LocInfo));
6993 return false;
6994 }
6995 // Vectors passed on the stack do not shadow GPRs or FPRs even though they
6996 // might be allocated in the portion of the PSA that is shadowed by the
6997 // GPRs.
6998 const unsigned Offset = State.AllocateStack(Size: VecSize, Alignment: VecAlign);
6999 State.addLoc(V: CCValAssign::getMem(ValNo, ValVT, Offset, LocVT, HTP: LocInfo));
7000 return false;
7001 }
7002
7003 unsigned NextRegIndex = State.getFirstUnallocated(Regs: GPRs);
7004 // Burn any underaligned registers and their shadowed stack space until
7005 // we reach the required alignment.
7006 while (NextRegIndex != GPRs.size() &&
7007 !isGPRShadowAligned(Reg: GPRs[NextRegIndex], RequiredAlign: VecAlign)) {
7008 // Shadow allocate register and its stack shadow.
7009 MCRegister Reg = State.AllocateReg(Regs: GPRs);
7010 State.AllocateStack(Size: PtrSize, Alignment: PtrAlign);
7011 assert(Reg && "Allocating register unexpectedly failed.");
7012 (void)Reg;
7013 NextRegIndex = State.getFirstUnallocated(Regs: GPRs);
7014 }
7015
7016 // Vectors that are passed as fixed arguments are handled differently.
7017 // They are passed in VRs if any are available (unlike arguments passed
7018 // through ellipses) and shadow GPRs (unlike arguments to non-vaarg
7019 // functions)
7020 if (!ArgFlags.isVarArg()) {
7021 if (MCRegister VReg = State.AllocateReg(Regs: VR)) {
7022 State.addLoc(V: CCValAssign::getReg(ValNo, ValVT, Reg: VReg, LocVT, HTP: LocInfo));
7023 // Shadow allocate GPRs and stack space even though we pass in a VR.
7024 for (unsigned I = 0; I != VecSize; I += PtrSize)
7025 State.AllocateReg(Regs: GPRs);
7026 State.AllocateStack(Size: VecSize, Alignment: VecAlign);
7027 return false;
7028 }
7029 // No vector registers remain so pass on the stack.
7030 const unsigned Offset = State.AllocateStack(Size: VecSize, Alignment: VecAlign);
7031 State.addLoc(V: CCValAssign::getMem(ValNo, ValVT, Offset, LocVT, HTP: LocInfo));
7032 return false;
7033 }
7034
7035 // If all GPRS are consumed then we pass the argument fully on the stack.
7036 if (NextRegIndex == GPRs.size()) {
7037 const unsigned Offset = State.AllocateStack(Size: VecSize, Alignment: VecAlign);
7038 State.addLoc(V: CCValAssign::getMem(ValNo, ValVT, Offset, LocVT, HTP: LocInfo));
7039 return false;
7040 }
7041
7042 // Corner case for 32-bit codegen. We have 2 registers to pass the first
7043 // half of the argument, and then need to pass the remaining half on the
7044 // stack.
7045 if (GPRs[NextRegIndex] == PPC::R9) {
7046 const unsigned Offset = State.AllocateStack(Size: VecSize, Alignment: VecAlign);
7047 State.addLoc(
7048 V: CCValAssign::getCustomMem(ValNo, ValVT, Offset, LocVT, HTP: LocInfo));
7049
7050 const MCRegister FirstReg = State.AllocateReg(Reg: PPC::R9);
7051 const MCRegister SecondReg = State.AllocateReg(Reg: PPC::R10);
7052 assert(FirstReg && SecondReg &&
7053 "Allocating R9 or R10 unexpectedly failed.");
7054 State.addLoc(
7055 V: CCValAssign::getCustomReg(ValNo, ValVT, Reg: FirstReg, LocVT: RegVT, HTP: LocInfo));
7056 State.addLoc(
7057 V: CCValAssign::getCustomReg(ValNo, ValVT, Reg: SecondReg, LocVT: RegVT, HTP: LocInfo));
7058 return false;
7059 }
7060
7061 // We have enough GPRs to fully pass the vector argument, and we have
7062 // already consumed any underaligned registers. Start with the custom
7063 // MemLoc and then the custom RegLocs.
7064 const unsigned Offset = State.AllocateStack(Size: VecSize, Alignment: VecAlign);
7065 State.addLoc(
7066 V: CCValAssign::getCustomMem(ValNo, ValVT, Offset, LocVT, HTP: LocInfo));
7067 for (unsigned I = 0; I != VecSize; I += PtrSize) {
7068 const MCRegister Reg = State.AllocateReg(Regs: GPRs);
7069 assert(Reg && "Failed to allocated register for vararg vector argument");
7070 State.addLoc(
7071 V: CCValAssign::getCustomReg(ValNo, ValVT, Reg, LocVT: RegVT, HTP: LocInfo));
7072 }
7073 return false;
7074 }
7075 }
7076 return true;
7077}
7078
7079// So far, this function is only used by LowerFormalArguments_AIX()
7080static const TargetRegisterClass *getRegClassForSVT(MVT::SimpleValueType SVT,
7081 bool IsPPC64,
7082 bool HasP8Vector,
7083 bool HasVSX) {
7084 assert((IsPPC64 || SVT != MVT::i64) &&
7085 "i64 should have been split for 32-bit codegen.");
7086
7087 switch (SVT) {
7088 default:
7089 report_fatal_error(reason: "Unexpected value type for formal argument");
7090 case MVT::i1:
7091 case MVT::i32:
7092 case MVT::i64:
7093 return IsPPC64 ? &PPC::G8RCRegClass : &PPC::GPRCRegClass;
7094 case MVT::f32:
7095 return HasP8Vector ? &PPC::VSSRCRegClass : &PPC::F4RCRegClass;
7096 case MVT::f64:
7097 return HasVSX ? &PPC::VSFRCRegClass : &PPC::F8RCRegClass;
7098 case MVT::v4f32:
7099 case MVT::v4i32:
7100 case MVT::v8i16:
7101 case MVT::v16i8:
7102 case MVT::v2i64:
7103 case MVT::v2f64:
7104 case MVT::v1i128:
7105 return &PPC::VRRCRegClass;
7106 }
7107}
7108
7109static SDValue truncateScalarIntegerArg(ISD::ArgFlagsTy Flags, EVT ValVT,
7110 SelectionDAG &DAG, SDValue ArgValue,
7111 MVT LocVT, const SDLoc &dl) {
7112 assert(ValVT.isScalarInteger() && LocVT.isScalarInteger());
7113 assert(ValVT.getFixedSizeInBits() < LocVT.getFixedSizeInBits());
7114
7115 if (Flags.isSExt())
7116 ArgValue = DAG.getNode(Opcode: ISD::AssertSext, DL: dl, VT: LocVT, N1: ArgValue,
7117 N2: DAG.getValueType(ValVT));
7118 else if (Flags.isZExt())
7119 ArgValue = DAG.getNode(Opcode: ISD::AssertZext, DL: dl, VT: LocVT, N1: ArgValue,
7120 N2: DAG.getValueType(ValVT));
7121
7122 return DAG.getNode(Opcode: ISD::TRUNCATE, DL: dl, VT: ValVT, Operand: ArgValue);
7123}
7124
7125static unsigned mapArgRegToOffsetAIX(unsigned Reg, const PPCFrameLowering *FL) {
7126 const unsigned LASize = FL->getLinkageSize();
7127
7128 if (PPC::GPRCRegClass.contains(Reg)) {
7129 assert(Reg >= PPC::R3 && Reg <= PPC::R10 &&
7130 "Reg must be a valid argument register!");
7131 return LASize + 4 * (Reg - PPC::R3);
7132 }
7133
7134 if (PPC::G8RCRegClass.contains(Reg)) {
7135 assert(Reg >= PPC::X3 && Reg <= PPC::X10 &&
7136 "Reg must be a valid argument register!");
7137 return LASize + 8 * (Reg - PPC::X3);
7138 }
7139
7140 llvm_unreachable("Only general purpose registers expected.");
7141}
7142
// AIX ABI Stack Frame Layout:
//
// Low Memory  +--------------------------------------------+
// SP   +----> | Back chain                                 | ---+
//      |      +--------------------------------------------+    |
//      |      | Saved Condition Register                   |    |
//      |      +--------------------------------------------+    |
//      |      | Saved Linkage Register                     |    |
//      |      +--------------------------------------------+    | Linkage Area
//      |      | Reserved for compilers                     |    |
//      |      +--------------------------------------------+    |
//      |      | Reserved for binders                       |    |
//      |      +--------------------------------------------+    |
//      |      | Saved TOC pointer                          | ---+
//      |      +--------------------------------------------+
//      |      | Parameter save area                        |
//      |      +--------------------------------------------+
//      |      | Alloca space                               |
//      |      +--------------------------------------------+
//      |      | Local variable space                       |
//      |      +--------------------------------------------+
//      |      | Float/int conversion temporary             |
//      |      +--------------------------------------------+
//      |      | Save area for AltiVec registers            |
//      |      +--------------------------------------------+
//      |      | AltiVec alignment padding                  |
//      |      +--------------------------------------------+
//      |      | Save area for VRSAVE register              |
//      |      +--------------------------------------------+
//      |      | Save area for General Purpose registers    |
//      |      +--------------------------------------------+
//      |      | Save area for Floating Point registers     |
//      |      +--------------------------------------------+
//      +----- | Back chain                                 |
// High Memory +--------------------------------------------+
//
//  Specifications:
//    AIX 7.2 Assembler Language Reference
//    Subroutine linkage convention
7182
7183SDValue PPCTargetLowering::LowerFormalArguments_AIX(
7184 SDValue Chain, CallingConv::ID CallConv, bool isVarArg,
7185 const SmallVectorImpl<ISD::InputArg> &Ins, const SDLoc &dl,
7186 SelectionDAG &DAG, SmallVectorImpl<SDValue> &InVals) const {
7187
7188 assert((CallConv == CallingConv::C || CallConv == CallingConv::Cold ||
7189 CallConv == CallingConv::Fast) &&
7190 "Unexpected calling convention!");
7191
7192 if (getTargetMachine().Options.GuaranteedTailCallOpt)
7193 report_fatal_error(reason: "Tail call support is unimplemented on AIX.");
7194
7195 if (useSoftFloat())
7196 report_fatal_error(reason: "Soft float support is unimplemented on AIX.");
7197
7198 const PPCSubtarget &Subtarget = DAG.getSubtarget<PPCSubtarget>();
7199
7200 const bool IsPPC64 = Subtarget.isPPC64();
7201 const unsigned PtrByteSize = IsPPC64 ? 8 : 4;
7202
7203 // Assign locations to all of the incoming arguments.
7204 SmallVector<CCValAssign, 16> ArgLocs;
7205 MachineFunction &MF = DAG.getMachineFunction();
7206 MachineFrameInfo &MFI = MF.getFrameInfo();
7207 PPCFunctionInfo *FuncInfo = MF.getInfo<PPCFunctionInfo>();
7208 CCState CCInfo(CallConv, isVarArg, MF, ArgLocs, *DAG.getContext());
7209
7210 const EVT PtrVT = getPointerTy(DL: MF.getDataLayout());
7211 // Reserve space for the linkage area on the stack.
7212 const unsigned LinkageSize = Subtarget.getFrameLowering()->getLinkageSize();
7213 CCInfo.AllocateStack(Size: LinkageSize, Alignment: Align(PtrByteSize));
7214 uint64_t SaveStackPos = CCInfo.getStackSize();
7215 bool SaveParams = MF.getFunction().hasFnAttribute(Kind: "save-reg-params");
7216 CCInfo.AnalyzeFormalArguments(Ins, Fn: CC_AIX);
7217
7218 SmallVector<SDValue, 8> MemOps;
7219
7220 for (size_t I = 0, End = ArgLocs.size(); I != End; /* No increment here */) {
7221 CCValAssign &VA = ArgLocs[I++];
7222 MVT LocVT = VA.getLocVT();
7223 MVT ValVT = VA.getValVT();
7224 ISD::ArgFlagsTy Flags = Ins[VA.getValNo()].Flags;
7225
7226 EVT ArgVT = Ins[VA.getValNo()].ArgVT;
7227 bool ArgSignExt = Ins[VA.getValNo()].Flags.isSExt();
7228 // For compatibility with the AIX XL compiler, the float args in the
7229 // parameter save area are initialized even if the argument is available
7230 // in register. The caller is required to initialize both the register
7231 // and memory, however, the callee can choose to expect it in either.
7232 // The memloc is dismissed here because the argument is retrieved from
7233 // the register.
7234 if (VA.isMemLoc() && VA.needsCustom() && ValVT.isFloatingPoint())
7235 continue;
7236
7237 if (SaveParams && VA.isRegLoc() && !Flags.isByVal() && !VA.needsCustom()) {
7238 const TargetRegisterClass *RegClass = getRegClassForSVT(
7239 SVT: LocVT.SimpleTy, IsPPC64, HasP8Vector: Subtarget.hasP8Vector(), HasVSX: Subtarget.hasVSX());
7240 // On PPC64, debugger assumes extended 8-byte values are stored from GPR.
7241 MVT SaveVT = RegClass == &PPC::G8RCRegClass ? MVT::i64 : LocVT;
7242 const Register VReg = MF.addLiveIn(PReg: VA.getLocReg(), RC: RegClass);
7243 SDValue Parm = DAG.getCopyFromReg(Chain, dl, Reg: VReg, VT: SaveVT);
7244 int FI = MFI.CreateFixedObject(Size: SaveVT.getStoreSize(), SPOffset: SaveStackPos, IsImmutable: true);
7245 SDValue FIN = DAG.getFrameIndex(FI, VT: PtrVT);
7246 SDValue StoreReg = DAG.getStore(Chain, dl, Val: Parm, Ptr: FIN,
7247 PtrInfo: MachinePointerInfo(), Alignment: Align(PtrByteSize));
7248 SaveStackPos = alignTo(Value: SaveStackPos + SaveVT.getStoreSize(), Align: PtrByteSize);
7249 MemOps.push_back(Elt: StoreReg);
7250 }
7251
7252 if (SaveParams && (VA.isMemLoc() || Flags.isByVal()) && !VA.needsCustom()) {
7253 unsigned StoreSize =
7254 Flags.isByVal() ? Flags.getByValSize() : LocVT.getStoreSize();
7255 SaveStackPos = alignTo(Value: SaveStackPos + StoreSize, Align: PtrByteSize);
7256 }
7257
7258 auto HandleMemLoc = [&]() {
7259 const unsigned LocSize = LocVT.getStoreSize();
7260 const unsigned ValSize = ValVT.getStoreSize();
7261 assert((ValSize <= LocSize) &&
7262 "Object size is larger than size of MemLoc");
7263 int CurArgOffset = VA.getLocMemOffset();
7264 // Objects are right-justified because AIX is big-endian.
7265 if (LocSize > ValSize)
7266 CurArgOffset += LocSize - ValSize;
7267 // Potential tail calls could cause overwriting of argument stack slots.
7268 const bool IsImmutable =
7269 !(getTargetMachine().Options.GuaranteedTailCallOpt &&
7270 (CallConv == CallingConv::Fast));
7271 int FI = MFI.CreateFixedObject(Size: ValSize, SPOffset: CurArgOffset, IsImmutable);
7272 SDValue FIN = DAG.getFrameIndex(FI, VT: PtrVT);
7273 SDValue ArgValue =
7274 DAG.getLoad(VT: ValVT, dl, Chain, Ptr: FIN, PtrInfo: MachinePointerInfo());
7275
7276 // While the ABI specifies the argument type is (sign or zero) extended
7277 // out to register width, not all code is compliant. We truncate and
7278 // re-extend to be more forgiving of these callers when the argument type
7279 // is smaller than register width.
7280 if (!ArgVT.isVector() && !ValVT.isVector() && ArgVT.isInteger() &&
7281 ValVT.isInteger() &&
7282 ArgVT.getScalarSizeInBits() < ValVT.getScalarSizeInBits()) {
7283 // It is possible to have either real integer values
7284 // or integers that were not originally integers.
7285 // In the latter case, these could have came from structs,
7286 // and these integers would not have an extend on the parameter.
7287 // Since these types of integers do not have an extend specified
7288 // in the first place, the type of extend that we do should not matter.
7289 EVT TruncatedArgVT = ArgVT.isSimple() && ArgVT.getSimpleVT() == MVT::i1
7290 ? MVT::i8
7291 : ArgVT;
7292 SDValue ArgValueTrunc =
7293 DAG.getNode(Opcode: ISD::TRUNCATE, DL: dl, VT: TruncatedArgVT, Operand: ArgValue);
7294 SDValue ArgValueExt =
7295 ArgSignExt ? DAG.getSExtOrTrunc(Op: ArgValueTrunc, DL: dl, VT: ValVT)
7296 : DAG.getZExtOrTrunc(Op: ArgValueTrunc, DL: dl, VT: ValVT);
7297 InVals.push_back(Elt: ArgValueExt);
7298 } else {
7299 InVals.push_back(Elt: ArgValue);
7300 }
7301 };
7302
7303 // Vector arguments to VaArg functions are passed both on the stack, and
7304 // in any available GPRs. Load the value from the stack and add the GPRs
7305 // as live ins.
7306 if (VA.isMemLoc() && VA.needsCustom()) {
7307 assert(ValVT.isVector() && "Unexpected Custom MemLoc type.");
7308 assert(isVarArg && "Only use custom memloc for vararg.");
7309 // ValNo of the custom MemLoc, so we can compare it to the ValNo of the
7310 // matching custom RegLocs.
7311 const unsigned OriginalValNo = VA.getValNo();
7312 (void)OriginalValNo;
7313
7314 auto HandleCustomVecRegLoc = [&]() {
7315 assert(I != End && ArgLocs[I].isRegLoc() && ArgLocs[I].needsCustom() &&
7316 "Missing custom RegLoc.");
7317 VA = ArgLocs[I++];
7318 assert(VA.getValVT().isVector() &&
7319 "Unexpected Val type for custom RegLoc.");
7320 assert(VA.getValNo() == OriginalValNo &&
7321 "ValNo mismatch between custom MemLoc and RegLoc.");
7322 MVT::SimpleValueType SVT = VA.getLocVT().SimpleTy;
7323 MF.addLiveIn(PReg: VA.getLocReg(),
7324 RC: getRegClassForSVT(SVT, IsPPC64, HasP8Vector: Subtarget.hasP8Vector(),
7325 HasVSX: Subtarget.hasVSX()));
7326 };
7327
7328 HandleMemLoc();
7329 // In 64-bit there will be exactly 2 custom RegLocs that follow, and in
7330 // in 32-bit there will be 2 custom RegLocs if we are passing in R9 and
7331 // R10.
7332 HandleCustomVecRegLoc();
7333 HandleCustomVecRegLoc();
7334
7335 // If we are targeting 32-bit, there might be 2 extra custom RegLocs if
7336 // we passed the vector in R5, R6, R7 and R8.
7337 if (I != End && ArgLocs[I].isRegLoc() && ArgLocs[I].needsCustom()) {
7338 assert(!IsPPC64 &&
7339 "Only 2 custom RegLocs expected for 64-bit codegen.");
7340 HandleCustomVecRegLoc();
7341 HandleCustomVecRegLoc();
7342 }
7343
7344 continue;
7345 }
7346
7347 if (VA.isRegLoc()) {
7348 if (VA.getValVT().isScalarInteger())
7349 FuncInfo->appendParameterType(Type: PPCFunctionInfo::FixedType);
7350 else if (VA.getValVT().isFloatingPoint() && !VA.getValVT().isVector()) {
7351 switch (VA.getValVT().SimpleTy) {
7352 default:
7353 report_fatal_error(reason: "Unhandled value type for argument.");
7354 case MVT::f32:
7355 FuncInfo->appendParameterType(Type: PPCFunctionInfo::ShortFloatingPoint);
7356 break;
7357 case MVT::f64:
7358 FuncInfo->appendParameterType(Type: PPCFunctionInfo::LongFloatingPoint);
7359 break;
7360 }
7361 } else if (VA.getValVT().isVector()) {
7362 switch (VA.getValVT().SimpleTy) {
7363 default:
7364 report_fatal_error(reason: "Unhandled value type for argument.");
7365 case MVT::v16i8:
7366 FuncInfo->appendParameterType(Type: PPCFunctionInfo::VectorChar);
7367 break;
7368 case MVT::v8i16:
7369 FuncInfo->appendParameterType(Type: PPCFunctionInfo::VectorShort);
7370 break;
7371 case MVT::v4i32:
7372 case MVT::v2i64:
7373 case MVT::v1i128:
7374 FuncInfo->appendParameterType(Type: PPCFunctionInfo::VectorInt);
7375 break;
7376 case MVT::v4f32:
7377 case MVT::v2f64:
7378 FuncInfo->appendParameterType(Type: PPCFunctionInfo::VectorFloat);
7379 break;
7380 }
7381 }
7382 }
7383
7384 if (Flags.isByVal() && VA.isMemLoc()) {
7385 const unsigned Size =
7386 alignTo(Value: Flags.getByValSize() ? Flags.getByValSize() : PtrByteSize,
7387 Align: PtrByteSize);
7388 const int FI = MF.getFrameInfo().CreateFixedObject(
7389 Size, SPOffset: VA.getLocMemOffset(), /* IsImmutable */ false,
7390 /* IsAliased */ isAliased: true);
7391 SDValue FIN = DAG.getFrameIndex(FI, VT: PtrVT);
7392 InVals.push_back(Elt: FIN);
7393
7394 continue;
7395 }
7396
7397 if (Flags.isByVal()) {
7398 assert(VA.isRegLoc() && "MemLocs should already be handled.");
7399
7400 const MCPhysReg ArgReg = VA.getLocReg();
7401 const PPCFrameLowering *FL = Subtarget.getFrameLowering();
7402
7403 const unsigned StackSize = alignTo(Value: Flags.getByValSize(), Align: PtrByteSize);
7404 const int FI = MF.getFrameInfo().CreateFixedObject(
7405 Size: StackSize, SPOffset: mapArgRegToOffsetAIX(Reg: ArgReg, FL), /* IsImmutable */ false,
7406 /* IsAliased */ isAliased: true);
7407 SDValue FIN = DAG.getFrameIndex(FI, VT: PtrVT);
7408 InVals.push_back(Elt: FIN);
7409
7410 // Add live ins for all the RegLocs for the same ByVal.
7411 const TargetRegisterClass *RegClass =
7412 IsPPC64 ? &PPC::G8RCRegClass : &PPC::GPRCRegClass;
7413
7414 auto HandleRegLoc = [&, RegClass, LocVT](const MCPhysReg PhysReg,
7415 unsigned Offset) {
7416 const Register VReg = MF.addLiveIn(PReg: PhysReg, RC: RegClass);
7417 // Since the callers side has left justified the aggregate in the
7418 // register, we can simply store the entire register into the stack
7419 // slot.
7420 SDValue CopyFrom = DAG.getCopyFromReg(Chain, dl, Reg: VReg, VT: LocVT);
7421 // The store to the fixedstack object is needed becuase accessing a
7422 // field of the ByVal will use a gep and load. Ideally we will optimize
7423 // to extracting the value from the register directly, and elide the
7424 // stores when the arguments address is not taken, but that will need to
7425 // be future work.
7426 SDValue Store = DAG.getStore(
7427 Chain: CopyFrom.getValue(R: 1), dl, Val: CopyFrom,
7428 Ptr: DAG.getObjectPtrOffset(SL: dl, Ptr: FIN, Offset: TypeSize::getFixed(ExactSize: Offset)),
7429 PtrInfo: MachinePointerInfo::getFixedStack(MF, FI, Offset));
7430
7431 MemOps.push_back(Elt: Store);
7432 };
7433
7434 unsigned Offset = 0;
7435 HandleRegLoc(VA.getLocReg(), Offset);
7436 Offset += PtrByteSize;
7437 for (; Offset != StackSize && ArgLocs[I].isRegLoc();
7438 Offset += PtrByteSize) {
7439 assert(ArgLocs[I].getValNo() == VA.getValNo() &&
7440 "RegLocs should be for ByVal argument.");
7441
7442 const CCValAssign RL = ArgLocs[I++];
7443 HandleRegLoc(RL.getLocReg(), Offset);
7444 FuncInfo->appendParameterType(Type: PPCFunctionInfo::FixedType);
7445 }
7446
7447 if (Offset != StackSize) {
7448 assert(ArgLocs[I].getValNo() == VA.getValNo() &&
7449 "Expected MemLoc for remaining bytes.");
7450 assert(ArgLocs[I].isMemLoc() && "Expected MemLoc for remaining bytes.");
7451 // Consume the MemLoc.The InVal has already been emitted, so nothing
7452 // more needs to be done.
7453 ++I;
7454 }
7455
7456 continue;
7457 }
7458
7459 if (VA.isRegLoc() && !VA.needsCustom()) {
7460 MVT::SimpleValueType SVT = ValVT.SimpleTy;
7461 Register VReg =
7462 MF.addLiveIn(PReg: VA.getLocReg(),
7463 RC: getRegClassForSVT(SVT, IsPPC64, HasP8Vector: Subtarget.hasP8Vector(),
7464 HasVSX: Subtarget.hasVSX()));
7465 SDValue ArgValue = DAG.getCopyFromReg(Chain, dl, Reg: VReg, VT: LocVT);
7466 if (ValVT.isScalarInteger() &&
7467 (ValVT.getFixedSizeInBits() < LocVT.getFixedSizeInBits())) {
7468 ArgValue =
7469 truncateScalarIntegerArg(Flags, ValVT, DAG, ArgValue, LocVT, dl);
7470 }
7471 InVals.push_back(Elt: ArgValue);
7472 continue;
7473 }
7474 if (VA.isMemLoc()) {
7475 HandleMemLoc();
7476 continue;
7477 }
7478 }
7479
7480 // On AIX a minimum of 8 words is saved to the parameter save area.
7481 const unsigned MinParameterSaveArea = 8 * PtrByteSize;
7482 // Area that is at least reserved in the caller of this function.
7483 unsigned CallerReservedArea = std::max<unsigned>(
7484 a: CCInfo.getStackSize(), b: LinkageSize + MinParameterSaveArea);
7485
7486 // Set the size that is at least reserved in caller of this function. Tail
7487 // call optimized function's reserved stack space needs to be aligned so
7488 // that taking the difference between two stack areas will result in an
7489 // aligned stack.
7490 CallerReservedArea =
7491 EnsureStackAlignment(Lowering: Subtarget.getFrameLowering(), NumBytes: CallerReservedArea);
7492 FuncInfo->setMinReservedArea(CallerReservedArea);
7493
7494 if (isVarArg) {
7495 int VAListIndex = 0;
7496 // If any of the optional arguments are passed in register then the fixed
7497 // stack object we spill into is not immutable. Create a fixed stack object
7498 // that overlaps the remainder of the parameter save area.
7499 if (CCInfo.getStackSize() < (LinkageSize + MinParameterSaveArea)) {
7500 unsigned FixedStackSize =
7501 LinkageSize + MinParameterSaveArea - CCInfo.getStackSize();
7502 VAListIndex =
7503 MFI.CreateFixedObject(Size: FixedStackSize, SPOffset: CCInfo.getStackSize(),
7504 /* IsImmutable */ false, /* IsAliased */ isAliased: true);
7505 } else {
7506 // All the arguments passed through ellipses are on the stack. Create a
7507 // dummy fixed stack object the same size as a pointer since we don't
7508 // know the actual size.
7509 VAListIndex =
7510 MFI.CreateFixedObject(Size: PtrByteSize, SPOffset: CCInfo.getStackSize(),
7511 /* IsImmutable */ true, /* IsAliased */ isAliased: true);
7512 }
7513
7514 FuncInfo->setVarArgsFrameIndex(VAListIndex);
7515 SDValue FIN = DAG.getFrameIndex(FI: VAListIndex, VT: PtrVT);
7516
7517 static const MCPhysReg GPR_32[] = {PPC::R3, PPC::R4, PPC::R5, PPC::R6,
7518 PPC::R7, PPC::R8, PPC::R9, PPC::R10};
7519
7520 static const MCPhysReg GPR_64[] = {PPC::X3, PPC::X4, PPC::X5, PPC::X6,
7521 PPC::X7, PPC::X8, PPC::X9, PPC::X10};
7522 const unsigned NumGPArgRegs = std::size(IsPPC64 ? GPR_64 : GPR_32);
7523
7524 // The fixed integer arguments of a variadic function are stored to the
7525 // VarArgsFrameIndex on the stack so that they may be loaded by
7526 // dereferencing the result of va_next.
7527 for (unsigned
7528 GPRIndex = (CCInfo.getStackSize() - LinkageSize) / PtrByteSize,
7529 Offset = 0;
7530 GPRIndex < NumGPArgRegs; ++GPRIndex, Offset += PtrByteSize) {
7531
7532 const Register VReg =
7533 IsPPC64 ? MF.addLiveIn(PReg: GPR_64[GPRIndex], RC: &PPC::G8RCRegClass)
7534 : MF.addLiveIn(PReg: GPR_32[GPRIndex], RC: &PPC::GPRCRegClass);
7535
7536 SDValue Val = DAG.getCopyFromReg(Chain, dl, Reg: VReg, VT: PtrVT);
7537 MachinePointerInfo MPI =
7538 MachinePointerInfo::getFixedStack(MF, FI: VAListIndex, Offset);
7539 SDValue Store = DAG.getStore(Chain: Val.getValue(R: 1), dl, Val, Ptr: FIN, PtrInfo: MPI);
7540 MemOps.push_back(Elt: Store);
7541 // Increment the address for the next argument to store.
7542 SDValue PtrOff = DAG.getConstant(Val: PtrByteSize, DL: dl, VT: PtrVT);
7543 FIN = DAG.getNode(Opcode: ISD::ADD, DL: dl, VT: PtrOff.getValueType(), N1: FIN, N2: PtrOff);
7544 }
7545 }
7546
7547 if (!MemOps.empty())
7548 Chain = DAG.getNode(Opcode: ISD::TokenFactor, DL: dl, VT: MVT::Other, Ops: MemOps);
7549
7550 return Chain;
7551}
7552
7553SDValue PPCTargetLowering::LowerCall_AIX(
7554 SDValue Chain, SDValue Callee, CallFlags CFlags,
7555 const SmallVectorImpl<ISD::OutputArg> &Outs,
7556 const SmallVectorImpl<SDValue> &OutVals,
7557 const SmallVectorImpl<ISD::InputArg> &Ins, const SDLoc &dl,
7558 SelectionDAG &DAG, SmallVectorImpl<SDValue> &InVals,
7559 const CallBase *CB) const {
7560 // See PPCTargetLowering::LowerFormalArguments_AIX() for a description of the
7561 // AIX ABI stack frame layout.
7562
7563 assert((CFlags.CallConv == CallingConv::C ||
7564 CFlags.CallConv == CallingConv::Cold ||
7565 CFlags.CallConv == CallingConv::Fast) &&
7566 "Unexpected calling convention!");
7567
7568 if (CFlags.IsPatchPoint)
7569 report_fatal_error(reason: "This call type is unimplemented on AIX.");
7570
7571 const PPCSubtarget &Subtarget = DAG.getSubtarget<PPCSubtarget>();
7572
7573 MachineFunction &MF = DAG.getMachineFunction();
7574 SmallVector<CCValAssign, 16> ArgLocs;
7575 CCState CCInfo(CFlags.CallConv, CFlags.IsVarArg, MF, ArgLocs,
7576 *DAG.getContext());
7577
7578 // Reserve space for the linkage save area (LSA) on the stack.
7579 // In both PPC32 and PPC64 there are 6 reserved slots in the LSA:
7580 // [SP][CR][LR][2 x reserved][TOC].
7581 // The LSA is 24 bytes (6x4) in PPC32 and 48 bytes (6x8) in PPC64.
7582 const unsigned LinkageSize = Subtarget.getFrameLowering()->getLinkageSize();
7583 const bool IsPPC64 = Subtarget.isPPC64();
7584 const EVT PtrVT = getPointerTy(DL: DAG.getDataLayout());
7585 const unsigned PtrByteSize = IsPPC64 ? 8 : 4;
7586 CCInfo.AllocateStack(Size: LinkageSize, Alignment: Align(PtrByteSize));
7587 CCInfo.AnalyzeCallOperands(Outs, Fn: CC_AIX);
7588
7589 // The prolog code of the callee may store up to 8 GPR argument registers to
7590 // the stack, allowing va_start to index over them in memory if the callee
7591 // is variadic.
7592 // Because we cannot tell if this is needed on the caller side, we have to
7593 // conservatively assume that it is needed. As such, make sure we have at
7594 // least enough stack space for the caller to store the 8 GPRs.
7595 const unsigned MinParameterSaveAreaSize = 8 * PtrByteSize;
7596 const unsigned NumBytes = std::max<unsigned>(
7597 a: LinkageSize + MinParameterSaveAreaSize, b: CCInfo.getStackSize());
7598
7599 // Adjust the stack pointer for the new arguments...
7600 // These operations are automatically eliminated by the prolog/epilog pass.
7601 Chain = DAG.getCALLSEQ_START(Chain, InSize: NumBytes, OutSize: 0, DL: dl);
7602 SDValue CallSeqStart = Chain;
7603
7604 SmallVector<std::pair<unsigned, SDValue>, 8> RegsToPass;
7605 SmallVector<SDValue, 8> MemOpChains;
7606
7607 // Set up a copy of the stack pointer for loading and storing any
7608 // arguments that may not fit in the registers available for argument
7609 // passing.
7610 const SDValue StackPtr = IsPPC64 ? DAG.getRegister(Reg: PPC::X1, VT: MVT::i64)
7611 : DAG.getRegister(Reg: PPC::R1, VT: MVT::i32);
7612
7613 for (unsigned I = 0, E = ArgLocs.size(); I != E;) {
7614 const unsigned ValNo = ArgLocs[I].getValNo();
7615 SDValue Arg = OutVals[ValNo];
7616 ISD::ArgFlagsTy Flags = Outs[ValNo].Flags;
7617
7618 if (Flags.isByVal()) {
7619 const unsigned ByValSize = Flags.getByValSize();
7620
7621 // Nothing to do for zero-sized ByVals on the caller side.
7622 if (!ByValSize) {
7623 ++I;
7624 continue;
7625 }
7626
7627 auto GetLoad = [&](EVT VT, unsigned LoadOffset) {
7628 return DAG.getExtLoad(ExtType: ISD::ZEXTLOAD, dl, VT: PtrVT, Chain,
7629 Ptr: (LoadOffset != 0)
7630 ? DAG.getObjectPtrOffset(
7631 SL: dl, Ptr: Arg, Offset: TypeSize::getFixed(ExactSize: LoadOffset))
7632 : Arg,
7633 PtrInfo: MachinePointerInfo(), MemVT: VT);
7634 };
7635
7636 unsigned LoadOffset = 0;
7637
7638 // Initialize registers, which are fully occupied by the by-val argument.
7639 while (LoadOffset + PtrByteSize <= ByValSize && ArgLocs[I].isRegLoc()) {
7640 SDValue Load = GetLoad(PtrVT, LoadOffset);
7641 MemOpChains.push_back(Elt: Load.getValue(R: 1));
7642 LoadOffset += PtrByteSize;
7643 const CCValAssign &ByValVA = ArgLocs[I++];
7644 assert(ByValVA.getValNo() == ValNo &&
7645 "Unexpected location for pass-by-value argument.");
7646 RegsToPass.push_back(Elt: std::make_pair(x: ByValVA.getLocReg(), y&: Load));
7647 }
7648
7649 if (LoadOffset == ByValSize)
7650 continue;
7651
7652 // There must be one more loc to handle the remainder.
7653 assert(ArgLocs[I].getValNo() == ValNo &&
7654 "Expected additional location for by-value argument.");
7655
7656 if (ArgLocs[I].isMemLoc()) {
7657 assert(LoadOffset < ByValSize && "Unexpected memloc for by-val arg.");
7658 const CCValAssign &ByValVA = ArgLocs[I++];
7659 ISD::ArgFlagsTy MemcpyFlags = Flags;
7660 // Only memcpy the bytes that don't pass in register.
7661 MemcpyFlags.setByValSize(ByValSize - LoadOffset);
7662 Chain = CallSeqStart = createMemcpyOutsideCallSeq(
7663 Arg: (LoadOffset != 0) ? DAG.getObjectPtrOffset(
7664 SL: dl, Ptr: Arg, Offset: TypeSize::getFixed(ExactSize: LoadOffset))
7665 : Arg,
7666 PtrOff: DAG.getObjectPtrOffset(
7667 SL: dl, Ptr: StackPtr, Offset: TypeSize::getFixed(ExactSize: ByValVA.getLocMemOffset())),
7668 CallSeqStart, Flags: MemcpyFlags, DAG, dl);
7669 continue;
7670 }
7671
7672 // Initialize the final register residue.
7673 // Any residue that occupies the final by-val arg register must be
7674 // left-justified on AIX. Loads must be a power-of-2 size and cannot be
7675 // larger than the ByValSize. For example: a 7 byte by-val arg requires 4,
7676 // 2 and 1 byte loads.
7677 const unsigned ResidueBytes = ByValSize % PtrByteSize;
7678 assert(ResidueBytes != 0 && LoadOffset + PtrByteSize > ByValSize &&
7679 "Unexpected register residue for by-value argument.");
7680 SDValue ResidueVal;
7681 for (unsigned Bytes = 0; Bytes != ResidueBytes;) {
7682 const unsigned N = llvm::bit_floor(Value: ResidueBytes - Bytes);
7683 const MVT VT =
7684 N == 1 ? MVT::i8
7685 : ((N == 2) ? MVT::i16 : (N == 4 ? MVT::i32 : MVT::i64));
7686 SDValue Load = GetLoad(VT, LoadOffset);
7687 MemOpChains.push_back(Elt: Load.getValue(R: 1));
7688 LoadOffset += N;
7689 Bytes += N;
7690
7691 // By-val arguments are passed left-justfied in register.
7692 // Every load here needs to be shifted, otherwise a full register load
7693 // should have been used.
7694 assert(PtrVT.getSimpleVT().getSizeInBits() > (Bytes * 8) &&
7695 "Unexpected load emitted during handling of pass-by-value "
7696 "argument.");
7697 unsigned NumSHLBits = PtrVT.getSimpleVT().getSizeInBits() - (Bytes * 8);
7698 EVT ShiftAmountTy =
7699 getShiftAmountTy(LHSTy: Load->getValueType(ResNo: 0), DL: DAG.getDataLayout());
7700 SDValue SHLAmt = DAG.getConstant(Val: NumSHLBits, DL: dl, VT: ShiftAmountTy);
7701 SDValue ShiftedLoad =
7702 DAG.getNode(Opcode: ISD::SHL, DL: dl, VT: Load.getValueType(), N1: Load, N2: SHLAmt);
7703 ResidueVal = ResidueVal ? DAG.getNode(Opcode: ISD::OR, DL: dl, VT: PtrVT, N1: ResidueVal,
7704 N2: ShiftedLoad)
7705 : ShiftedLoad;
7706 }
7707
7708 const CCValAssign &ByValVA = ArgLocs[I++];
7709 RegsToPass.push_back(Elt: std::make_pair(x: ByValVA.getLocReg(), y&: ResidueVal));
7710 continue;
7711 }
7712
7713 CCValAssign &VA = ArgLocs[I++];
7714 const MVT LocVT = VA.getLocVT();
7715 const MVT ValVT = VA.getValVT();
7716
7717 switch (VA.getLocInfo()) {
7718 default:
7719 report_fatal_error(reason: "Unexpected argument extension type.");
7720 case CCValAssign::Full:
7721 break;
7722 case CCValAssign::ZExt:
7723 Arg = DAG.getNode(Opcode: ISD::ZERO_EXTEND, DL: dl, VT: VA.getLocVT(), Operand: Arg);
7724 break;
7725 case CCValAssign::SExt:
7726 Arg = DAG.getNode(Opcode: ISD::SIGN_EXTEND, DL: dl, VT: VA.getLocVT(), Operand: Arg);
7727 break;
7728 }
7729
7730 if (VA.isRegLoc() && !VA.needsCustom()) {
7731 RegsToPass.push_back(Elt: std::make_pair(x: VA.getLocReg(), y&: Arg));
7732 continue;
7733 }
7734
7735 // Vector arguments passed to VarArg functions need custom handling when
7736 // they are passed (at least partially) in GPRs.
7737 if (VA.isMemLoc() && VA.needsCustom() && ValVT.isVector()) {
7738 assert(CFlags.IsVarArg && "Custom MemLocs only used for Vector args.");
7739 // Store value to its stack slot.
7740 SDValue PtrOff =
7741 DAG.getConstant(Val: VA.getLocMemOffset(), DL: dl, VT: StackPtr.getValueType());
7742 PtrOff = DAG.getNode(Opcode: ISD::ADD, DL: dl, VT: PtrVT, N1: StackPtr, N2: PtrOff);
7743 SDValue Store =
7744 DAG.getStore(Chain, dl, Val: Arg, Ptr: PtrOff, PtrInfo: MachinePointerInfo());
7745 MemOpChains.push_back(Elt: Store);
7746 const unsigned OriginalValNo = VA.getValNo();
7747 // Then load the GPRs from the stack
7748 unsigned LoadOffset = 0;
7749 auto HandleCustomVecRegLoc = [&]() {
7750 assert(I != E && "Unexpected end of CCvalAssigns.");
7751 assert(ArgLocs[I].isRegLoc() && ArgLocs[I].needsCustom() &&
7752 "Expected custom RegLoc.");
7753 CCValAssign RegVA = ArgLocs[I++];
7754 assert(RegVA.getValNo() == OriginalValNo &&
7755 "Custom MemLoc ValNo and custom RegLoc ValNo must match.");
7756 SDValue Add = DAG.getNode(Opcode: ISD::ADD, DL: dl, VT: PtrVT, N1: PtrOff,
7757 N2: DAG.getConstant(Val: LoadOffset, DL: dl, VT: PtrVT));
7758 SDValue Load = DAG.getLoad(VT: PtrVT, dl, Chain: Store, Ptr: Add, PtrInfo: MachinePointerInfo());
7759 MemOpChains.push_back(Elt: Load.getValue(R: 1));
7760 RegsToPass.push_back(Elt: std::make_pair(x: RegVA.getLocReg(), y&: Load));
7761 LoadOffset += PtrByteSize;
7762 };
7763
7764 // In 64-bit there will be exactly 2 custom RegLocs that follow, and in
7765 // in 32-bit there will be 2 custom RegLocs if we are passing in R9 and
7766 // R10.
7767 HandleCustomVecRegLoc();
7768 HandleCustomVecRegLoc();
7769
7770 if (I != E && ArgLocs[I].isRegLoc() && ArgLocs[I].needsCustom() &&
7771 ArgLocs[I].getValNo() == OriginalValNo) {
7772 assert(!IsPPC64 &&
7773 "Only 2 custom RegLocs expected for 64-bit codegen.");
7774 HandleCustomVecRegLoc();
7775 HandleCustomVecRegLoc();
7776 }
7777
7778 continue;
7779 }
7780
7781 if (VA.isMemLoc()) {
7782 SDValue PtrOff =
7783 DAG.getConstant(Val: VA.getLocMemOffset(), DL: dl, VT: StackPtr.getValueType());
7784 PtrOff = DAG.getNode(Opcode: ISD::ADD, DL: dl, VT: PtrVT, N1: StackPtr, N2: PtrOff);
7785 MemOpChains.push_back(
7786 Elt: DAG.getStore(Chain, dl, Val: Arg, Ptr: PtrOff,
7787 PtrInfo: MachinePointerInfo::getStack(MF, Offset: VA.getLocMemOffset()),
7788 Alignment: Subtarget.getFrameLowering()->getStackAlign()));
7789
7790 continue;
7791 }
7792
7793 if (!ValVT.isFloatingPoint())
7794 report_fatal_error(
7795 reason: "Unexpected register handling for calling convention.");
7796
7797 // Custom handling is used for GPR initializations for vararg float
7798 // arguments.
7799 assert(VA.isRegLoc() && VA.needsCustom() && CFlags.IsVarArg &&
7800 LocVT.isInteger() &&
7801 "Custom register handling only expected for VarArg.");
7802
7803 SDValue ArgAsInt =
7804 DAG.getBitcast(VT: MVT::getIntegerVT(BitWidth: ValVT.getSizeInBits()), V: Arg);
7805
7806 if (Arg.getValueType().getStoreSize() == LocVT.getStoreSize())
7807 // f32 in 32-bit GPR
7808 // f64 in 64-bit GPR
7809 RegsToPass.push_back(Elt: std::make_pair(x: VA.getLocReg(), y&: ArgAsInt));
7810 else if (Arg.getValueType().getFixedSizeInBits() <
7811 LocVT.getFixedSizeInBits())
7812 // f32 in 64-bit GPR.
7813 RegsToPass.push_back(Elt: std::make_pair(
7814 x: VA.getLocReg(), y: DAG.getZExtOrTrunc(Op: ArgAsInt, DL: dl, VT: LocVT)));
7815 else {
7816 // f64 in two 32-bit GPRs
7817 // The 2 GPRs are marked custom and expected to be adjacent in ArgLocs.
7818 assert(Arg.getValueType() == MVT::f64 && CFlags.IsVarArg && !IsPPC64 &&
7819 "Unexpected custom register for argument!");
7820 CCValAssign &GPR1 = VA;
7821 SDValue MSWAsI64 = DAG.getNode(Opcode: ISD::SRL, DL: dl, VT: MVT::i64, N1: ArgAsInt,
7822 N2: DAG.getConstant(Val: 32, DL: dl, VT: MVT::i8));
7823 RegsToPass.push_back(Elt: std::make_pair(
7824 x: GPR1.getLocReg(), y: DAG.getZExtOrTrunc(Op: MSWAsI64, DL: dl, VT: MVT::i32)));
7825
7826 if (I != E) {
7827 // If only 1 GPR was available, there will only be one custom GPR and
7828 // the argument will also pass in memory.
7829 CCValAssign &PeekArg = ArgLocs[I];
7830 if (PeekArg.isRegLoc() && PeekArg.getValNo() == PeekArg.getValNo()) {
7831 assert(PeekArg.needsCustom() && "A second custom GPR is expected.");
7832 CCValAssign &GPR2 = ArgLocs[I++];
7833 RegsToPass.push_back(Elt: std::make_pair(
7834 x: GPR2.getLocReg(), y: DAG.getZExtOrTrunc(Op: ArgAsInt, DL: dl, VT: MVT::i32)));
7835 }
7836 }
7837 }
7838 }
7839
7840 if (!MemOpChains.empty())
7841 Chain = DAG.getNode(Opcode: ISD::TokenFactor, DL: dl, VT: MVT::Other, Ops: MemOpChains);
7842
7843 // For indirect calls, we need to save the TOC base to the stack for
7844 // restoration after the call.
7845 if (CFlags.IsIndirect && !Subtarget.usePointerGlueHelper()) {
7846 assert(!CFlags.IsTailCall && "Indirect tail-calls not supported.");
7847 const MCRegister TOCBaseReg = Subtarget.getTOCPointerRegister();
7848 const MCRegister StackPtrReg = Subtarget.getStackPointerRegister();
7849 const MVT PtrVT = Subtarget.getScalarIntVT();
7850 const unsigned TOCSaveOffset =
7851 Subtarget.getFrameLowering()->getTOCSaveOffset();
7852
7853 setUsesTOCBasePtr(DAG);
7854 SDValue Val = DAG.getCopyFromReg(Chain, dl, Reg: TOCBaseReg, VT: PtrVT);
7855 SDValue PtrOff = DAG.getIntPtrConstant(Val: TOCSaveOffset, DL: dl);
7856 SDValue StackPtr = DAG.getRegister(Reg: StackPtrReg, VT: PtrVT);
7857 SDValue AddPtr = DAG.getNode(Opcode: ISD::ADD, DL: dl, VT: PtrVT, N1: StackPtr, N2: PtrOff);
7858 Chain = DAG.getStore(
7859 Chain: Val.getValue(R: 1), dl, Val, Ptr: AddPtr,
7860 PtrInfo: MachinePointerInfo::getStack(MF&: DAG.getMachineFunction(), Offset: TOCSaveOffset));
7861 }
7862
7863 // Build a sequence of copy-to-reg nodes chained together with token chain
7864 // and flag operands which copy the outgoing args into the appropriate regs.
7865 SDValue InGlue;
7866 for (auto Reg : RegsToPass) {
7867 Chain = DAG.getCopyToReg(Chain, dl, Reg: Reg.first, N: Reg.second, Glue: InGlue);
7868 InGlue = Chain.getValue(R: 1);
7869 }
7870
7871 const int SPDiff = 0;
7872 return FinishCall(CFlags, dl, DAG, RegsToPass, Glue: InGlue, Chain, CallSeqStart,
7873 Callee, SPDiff, NumBytes, Ins, InVals, CB);
7874}
7875
7876bool
7877PPCTargetLowering::CanLowerReturn(CallingConv::ID CallConv,
7878 MachineFunction &MF, bool isVarArg,
7879 const SmallVectorImpl<ISD::OutputArg> &Outs,
7880 LLVMContext &Context,
7881 const Type *RetTy) const {
7882 SmallVector<CCValAssign, 16> RVLocs;
7883 CCState CCInfo(CallConv, isVarArg, MF, RVLocs, Context);
7884 return CCInfo.CheckReturn(
7885 Outs, Fn: (Subtarget.isSVR4ABI() && CallConv == CallingConv::Cold)
7886 ? RetCC_PPC_Cold
7887 : RetCC_PPC);
7888}
7889
7890SDValue
7891PPCTargetLowering::LowerReturn(SDValue Chain, CallingConv::ID CallConv,
7892 bool isVarArg,
7893 const SmallVectorImpl<ISD::OutputArg> &Outs,
7894 const SmallVectorImpl<SDValue> &OutVals,
7895 const SDLoc &dl, SelectionDAG &DAG) const {
7896 SmallVector<CCValAssign, 16> RVLocs;
7897 CCState CCInfo(CallConv, isVarArg, DAG.getMachineFunction(), RVLocs,
7898 *DAG.getContext());
7899 CCInfo.AnalyzeReturn(Outs,
7900 Fn: (Subtarget.isSVR4ABI() && CallConv == CallingConv::Cold)
7901 ? RetCC_PPC_Cold
7902 : RetCC_PPC);
7903
7904 SDValue Glue;
7905 SmallVector<SDValue, 4> RetOps(1, Chain);
7906
7907 // Copy the result values into the output registers.
7908 for (unsigned i = 0, RealResIdx = 0; i != RVLocs.size(); ++i, ++RealResIdx) {
7909 CCValAssign &VA = RVLocs[i];
7910 assert(VA.isRegLoc() && "Can only return in registers!");
7911
7912 SDValue Arg = OutVals[RealResIdx];
7913
7914 switch (VA.getLocInfo()) {
7915 default: llvm_unreachable("Unknown loc info!");
7916 case CCValAssign::Full: break;
7917 case CCValAssign::AExt:
7918 Arg = DAG.getNode(Opcode: ISD::ANY_EXTEND, DL: dl, VT: VA.getLocVT(), Operand: Arg);
7919 break;
7920 case CCValAssign::ZExt:
7921 Arg = DAG.getNode(Opcode: ISD::ZERO_EXTEND, DL: dl, VT: VA.getLocVT(), Operand: Arg);
7922 break;
7923 case CCValAssign::SExt:
7924 Arg = DAG.getNode(Opcode: ISD::SIGN_EXTEND, DL: dl, VT: VA.getLocVT(), Operand: Arg);
7925 break;
7926 }
7927 if (Subtarget.hasSPE() && VA.getLocVT() == MVT::f64) {
7928 bool isLittleEndian = Subtarget.isLittleEndian();
7929 // Legalize ret f64 -> ret 2 x i32.
7930 SDValue SVal =
7931 DAG.getNode(Opcode: PPCISD::EXTRACT_SPE, DL: dl, VT: MVT::i32, N1: Arg,
7932 N2: DAG.getIntPtrConstant(Val: isLittleEndian ? 0 : 1, DL: dl));
7933 Chain = DAG.getCopyToReg(Chain, dl, Reg: VA.getLocReg(), N: SVal, Glue);
7934 RetOps.push_back(Elt: DAG.getRegister(Reg: VA.getLocReg(), VT: VA.getLocVT()));
7935 SVal = DAG.getNode(Opcode: PPCISD::EXTRACT_SPE, DL: dl, VT: MVT::i32, N1: Arg,
7936 N2: DAG.getIntPtrConstant(Val: isLittleEndian ? 1 : 0, DL: dl));
7937 Glue = Chain.getValue(R: 1);
7938 VA = RVLocs[++i]; // skip ahead to next loc
7939 Chain = DAG.getCopyToReg(Chain, dl, Reg: VA.getLocReg(), N: SVal, Glue);
7940 } else
7941 Chain = DAG.getCopyToReg(Chain, dl, Reg: VA.getLocReg(), N: Arg, Glue);
7942 Glue = Chain.getValue(R: 1);
7943 RetOps.push_back(Elt: DAG.getRegister(Reg: VA.getLocReg(), VT: VA.getLocVT()));
7944 }
7945
7946 RetOps[0] = Chain; // Update chain.
7947
7948 // Add the glue if we have it.
7949 if (Glue.getNode())
7950 RetOps.push_back(Elt: Glue);
7951
7952 return DAG.getNode(Opcode: PPCISD::RET_GLUE, DL: dl, VT: MVT::Other, Ops: RetOps);
7953}
7954
7955SDValue
7956PPCTargetLowering::LowerGET_DYNAMIC_AREA_OFFSET(SDValue Op,
7957 SelectionDAG &DAG) const {
7958 SDLoc dl(Op);
7959
7960 // Get the correct type for integers.
7961 EVT IntVT = Op.getValueType();
7962
7963 // Get the inputs.
7964 SDValue Chain = Op.getOperand(i: 0);
7965 SDValue FPSIdx = getFramePointerFrameIndex(DAG);
7966 // Build a DYNAREAOFFSET node.
7967 SDValue Ops[2] = {Chain, FPSIdx};
7968 SDVTList VTs = DAG.getVTList(VT: IntVT);
7969 return DAG.getNode(Opcode: PPCISD::DYNAREAOFFSET, DL: dl, VTList: VTs, Ops);
7970}
7971
7972SDValue PPCTargetLowering::LowerSTACKRESTORE(SDValue Op,
7973 SelectionDAG &DAG) const {
7974 // When we pop the dynamic allocation we need to restore the SP link.
7975 SDLoc dl(Op);
7976
7977 // Get the correct type for pointers.
7978 EVT PtrVT = getPointerTy(DL: DAG.getDataLayout());
7979
7980 // Construct the stack pointer operand.
7981 bool isPPC64 = Subtarget.isPPC64();
7982 unsigned SP = isPPC64 ? PPC::X1 : PPC::R1;
7983 SDValue StackPtr = DAG.getRegister(Reg: SP, VT: PtrVT);
7984
7985 // Get the operands for the STACKRESTORE.
7986 SDValue Chain = Op.getOperand(i: 0);
7987 SDValue SaveSP = Op.getOperand(i: 1);
7988
7989 // Load the old link SP.
7990 SDValue LoadLinkSP =
7991 DAG.getLoad(VT: PtrVT, dl, Chain, Ptr: StackPtr, PtrInfo: MachinePointerInfo());
7992
7993 // Restore the stack pointer.
7994 Chain = DAG.getCopyToReg(Chain: LoadLinkSP.getValue(R: 1), dl, Reg: SP, N: SaveSP);
7995
7996 // Store the old link SP.
7997 return DAG.getStore(Chain, dl, Val: LoadLinkSP, Ptr: StackPtr, PtrInfo: MachinePointerInfo());
7998}
7999
8000SDValue PPCTargetLowering::getReturnAddrFrameIndex(SelectionDAG &DAG) const {
8001 MachineFunction &MF = DAG.getMachineFunction();
8002 bool isPPC64 = Subtarget.isPPC64();
8003 EVT PtrVT = getPointerTy(DL: MF.getDataLayout());
8004
8005 // Get current frame pointer save index. The users of this index will be
8006 // primarily DYNALLOC instructions.
8007 PPCFunctionInfo *FI = MF.getInfo<PPCFunctionInfo>();
8008 int RASI = FI->getReturnAddrSaveIndex();
8009
8010 // If the frame pointer save index hasn't been defined yet.
8011 if (!RASI) {
8012 // Find out what the fix offset of the frame pointer save area.
8013 int LROffset = Subtarget.getFrameLowering()->getReturnSaveOffset();
8014 // Allocate the frame index for frame pointer save area.
8015 RASI = MF.getFrameInfo().CreateFixedObject(Size: isPPC64? 8 : 4, SPOffset: LROffset, IsImmutable: false);
8016 // Save the result.
8017 FI->setReturnAddrSaveIndex(RASI);
8018 }
8019 return DAG.getFrameIndex(FI: RASI, VT: PtrVT);
8020}
8021
8022SDValue
8023PPCTargetLowering::getFramePointerFrameIndex(SelectionDAG & DAG) const {
8024 MachineFunction &MF = DAG.getMachineFunction();
8025 bool isPPC64 = Subtarget.isPPC64();
8026 EVT PtrVT = getPointerTy(DL: MF.getDataLayout());
8027
8028 // Get current frame pointer save index. The users of this index will be
8029 // primarily DYNALLOC instructions.
8030 PPCFunctionInfo *FI = MF.getInfo<PPCFunctionInfo>();
8031 int FPSI = FI->getFramePointerSaveIndex();
8032
8033 // If the frame pointer save index hasn't been defined yet.
8034 if (!FPSI) {
8035 // Find out what the fix offset of the frame pointer save area.
8036 int FPOffset = Subtarget.getFrameLowering()->getFramePointerSaveOffset();
8037 // Allocate the frame index for frame pointer save area.
8038 FPSI = MF.getFrameInfo().CreateFixedObject(Size: isPPC64? 8 : 4, SPOffset: FPOffset, IsImmutable: true);
8039 // Save the result.
8040 FI->setFramePointerSaveIndex(FPSI);
8041 }
8042 return DAG.getFrameIndex(FI: FPSI, VT: PtrVT);
8043}
8044
8045SDValue PPCTargetLowering::LowerDYNAMIC_STACKALLOC(SDValue Op,
8046 SelectionDAG &DAG) const {
8047 MachineFunction &MF = DAG.getMachineFunction();
8048 // Get the inputs.
8049 SDValue Chain = Op.getOperand(i: 0);
8050 SDValue Size = Op.getOperand(i: 1);
8051 SDLoc dl(Op);
8052
8053 // Get the correct type for pointers.
8054 EVT PtrVT = getPointerTy(DL: DAG.getDataLayout());
8055 // Negate the size.
8056 SDValue NegSize = DAG.getNode(Opcode: ISD::SUB, DL: dl, VT: PtrVT,
8057 N1: DAG.getConstant(Val: 0, DL: dl, VT: PtrVT), N2: Size);
8058 // Construct a node for the frame pointer save index.
8059 SDValue FPSIdx = getFramePointerFrameIndex(DAG);
8060 SDValue Ops[3] = { Chain, NegSize, FPSIdx };
8061 SDVTList VTs = DAG.getVTList(VT1: PtrVT, VT2: MVT::Other);
8062 if (hasInlineStackProbe(MF))
8063 return DAG.getNode(Opcode: PPCISD::PROBED_ALLOCA, DL: dl, VTList: VTs, Ops);
8064 return DAG.getNode(Opcode: PPCISD::DYNALLOC, DL: dl, VTList: VTs, Ops);
8065}
8066
8067SDValue PPCTargetLowering::LowerEH_DWARF_CFA(SDValue Op,
8068 SelectionDAG &DAG) const {
8069 MachineFunction &MF = DAG.getMachineFunction();
8070
8071 bool isPPC64 = Subtarget.isPPC64();
8072 EVT PtrVT = getPointerTy(DL: DAG.getDataLayout());
8073
8074 int FI = MF.getFrameInfo().CreateFixedObject(Size: isPPC64 ? 8 : 4, SPOffset: 0, IsImmutable: false);
8075 return DAG.getFrameIndex(FI, VT: PtrVT);
8076}
8077
8078SDValue PPCTargetLowering::lowerEH_SJLJ_SETJMP(SDValue Op,
8079 SelectionDAG &DAG) const {
8080 SDLoc DL(Op);
8081 return DAG.getNode(Opcode: PPCISD::EH_SJLJ_SETJMP, DL,
8082 VTList: DAG.getVTList(VT1: MVT::i32, VT2: MVT::Other),
8083 N1: Op.getOperand(i: 0), N2: Op.getOperand(i: 1));
8084}
8085
8086SDValue PPCTargetLowering::lowerEH_SJLJ_LONGJMP(SDValue Op,
8087 SelectionDAG &DAG) const {
8088 SDLoc DL(Op);
8089 return DAG.getNode(Opcode: PPCISD::EH_SJLJ_LONGJMP, DL, VT: MVT::Other,
8090 N1: Op.getOperand(i: 0), N2: Op.getOperand(i: 1));
8091}
8092
8093SDValue PPCTargetLowering::LowerLOAD(SDValue Op, SelectionDAG &DAG) const {
8094 if (Op.getValueType().isVector())
8095 return LowerVectorLoad(Op, DAG);
8096
8097 assert(Op.getValueType() == MVT::i1 &&
8098 "Custom lowering only for i1 loads");
8099
8100 // First, load 8 bits into 32 bits, then truncate to 1 bit.
8101
8102 SDLoc dl(Op);
8103 LoadSDNode *LD = cast<LoadSDNode>(Val&: Op);
8104
8105 SDValue Chain = LD->getChain();
8106 SDValue BasePtr = LD->getBasePtr();
8107 MachineMemOperand *MMO = LD->getMemOperand();
8108
8109 SDValue NewLD =
8110 DAG.getExtLoad(ExtType: ISD::EXTLOAD, dl, VT: getPointerTy(DL: DAG.getDataLayout()), Chain,
8111 Ptr: BasePtr, MemVT: MVT::i8, MMO);
8112 SDValue Result = DAG.getNode(Opcode: ISD::TRUNCATE, DL: dl, VT: MVT::i1, Operand: NewLD);
8113
8114 SDValue Ops[] = { Result, SDValue(NewLD.getNode(), 1) };
8115 return DAG.getMergeValues(Ops, dl);
8116}
8117
8118SDValue PPCTargetLowering::LowerSTORE(SDValue Op, SelectionDAG &DAG) const {
8119 if (Op.getOperand(i: 1).getValueType().isVector())
8120 return LowerVectorStore(Op, DAG);
8121
8122 assert(Op.getOperand(1).getValueType() == MVT::i1 &&
8123 "Custom lowering only for i1 stores");
8124
8125 // First, zero extend to 32 bits, then use a truncating store to 8 bits.
8126
8127 SDLoc dl(Op);
8128 StoreSDNode *ST = cast<StoreSDNode>(Val&: Op);
8129
8130 SDValue Chain = ST->getChain();
8131 SDValue BasePtr = ST->getBasePtr();
8132 SDValue Value = ST->getValue();
8133 MachineMemOperand *MMO = ST->getMemOperand();
8134
8135 Value = DAG.getNode(Opcode: ISD::ZERO_EXTEND, DL: dl, VT: getPointerTy(DL: DAG.getDataLayout()),
8136 Operand: Value);
8137 return DAG.getTruncStore(Chain, dl, Val: Value, Ptr: BasePtr, SVT: MVT::i8, MMO);
8138}
8139
8140// FIXME: Remove this once the ANDI glue bug is fixed:
8141SDValue PPCTargetLowering::LowerTRUNCATE(SDValue Op, SelectionDAG &DAG) const {
8142 assert(Op.getValueType() == MVT::i1 &&
8143 "Custom lowering only for i1 results");
8144
8145 SDLoc DL(Op);
8146 return DAG.getNode(Opcode: PPCISD::ANDI_rec_1_GT_BIT, DL, VT: MVT::i1, Operand: Op.getOperand(i: 0));
8147}
8148
8149SDValue PPCTargetLowering::LowerTRUNCATEVector(SDValue Op,
8150 SelectionDAG &DAG) const {
8151
8152 // Implements a vector truncate that fits in a vector register as a shuffle.
8153 // We want to legalize vector truncates down to where the source fits in
8154 // a vector register (and target is therefore smaller than vector register
8155 // size). At that point legalization will try to custom lower the sub-legal
8156 // result and get here - where we can contain the truncate as a single target
8157 // operation.
8158
8159 // For example a trunc <2 x i16> to <2 x i8> could be visualized as follows:
8160 // <MSB1|LSB1, MSB2|LSB2> to <LSB1, LSB2>
8161 //
8162 // We will implement it for big-endian ordering as this (where x denotes
8163 // undefined):
8164 // < MSB1|LSB1, MSB2|LSB2, uu, uu, uu, uu, uu, uu> to
8165 // < LSB1, LSB2, u, u, u, u, u, u, u, u, u, u, u, u, u, u>
8166 //
8167 // The same operation in little-endian ordering will be:
8168 // <uu, uu, uu, uu, uu, uu, LSB2|MSB2, LSB1|MSB1> to
8169 // <u, u, u, u, u, u, u, u, u, u, u, u, u, u, LSB2, LSB1>
8170
8171 EVT TrgVT = Op.getValueType();
8172 assert(TrgVT.isVector() && "Vector type expected.");
8173 unsigned TrgNumElts = TrgVT.getVectorNumElements();
8174 EVT EltVT = TrgVT.getVectorElementType();
8175 if (!isOperationCustom(Op: Op.getOpcode(), VT: TrgVT) ||
8176 TrgVT.getSizeInBits() > 128 || !isPowerOf2_32(Value: TrgNumElts) ||
8177 !llvm::has_single_bit<uint32_t>(Value: EltVT.getSizeInBits()))
8178 return SDValue();
8179
8180 SDValue N1 = Op.getOperand(i: 0);
8181 EVT SrcVT = N1.getValueType();
8182 unsigned SrcSize = SrcVT.getSizeInBits();
8183 if (SrcSize > 256 || !isPowerOf2_32(Value: SrcVT.getVectorNumElements()) ||
8184 !llvm::has_single_bit<uint32_t>(
8185 Value: SrcVT.getVectorElementType().getSizeInBits()))
8186 return SDValue();
8187 if (SrcSize == 256 && SrcVT.getVectorNumElements() < 2)
8188 return SDValue();
8189
8190 unsigned WideNumElts = 128 / EltVT.getSizeInBits();
8191 EVT WideVT = EVT::getVectorVT(Context&: *DAG.getContext(), VT: EltVT, NumElements: WideNumElts);
8192
8193 SDLoc DL(Op);
8194 SDValue Op1, Op2;
8195 if (SrcSize == 256) {
8196 EVT VecIdxTy = getVectorIdxTy(DL: DAG.getDataLayout());
8197 EVT SplitVT =
8198 N1.getValueType().getHalfNumVectorElementsVT(Context&: *DAG.getContext());
8199 unsigned SplitNumElts = SplitVT.getVectorNumElements();
8200 Op1 = DAG.getNode(Opcode: ISD::EXTRACT_SUBVECTOR, DL, VT: SplitVT, N1,
8201 N2: DAG.getConstant(Val: 0, DL, VT: VecIdxTy));
8202 Op2 = DAG.getNode(Opcode: ISD::EXTRACT_SUBVECTOR, DL, VT: SplitVT, N1,
8203 N2: DAG.getConstant(Val: SplitNumElts, DL, VT: VecIdxTy));
8204 }
8205 else {
8206 Op1 = SrcSize == 128 ? N1 : widenVec(DAG, Vec: N1, dl: DL);
8207 Op2 = DAG.getUNDEF(VT: WideVT);
8208 }
8209
8210 // First list the elements we want to keep.
8211 unsigned SizeMult = SrcSize / TrgVT.getSizeInBits();
8212 SmallVector<int, 16> ShuffV;
8213 if (Subtarget.isLittleEndian())
8214 for (unsigned i = 0; i < TrgNumElts; ++i)
8215 ShuffV.push_back(Elt: i * SizeMult);
8216 else
8217 for (unsigned i = 1; i <= TrgNumElts; ++i)
8218 ShuffV.push_back(Elt: i * SizeMult - 1);
8219
8220 // Populate the remaining elements with undefs.
8221 for (unsigned i = TrgNumElts; i < WideNumElts; ++i)
8222 // ShuffV.push_back(i + WideNumElts);
8223 ShuffV.push_back(Elt: WideNumElts + 1);
8224
8225 Op1 = DAG.getNode(Opcode: ISD::BITCAST, DL, VT: WideVT, Operand: Op1);
8226 Op2 = DAG.getNode(Opcode: ISD::BITCAST, DL, VT: WideVT, Operand: Op2);
8227 return DAG.getVectorShuffle(VT: WideVT, dl: DL, N1: Op1, N2: Op2, Mask: ShuffV);
8228}
8229
8230/// LowerSELECT_CC - Lower floating point select_cc's into fsel instruction when
8231/// possible.
///
/// Operand layout read below: 0 = compare LHS, 1 = compare RHS, 2 = value
/// selected when the condition holds (TV), 3 = value selected otherwise (FV),
/// 4 = condition code.
///
/// Returns \p Op unchanged whenever none of the special cases applies.
8232SDValue PPCTargetLowering::LowerSELECT_CC(SDValue Op, SelectionDAG &DAG) const {
8233 ISD::CondCode CC = cast<CondCodeSDNode>(Val: Op.getOperand(i: 4))->get();
8234 EVT ResVT = Op.getValueType();
8235 EVT CmpVT = Op.getOperand(i: 0).getValueType();
8236 SDValue LHS = Op.getOperand(i: 0), RHS = Op.getOperand(i: 1);
8237 SDValue TV = Op.getOperand(i: 2), FV = Op.getOperand(i: 3);
8238 SDLoc dl(Op);
8239
8240 // Without power9-vector, we don't have native instruction for f128 comparison.
8241 // Following transformation to libcall is needed for setcc:
8242 // select_cc lhs, rhs, tv, fv, cc -> select_cc (setcc cc, x, y), 0, tv, fv, NE
8243 if (!Subtarget.hasP9Vector() && CmpVT == MVT::f128) {
8244 SDValue Z = DAG.getSetCC(
8245 DL: dl, VT: getSetCCResultType(DL: DAG.getDataLayout(), C&: *DAG.getContext(), VT: CmpVT),
8246 LHS, RHS, Cond: CC);
8247 SDValue Zero = DAG.getConstant(Val: 0, DL: dl, VT: Z.getValueType());
8248 return DAG.getSelectCC(DL: dl, LHS: Z, RHS: Zero, True: TV, False: FV, Cond: ISD::SETNE);
8249 }
8250
8251 // Not FP, or using SPE? Not a fsel.
// Both the compared type and the selected type must be floating point.
8252 if (!CmpVT.isFloatingPoint() || !TV.getValueType().isFloatingPoint() ||
8253 Subtarget.hasSPE())
8254 return Op;
8255
// The node's flags gate the fsel lowering below and are also forwarded to the
// FSUB nodes created in the general case.
8256 SDNodeFlags Flags = Op.getNode()->getFlags();
8257
8258 // We have xsmaxc[dq]p/xsminc[dq]p which are OK to emit even in the
8259 // presence of infinities.
// Only applies when the select returns its own compare operands, i.e. the
// node has the shape "lhs CC rhs ? lhs : rhs".
8260 if (Subtarget.hasP9Vector() && LHS == TV && RHS == FV) {
8261 switch (CC) {
8262 default:
8263 break;
8264 case ISD::SETOGT:
8265 case ISD::SETGT:
8266 return DAG.getNode(Opcode: PPCISD::XSMAXC, DL: dl, VT: Op.getValueType(), N1: LHS, N2: RHS);
8267 case ISD::SETOLT:
8268 case ISD::SETLT:
8269 return DAG.getNode(Opcode: PPCISD::XSMINC, DL: dl, VT: Op.getValueType(), N1: LHS, N2: RHS);
8270 }
8271 }
8272
8273 // We might be able to do better than this under some circumstances, but in
8274 // general, fsel-based lowering of select is a finite-math-only optimization.
8275 // For more information, see section F.3 of the 2.06 ISA specification.
8276 // With ISA 3.0
// Both the no-infs and the no-NaNs flag must be set on the node, and an f128
// result is never lowered to fsel.
8277 if (!Flags.hasNoInfs() || !Flags.hasNoNaNs() || ResVT == MVT::f128)
8278 return Op;
8279
8280 // If the RHS of the comparison is a 0.0, we don't need to do the
8281 // subtraction at all.
// Every non-default case in this switch returns, so the TV/FV swaps done here
// never reach the general switch below; only the default case falls out of it.
8282 SDValue Sel1;
8283 if (isFloatingPointZero(Op: RHS))
8284 switch (CC) {
8285 default: break; // SETUO etc aren't handled by fsel.
8286 case ISD::SETNE:
8287 std::swap(a&: TV, b&: FV);
8288 [[fallthrough]];
8289 case ISD::SETEQ:
// Equality takes two chained fsels: the first tests LHS, the second tests
// -LHS and selects between the first result and FV.
8290 if (LHS.getValueType() == MVT::f32) // Comparison is always 64-bits
8291 LHS = DAG.getNode(Opcode: ISD::FP_EXTEND, DL: dl, VT: MVT::f64, Operand: LHS);
8292 Sel1 = DAG.getNode(Opcode: PPCISD::FSEL, DL: dl, VT: ResVT, N1: LHS, N2: TV, N3: FV);
8293 if (Sel1.getValueType() == MVT::f32) // Comparison is always 64-bits
8294 Sel1 = DAG.getNode(Opcode: ISD::FP_EXTEND, DL: dl, VT: MVT::f64, Operand: Sel1);
8295 return DAG.getNode(Opcode: PPCISD::FSEL, DL: dl, VT: ResVT,
8296 N1: DAG.getNode(Opcode: ISD::FNEG, DL: dl, VT: MVT::f64, Operand: LHS), N2: Sel1, N3: FV);
8297 case ISD::SETULT:
8298 case ISD::SETLT:
8299 std::swap(a&: TV, b&: FV); // fsel is natively setge, swap operands for setlt
8300 [[fallthrough]];
8301 case ISD::SETOGE:
8302 case ISD::SETGE:
8303 if (LHS.getValueType() == MVT::f32) // Comparison is always 64-bits
8304 LHS = DAG.getNode(Opcode: ISD::FP_EXTEND, DL: dl, VT: MVT::f64, Operand: LHS);
8305 return DAG.getNode(Opcode: PPCISD::FSEL, DL: dl, VT: ResVT, N1: LHS, N2: TV, N3: FV);
8306 case ISD::SETUGT:
8307 case ISD::SETGT:
8308 std::swap(a&: TV, b&: FV); // fsel is natively setge, swap operands for setlt
8309 [[fallthrough]];
8310 case ISD::SETOLE:
8311 case ISD::SETLE:
// The "less or equal" family selects on the negated LHS.
8312 if (LHS.getValueType() == MVT::f32) // Comparison is always 64-bits
8313 LHS = DAG.getNode(Opcode: ISD::FP_EXTEND, DL: dl, VT: MVT::f64, Operand: LHS);
8314 return DAG.getNode(Opcode: PPCISD::FSEL, DL: dl, VT: ResVT,
8315 N1: DAG.getNode(Opcode: ISD::FNEG, DL: dl, VT: MVT::f64, Operand: LHS), N2: TV, N3: FV);
8316 }
8317
// General case: build a difference of the compare operands with FSUB
// (carrying the node's flags) and select on it. LT/GE use LHS - RHS, GT/LE use
// RHS - LHS; LT and GT pass FV/TV to FSEL in swapped order instead of swapping
// the variables.
8318 SDValue Cmp;
8319 switch (CC) {
8320 default: break; // SETUO etc aren't handled by fsel.
8321 case ISD::SETNE:
8322 std::swap(a&: TV, b&: FV);
8323 [[fallthrough]];
8324 case ISD::SETEQ:
8325 Cmp = DAG.getNode(Opcode: ISD::FSUB, DL: dl, VT: CmpVT, N1: LHS, N2: RHS, Flags);
8326 if (Cmp.getValueType() == MVT::f32) // Comparison is always 64-bits
8327 Cmp = DAG.getNode(Opcode: ISD::FP_EXTEND, DL: dl, VT: MVT::f64, Operand: Cmp);
8328 Sel1 = DAG.getNode(Opcode: PPCISD::FSEL, DL: dl, VT: ResVT, N1: Cmp, N2: TV, N3: FV);
8329 if (Sel1.getValueType() == MVT::f32) // Comparison is always 64-bits
8330 Sel1 = DAG.getNode(Opcode: ISD::FP_EXTEND, DL: dl, VT: MVT::f64, Operand: Sel1);
8331 return DAG.getNode(Opcode: PPCISD::FSEL, DL: dl, VT: ResVT,
8332 N1: DAG.getNode(Opcode: ISD::FNEG, DL: dl, VT: MVT::f64, Operand: Cmp), N2: Sel1, N3: FV);
8333 case ISD::SETULT:
8334 case ISD::SETLT:
8335 Cmp = DAG.getNode(Opcode: ISD::FSUB, DL: dl, VT: CmpVT, N1: LHS, N2: RHS, Flags);
8336 if (Cmp.getValueType() == MVT::f32) // Comparison is always 64-bits
8337 Cmp = DAG.getNode(Opcode: ISD::FP_EXTEND, DL: dl, VT: MVT::f64, Operand: Cmp);
8338 return DAG.getNode(Opcode: PPCISD::FSEL, DL: dl, VT: ResVT, N1: Cmp, N2: FV, N3: TV);
8339 case ISD::SETOGE:
8340 case ISD::SETGE:
8341 Cmp = DAG.getNode(Opcode: ISD::FSUB, DL: dl, VT: CmpVT, N1: LHS, N2: RHS, Flags);
8342 if (Cmp.getValueType() == MVT::f32) // Comparison is always 64-bits
8343 Cmp = DAG.getNode(Opcode: ISD::FP_EXTEND, DL: dl, VT: MVT::f64, Operand: Cmp);
8344 return DAG.getNode(Opcode: PPCISD::FSEL, DL: dl, VT: ResVT, N1: Cmp, N2: TV, N3: FV);
8345 case ISD::SETUGT:
8346 case ISD::SETGT:
8347 Cmp = DAG.getNode(Opcode: ISD::FSUB, DL: dl, VT: CmpVT, N1: RHS, N2: LHS, Flags);
8348 if (Cmp.getValueType() == MVT::f32) // Comparison is always 64-bits
8349 Cmp = DAG.getNode(Opcode: ISD::FP_EXTEND, DL: dl, VT: MVT::f64, Operand: Cmp);
8350 return DAG.getNode(Opcode: PPCISD::FSEL, DL: dl, VT: ResVT, N1: Cmp, N2: FV, N3: TV);
8351 case ISD::SETOLE:
8352 case ISD::SETLE:
8353 Cmp = DAG.getNode(Opcode: ISD::FSUB, DL: dl, VT: CmpVT, N1: RHS, N2: LHS, Flags);
8354 if (Cmp.getValueType() == MVT::f32) // Comparison is always 64-bits
8355 Cmp = DAG.getNode(Opcode: ISD::FP_EXTEND, DL: dl, VT: MVT::f64, Operand: Cmp);
8356 return DAG.getNode(Opcode: PPCISD::FSEL, DL: dl, VT: ResVT, N1: Cmp, N2: TV, N3: FV);
8357 }
// Condition code not expressible with fsel: leave the node as it is.
8358 return Op;
8359}
8360
8361static unsigned getPPCStrictOpcode(unsigned Opc) {
8362 switch (Opc) {
8363 default:
8364 llvm_unreachable("No strict version of this opcode!");
8365 case PPCISD::FCTIDZ:
8366 return PPCISD::STRICT_FCTIDZ;
8367 case PPCISD::FCTIWZ:
8368 return PPCISD::STRICT_FCTIWZ;
8369 case PPCISD::FCTIDUZ:
8370 return PPCISD::STRICT_FCTIDUZ;
8371 case PPCISD::FCTIWUZ:
8372 return PPCISD::STRICT_FCTIWUZ;
8373 case PPCISD::FCFID:
8374 return PPCISD::STRICT_FCFID;
8375 case PPCISD::FCFIDU:
8376 return PPCISD::STRICT_FCFIDU;
8377 case PPCISD::FCFIDS:
8378 return PPCISD::STRICT_FCFIDS;
8379 case PPCISD::FCFIDUS:
8380 return PPCISD::STRICT_FCFIDUS;
8381 }
8382}
8383
8384static SDValue convertFPToInt(SDValue Op, SelectionDAG &DAG,
8385 const PPCSubtarget &Subtarget) {
8386 SDLoc dl(Op);
8387 bool IsStrict = Op->isStrictFPOpcode();
8388 bool IsSigned = Op.getOpcode() == ISD::FP_TO_SINT ||
8389 Op.getOpcode() == ISD::STRICT_FP_TO_SINT;
8390
8391 // TODO: Any other flags to propagate?
8392 SDNodeFlags Flags;
8393 Flags.setNoFPExcept(Op->getFlags().hasNoFPExcept());
8394
8395 // For strict nodes, source is the second operand.
8396 SDValue Src = Op.getOperand(i: IsStrict ? 1 : 0);
8397 SDValue Chain = IsStrict ? Op.getOperand(i: 0) : SDValue();
8398 MVT DestTy = Op.getSimpleValueType();
8399 assert(Src.getValueType().isFloatingPoint() &&
8400 (DestTy == MVT::i8 || DestTy == MVT::i16 || DestTy == MVT::i32 ||
8401 DestTy == MVT::i64) &&
8402 "Invalid FP_TO_INT types");
8403 if (Src.getValueType() == MVT::f32) {
8404 if (IsStrict) {
8405 Src =
8406 DAG.getNode(Opcode: ISD::STRICT_FP_EXTEND, DL: dl,
8407 VTList: DAG.getVTList(VT1: MVT::f64, VT2: MVT::Other), Ops: {Chain, Src}, Flags);
8408 Chain = Src.getValue(R: 1);
8409 } else
8410 Src = DAG.getNode(Opcode: ISD::FP_EXTEND, DL: dl, VT: MVT::f64, Operand: Src);
8411 }
8412 if ((DestTy == MVT::i8 || DestTy == MVT::i16) && Subtarget.hasP9Vector())
8413 DestTy = Subtarget.getScalarIntVT();
8414 unsigned Opc = ISD::DELETED_NODE;
8415 switch (DestTy.SimpleTy) {
8416 default: llvm_unreachable("Unhandled FP_TO_INT type in custom expander!");
8417 case MVT::i32:
8418 Opc = IsSigned ? PPCISD::FCTIWZ
8419 : (Subtarget.hasFPCVT() ? PPCISD::FCTIWUZ : PPCISD::FCTIDZ);
8420 break;
8421 case MVT::i64:
8422 assert((IsSigned || Subtarget.hasFPCVT()) &&
8423 "i64 FP_TO_UINT is supported only with FPCVT");
8424 Opc = IsSigned ? PPCISD::FCTIDZ : PPCISD::FCTIDUZ;
8425 }
8426 EVT ConvTy = Src.getValueType() == MVT::f128 ? MVT::f128 : MVT::f64;
8427 SDValue Conv;
8428 if (IsStrict) {
8429 Opc = getPPCStrictOpcode(Opc);
8430 Conv = DAG.getNode(Opcode: Opc, DL: dl, VTList: DAG.getVTList(VT1: ConvTy, VT2: MVT::Other), Ops: {Chain, Src},
8431 Flags);
8432 } else {
8433 Conv = DAG.getNode(Opcode: Opc, DL: dl, VT: ConvTy, Operand: Src);
8434 }
8435 return Conv;
8436}
8437
8438void PPCTargetLowering::LowerFP_TO_INTForReuse(SDValue Op, ReuseLoadInfo &RLI,
8439 SelectionDAG &DAG,
8440 const SDLoc &dl) const {
8441 SDValue Tmp = convertFPToInt(Op, DAG, Subtarget);
8442 bool IsSigned = Op.getOpcode() == ISD::FP_TO_SINT ||
8443 Op.getOpcode() == ISD::STRICT_FP_TO_SINT;
8444 bool IsStrict = Op->isStrictFPOpcode();
8445
8446 // Convert the FP value to an int value through memory.
8447 bool i32Stack = Op.getValueType() == MVT::i32 && Subtarget.hasSTFIWX() &&
8448 (IsSigned || Subtarget.hasFPCVT());
8449 SDValue FIPtr = DAG.CreateStackTemporary(VT: i32Stack ? MVT::i32 : MVT::f64);
8450 int FI = cast<FrameIndexSDNode>(Val&: FIPtr)->getIndex();
8451 MachinePointerInfo MPI =
8452 MachinePointerInfo::getFixedStack(MF&: DAG.getMachineFunction(), FI);
8453
8454 // Emit a store to the stack slot.
8455 SDValue Chain = IsStrict ? Tmp.getValue(R: 1) : DAG.getEntryNode();
8456 Align Alignment(DAG.getEVTAlign(MemoryVT: Tmp.getValueType()));
8457 if (i32Stack) {
8458 MachineFunction &MF = DAG.getMachineFunction();
8459 Alignment = Align(4);
8460 MachineMemOperand *MMO =
8461 MF.getMachineMemOperand(PtrInfo: MPI, F: MachineMemOperand::MOStore, Size: 4, BaseAlignment: Alignment);
8462 SDValue Ops[] = { Chain, Tmp, FIPtr };
8463 Chain = DAG.getMemIntrinsicNode(Opcode: PPCISD::STFIWX, dl,
8464 VTList: DAG.getVTList(VT: MVT::Other), Ops, MemVT: MVT::i32, MMO);
8465 } else
8466 Chain = DAG.getStore(Chain, dl, Val: Tmp, Ptr: FIPtr, PtrInfo: MPI, Alignment);
8467
8468 // Result is a load from the stack slot. If loading 4 bytes, make sure to
8469 // add in a bias on big endian.
8470 if (Op.getValueType() == MVT::i32 && !i32Stack &&
8471 !Subtarget.isLittleEndian()) {
8472 FIPtr = DAG.getNode(Opcode: ISD::ADD, DL: dl, VT: FIPtr.getValueType(), N1: FIPtr,
8473 N2: DAG.getConstant(Val: 4, DL: dl, VT: FIPtr.getValueType()));
8474 MPI = MPI.getWithOffset(O: 4);
8475 }
8476
8477 RLI.Chain = Chain;
8478 RLI.Ptr = FIPtr;
8479 RLI.MPI = MPI;
8480 RLI.Alignment = Alignment;
8481}
8482
8483/// Custom lowers floating point to integer conversions to use
8484/// the direct move instructions available in ISA 2.07 to avoid the
8485/// need for load/store combinations.
8486SDValue PPCTargetLowering::LowerFP_TO_INTDirectMove(SDValue Op,
8487 SelectionDAG &DAG,
8488 const SDLoc &dl) const {
8489 SDValue Conv = convertFPToInt(Op, DAG, Subtarget);
8490 SDValue Mov = DAG.getNode(Opcode: PPCISD::MFVSR, DL: dl, VT: Op.getValueType(), Operand: Conv);
8491 if (Op->isStrictFPOpcode())
8492 return DAG.getMergeValues(Ops: {Mov, Conv.getValue(R: 1)}, dl);
8493 else
8494 return Mov;
8495}
8496
// Custom lowering for FP_TO_SINT / FP_TO_UINT and their STRICT_ forms.
//
// Paths, in the order they are tried:
//  * f128 source: returns Op itself when the subtarget has P9 vector support,
//    otherwise an empty SDValue.
//  * ppcf128 source: expanded by hand for an i32 result; any other result
//    type yields an empty SDValue.
//  * direct move on 64-bit subtargets that have it (LowerFP_TO_INTDirectMove).
//  * otherwise, a store to a stack temporary (LowerFP_TO_INTForReuse) followed
//    by a load of the integer result.
8497SDValue PPCTargetLowering::LowerFP_TO_INT(SDValue Op, SelectionDAG &DAG,
8498 const SDLoc &dl) const {
8499 bool IsStrict = Op->isStrictFPOpcode();
8500 bool IsSigned = Op.getOpcode() == ISD::FP_TO_SINT ||
8501 Op.getOpcode() == ISD::STRICT_FP_TO_SINT;
// Strict nodes carry the chain as operand 0, so the source is operand 1.
8502 SDValue Src = Op.getOperand(i: IsStrict ? 1 : 0);
8503 EVT SrcVT = Src.getValueType();
8504 EVT DstVT = Op.getValueType();
8505
8506 // FP to INT conversions are legal for f128.
8507 if (SrcVT == MVT::f128)
8508 return Subtarget.hasP9Vector() ? Op : SDValue();
8509
8510 // Expand ppcf128 to i32 by hand for the benefit of llvm-gcc bootstrap on
8511 // PPC (the libcall is not available).
8512 if (SrcVT == MVT::ppcf128) {
8513 if (DstVT == MVT::i32) {
8514 // TODO: Conservatively pass only nofpexcept flag here. Need to check and
8515 // set other fast-math flags to FP operations in both strict and
8516 // non-strict cases. (FP_TO_SINT, FSUB)
8517 SDNodeFlags Flags;
8518 Flags.setNoFPExcept(Op->getFlags().hasNoFPExcept());
8519
8520 if (IsSigned) {
8521 SDValue Lo, Hi;
8522 std::tie(args&: Lo, args&: Hi) = DAG.SplitScalar(N: Src, DL: dl, LoVT: MVT::f64, HiVT: MVT::f64);
8523
8524 // Add the two halves of the long double in round-to-zero mode, and use
8525 // a smaller FP_TO_SINT.
8526 if (IsStrict) {
// The strict add is chained on the incoming chain, and the strict conversion
// is chained on the add.
8527 SDValue Res = DAG.getNode(Opcode: PPCISD::STRICT_FADDRTZ, DL: dl,
8528 VTList: DAG.getVTList(VT1: MVT::f64, VT2: MVT::Other),
8529 Ops: {Op.getOperand(i: 0), Lo, Hi}, Flags);
8530 return DAG.getNode(Opcode: ISD::STRICT_FP_TO_SINT, DL: dl,
8531 VTList: DAG.getVTList(VT1: MVT::i32, VT2: MVT::Other),
8532 Ops: {Res.getValue(R: 1), Res}, Flags);
8533 } else {
8534 SDValue Res = DAG.getNode(Opcode: PPCISD::FADDRTZ, DL: dl, VT: MVT::f64, N1: Lo, N2: Hi);
8535 return DAG.getNode(Opcode: ISD::FP_TO_SINT, DL: dl, VT: MVT::i32, Operand: Res);
8536 }
8537 } else {
// Unsigned case: built from signed conversions around the 2^31 threshold.
// Cst is that threshold as a ppcf128 constant (presumably 0x41e0000000000000
// is the f64 encoding of 2^31, going by the name TwoE31), and SignMask is the
// matching integer bit.
8538 const uint64_t TwoE31[] = {0x41e0000000000000LL, 0};
8539 APFloat APF = APFloat(APFloat::PPCDoubleDouble(), APInt(128, TwoE31));
8540 SDValue Cst = DAG.getConstantFP(Val: APF, DL: dl, VT: SrcVT);
8541 SDValue SignMask = DAG.getConstant(Val: 0x80000000, DL: dl, VT: DstVT);
8542 if (IsStrict) {
8543 // Sel = Src < 0x80000000
8544 // FltOfs = select Sel, 0.0, 0x80000000
8545 // IntOfs = select Sel, 0, 0x80000000
8546 // Result = fp_to_sint(Src - FltOfs) ^ IntOfs
8547 SDValue Chain = Op.getOperand(i: 0);
8548 EVT SetCCVT =
8549 getSetCCResultType(DL: DAG.getDataLayout(), C&: *DAG.getContext(), VT: SrcVT);
8550 EVT DstSetCCVT =
8551 getSetCCResultType(DL: DAG.getDataLayout(), C&: *DAG.getContext(), VT: DstVT);
// The compare is built with a chain and IsSignaling = true; its output chain
// is threaded through the following strict nodes.
8552 SDValue Sel = DAG.getSetCC(DL: dl, VT: SetCCVT, LHS: Src, RHS: Cst, Cond: ISD::SETLT,
8553 Chain, IsSignaling: true);
8554 Chain = Sel.getValue(R: 1);
8555
8556 SDValue FltOfs = DAG.getSelect(
8557 DL: dl, VT: SrcVT, Cond: Sel, LHS: DAG.getConstantFP(Val: 0.0, DL: dl, VT: SrcVT), RHS: Cst);
// Convert the condition to the setcc type of the destination so it can be
// reused for the integer select below.
8558 Sel = DAG.getBoolExtOrTrunc(Op: Sel, SL: dl, VT: DstSetCCVT, OpVT: DstVT);
8559
8560 SDValue Val = DAG.getNode(Opcode: ISD::STRICT_FSUB, DL: dl,
8561 VTList: DAG.getVTList(VT1: SrcVT, VT2: MVT::Other),
8562 Ops: {Chain, Src, FltOfs}, Flags);
8563 Chain = Val.getValue(R: 1);
8564 SDValue SInt = DAG.getNode(Opcode: ISD::STRICT_FP_TO_SINT, DL: dl,
8565 VTList: DAG.getVTList(VT1: DstVT, VT2: MVT::Other),
8566 Ops: {Chain, Val}, Flags);
8567 Chain = SInt.getValue(R: 1);
8568 SDValue IntOfs = DAG.getSelect(
8569 DL: dl, VT: DstVT, Cond: Sel, LHS: DAG.getConstant(Val: 0, DL: dl, VT: DstVT), RHS: SignMask);
8570 SDValue Result = DAG.getNode(Opcode: ISD::XOR, DL: dl, VT: DstVT, N1: SInt, N2: IntOfs);
// Strict lowering returns both the value and the final chain.
8571 return DAG.getMergeValues(Ops: {Result, Chain}, dl);
8572 } else {
8573 // X>=2^31 ? (int)(X-2^31)+0x80000000 : (int)X
8574 // FIXME: generated code sucks.
8575 SDValue True = DAG.getNode(Opcode: ISD::FSUB, DL: dl, VT: MVT::ppcf128, N1: Src, N2: Cst);
8576 True = DAG.getNode(Opcode: ISD::FP_TO_SINT, DL: dl, VT: MVT::i32, Operand: True);
8577 True = DAG.getNode(Opcode: ISD::ADD, DL: dl, VT: MVT::i32, N1: True, N2: SignMask);
8578 SDValue False = DAG.getNode(Opcode: ISD::FP_TO_SINT, DL: dl, VT: MVT::i32, Operand: Src);
8579 return DAG.getSelectCC(DL: dl, LHS: Src, RHS: Cst, True, False, Cond: ISD::SETGE);
8580 }
8581 }
8582 }
8583
// ppcf128 to anything other than i32 is not handled here.
8584 return SDValue();
8585 }
8586
8587 if (Subtarget.hasDirectMove() && Subtarget.isPPC64())
8588 return LowerFP_TO_INTDirectMove(Op, DAG, dl);
8589
// Fallback: the conversion result is stored to a stack slot described by RLI
// and then loaded back as the integer result.
8590 ReuseLoadInfo RLI;
8591 LowerFP_TO_INTForReuse(Op, RLI, DAG, dl);
8592
8593 return DAG.getLoad(VT: Op.getValueType(), dl, Chain: RLI.Chain, Ptr: RLI.Ptr, PtrInfo: RLI.MPI,
8594 Alignment: RLI.Alignment, MMOFlags: RLI.MMOFlags(), AAInfo: RLI.AAInfo, Ranges: RLI.Ranges);
8595}
8596
8597// We're trying to insert a regular store, S, and then a load, L. If the
8598// incoming value, O, is a load, we might just be able to have our load use the
8599// address used by O. However, we don't know if anything else will store to
8600// that address before we can load from it. To prevent this situation, we need
8601// to insert our load, L, into the chain as a peer of O. To do this, we give L
8602// the same chain operand as O, we create a token factor from the chain results
8603// of O and L, and we replace all uses of O's chain result with that token
8604// factor (this last part is handled by makeEquivalentMemoryOrdering).
8605bool PPCTargetLowering::canReuseLoadAddress(SDValue Op, EVT MemVT,
8606 ReuseLoadInfo &RLI,
8607 SelectionDAG &DAG,
8608 ISD::LoadExtType ET) const {
8609 // Conservatively skip reusing for constrained FP nodes.
8610 if (Op->isStrictFPOpcode())
8611 return false;
8612
8613 SDLoc dl(Op);
8614 bool ValidFPToUint = Op.getOpcode() == ISD::FP_TO_UINT &&
8615 (Subtarget.hasFPCVT() || Op.getValueType() == MVT::i32);
8616 if (ET == ISD::NON_EXTLOAD &&
8617 (ValidFPToUint || Op.getOpcode() == ISD::FP_TO_SINT) &&
8618 isOperationLegalOrCustom(Op: Op.getOpcode(),
8619 VT: Op.getOperand(i: 0).getValueType())) {
8620
8621 LowerFP_TO_INTForReuse(Op, RLI, DAG, dl);
8622 return true;
8623 }
8624
8625 LoadSDNode *LD = dyn_cast<LoadSDNode>(Val&: Op);
8626 if (!LD || LD->getExtensionType() != ET || LD->isVolatile() ||
8627 LD->isNonTemporal())
8628 return false;
8629 if (LD->getMemoryVT() != MemVT)
8630 return false;
8631
8632 // If the result of the load is an illegal type, then we can't build a
8633 // valid chain for reuse since the legalised loads and token factor node that
8634 // ties the legalised loads together uses a different output chain then the
8635 // illegal load.
8636 if (!isTypeLegal(VT: LD->getValueType(ResNo: 0)))
8637 return false;
8638
8639 RLI.Ptr = LD->getBasePtr();
8640 if (LD->isIndexed() && !LD->getOffset().isUndef()) {
8641 assert(LD->getAddressingMode() == ISD::PRE_INC &&
8642 "Non-pre-inc AM on PPC?");
8643 RLI.Ptr = DAG.getNode(Opcode: ISD::ADD, DL: dl, VT: RLI.Ptr.getValueType(), N1: RLI.Ptr,
8644 N2: LD->getOffset());
8645 }
8646
8647 RLI.Chain = LD->getChain();
8648 RLI.MPI = LD->getPointerInfo();
8649 RLI.IsDereferenceable = LD->isDereferenceable();
8650 RLI.IsInvariant = LD->isInvariant();
8651 RLI.Alignment = LD->getAlign();
8652 RLI.AAInfo = LD->getAAInfo();
8653 RLI.Ranges = LD->getRanges();
8654
8655 RLI.ResChain = SDValue(LD, LD->isIndexed() ? 2 : 1);
8656 return true;
8657}
8658
8659/// Analyze profitability of direct move
8660/// prefer float load to int load plus direct move
8661/// when there is no integer use of int load
8662bool PPCTargetLowering::directMoveIsProfitable(const SDValue &Op) const {
8663 SDNode *Origin = Op.getOperand(i: Op->isStrictFPOpcode() ? 1 : 0).getNode();
8664 if (Origin->getOpcode() != ISD::LOAD)
8665 return true;
8666
8667 // If there is no LXSIBZX/LXSIHZX, like Power8,
8668 // prefer direct move if the memory size is 1 or 2 bytes.
8669 MachineMemOperand *MMO = cast<LoadSDNode>(Val: Origin)->getMemOperand();
8670 if (!Subtarget.hasP9Vector() &&
8671 (!MMO->getSize().hasValue() || MMO->getSize().getValue() <= 2))
8672 return true;
8673
8674 for (SDUse &Use : Origin->uses()) {
8675
8676 // Only look at the users of the loaded value.
8677 if (Use.getResNo() != 0)
8678 continue;
8679
8680 SDNode *User = Use.getUser();
8681 if (User->getOpcode() != ISD::SINT_TO_FP &&
8682 User->getOpcode() != ISD::UINT_TO_FP &&
8683 User->getOpcode() != ISD::STRICT_SINT_TO_FP &&
8684 User->getOpcode() != ISD::STRICT_UINT_TO_FP)
8685 return true;
8686 }
8687
8688 return false;
8689}
8690
8691static SDValue convertIntToFP(SDValue Op, SDValue Src, SelectionDAG &DAG,
8692 const PPCSubtarget &Subtarget,
8693 SDValue Chain = SDValue()) {
8694 bool IsSigned = Op.getOpcode() == ISD::SINT_TO_FP ||
8695 Op.getOpcode() == ISD::STRICT_SINT_TO_FP;
8696 SDLoc dl(Op);
8697
8698 // TODO: Any other flags to propagate?
8699 SDNodeFlags Flags;
8700 Flags.setNoFPExcept(Op->getFlags().hasNoFPExcept());
8701
8702 // If we have FCFIDS, then use it when converting to single-precision.
8703 // Otherwise, convert to double-precision and then round.
8704 bool IsSingle = Op.getValueType() == MVT::f32 && Subtarget.hasFPCVT();
8705 unsigned ConvOpc = IsSingle ? (IsSigned ? PPCISD::FCFIDS : PPCISD::FCFIDUS)
8706 : (IsSigned ? PPCISD::FCFID : PPCISD::FCFIDU);
8707 EVT ConvTy = IsSingle ? MVT::f32 : MVT::f64;
8708 if (Op->isStrictFPOpcode()) {
8709 if (!Chain)
8710 Chain = Op.getOperand(i: 0);
8711 return DAG.getNode(Opcode: getPPCStrictOpcode(Opc: ConvOpc), DL: dl,
8712 VTList: DAG.getVTList(VT1: ConvTy, VT2: MVT::Other), Ops: {Chain, Src}, Flags);
8713 } else
8714 return DAG.getNode(Opcode: ConvOpc, DL: dl, VT: ConvTy, Operand: Src);
8715}
8716
8717/// Custom lowers integer to floating point conversions to use
8718/// the direct move instructions available in ISA 2.07 to avoid the
8719/// need for load/store combinations.
8720SDValue PPCTargetLowering::LowerINT_TO_FPDirectMove(SDValue Op,
8721 SelectionDAG &DAG,
8722 const SDLoc &dl) const {
8723 assert((Op.getValueType() == MVT::f32 ||
8724 Op.getValueType() == MVT::f64) &&
8725 "Invalid floating point type as target of conversion");
8726 assert(Subtarget.hasFPCVT() &&
8727 "Int to FP conversions with direct moves require FPCVT");
8728 SDValue Src = Op.getOperand(i: Op->isStrictFPOpcode() ? 1 : 0);
8729 bool WordInt = Src.getSimpleValueType().SimpleTy == MVT::i32;
8730 bool Signed = Op.getOpcode() == ISD::SINT_TO_FP ||
8731 Op.getOpcode() == ISD::STRICT_SINT_TO_FP;
8732 unsigned MovOpc = (WordInt && !Signed) ? PPCISD::MTVSRZ : PPCISD::MTVSRA;
8733 SDValue Mov = DAG.getNode(Opcode: MovOpc, DL: dl, VT: MVT::f64, Operand: Src);
8734 return convertIntToFP(Op, Src: Mov, DAG, Subtarget);
8735}
8736
8737static SDValue widenVec(SelectionDAG &DAG, SDValue Vec, const SDLoc &dl) {
8738
8739 EVT VecVT = Vec.getValueType();
8740 assert(VecVT.isVector() && "Expected a vector type.");
8741 assert(VecVT.getSizeInBits() < 128 && "Vector is already full width.");
8742
8743 EVT EltVT = VecVT.getVectorElementType();
8744 unsigned WideNumElts = 128 / EltVT.getSizeInBits();
8745 EVT WideVT = EVT::getVectorVT(Context&: *DAG.getContext(), VT: EltVT, NumElements: WideNumElts);
8746
8747 unsigned NumConcat = WideNumElts / VecVT.getVectorNumElements();
8748 SmallVector<SDValue, 16> Ops(NumConcat);
8749 Ops[0] = Vec;
8750 SDValue UndefVec = DAG.getUNDEF(VT: VecVT);
8751 for (unsigned i = 1; i < NumConcat; ++i)
8752 Ops[i] = UndefVec;
8753
8754 return DAG.getNode(Opcode: ISD::CONCAT_VECTORS, DL: dl, VT: WideVT, Ops);
8755}
8756
8757SDValue PPCTargetLowering::LowerINT_TO_FPVector(SDValue Op, SelectionDAG &DAG,
8758 const SDLoc &dl) const {
8759 bool IsStrict = Op->isStrictFPOpcode();
8760 unsigned Opc = Op.getOpcode();
8761 SDValue Src = Op.getOperand(i: IsStrict ? 1 : 0);
8762 assert((Opc == ISD::UINT_TO_FP || Opc == ISD::SINT_TO_FP ||
8763 Opc == ISD::STRICT_UINT_TO_FP || Opc == ISD::STRICT_SINT_TO_FP) &&
8764 "Unexpected conversion type");
8765 assert((Op.getValueType() == MVT::v2f64 || Op.getValueType() == MVT::v4f32) &&
8766 "Supports conversions to v2f64/v4f32 only.");
8767
8768 // TODO: Any other flags to propagate?
8769 SDNodeFlags Flags;
8770 Flags.setNoFPExcept(Op->getFlags().hasNoFPExcept());
8771
8772 bool SignedConv = Opc == ISD::SINT_TO_FP || Opc == ISD::STRICT_SINT_TO_FP;
8773 bool FourEltRes = Op.getValueType() == MVT::v4f32;
8774
8775 SDValue Wide = widenVec(DAG, Vec: Src, dl);
8776 EVT WideVT = Wide.getValueType();
8777 unsigned WideNumElts = WideVT.getVectorNumElements();
8778 MVT IntermediateVT = FourEltRes ? MVT::v4i32 : MVT::v2i64;
8779
8780 SmallVector<int, 16> ShuffV;
8781 for (unsigned i = 0; i < WideNumElts; ++i)
8782 ShuffV.push_back(Elt: i + WideNumElts);
8783
8784 int Stride = FourEltRes ? WideNumElts / 4 : WideNumElts / 2;
8785 int SaveElts = FourEltRes ? 4 : 2;
8786 if (Subtarget.isLittleEndian())
8787 for (int i = 0; i < SaveElts; i++)
8788 ShuffV[i * Stride] = i;
8789 else
8790 for (int i = 1; i <= SaveElts; i++)
8791 ShuffV[i * Stride - 1] = i - 1;
8792
8793 SDValue ShuffleSrc2 =
8794 SignedConv ? DAG.getUNDEF(VT: WideVT) : DAG.getConstant(Val: 0, DL: dl, VT: WideVT);
8795 SDValue Arrange = DAG.getVectorShuffle(VT: WideVT, dl, N1: Wide, N2: ShuffleSrc2, Mask: ShuffV);
8796
8797 SDValue Extend;
8798 if (SignedConv) {
8799 Arrange = DAG.getBitcast(VT: IntermediateVT, V: Arrange);
8800 EVT ExtVT = Src.getValueType();
8801 if (Subtarget.hasP9Altivec())
8802 ExtVT = EVT::getVectorVT(Context&: *DAG.getContext(), VT: WideVT.getVectorElementType(),
8803 NumElements: IntermediateVT.getVectorNumElements());
8804
8805 Extend = DAG.getNode(Opcode: ISD::SIGN_EXTEND_INREG, DL: dl, VT: IntermediateVT, N1: Arrange,
8806 N2: DAG.getValueType(ExtVT));
8807 } else
8808 Extend = DAG.getNode(Opcode: ISD::BITCAST, DL: dl, VT: IntermediateVT, Operand: Arrange);
8809
8810 if (IsStrict)
8811 return DAG.getNode(Opcode: Opc, DL: dl, VTList: DAG.getVTList(VT1: Op.getValueType(), VT2: MVT::Other),
8812 Ops: {Op.getOperand(i: 0), Extend}, Flags);
8813
8814 return DAG.getNode(Opcode: Opc, DL: dl, VT: Op.getValueType(), Operand: Extend);
8815}
8816
8817SDValue PPCTargetLowering::LowerINT_TO_FP(SDValue Op,
8818 SelectionDAG &DAG) const {
8819 SDLoc dl(Op);
8820 bool IsSigned = Op.getOpcode() == ISD::SINT_TO_FP ||
8821 Op.getOpcode() == ISD::STRICT_SINT_TO_FP;
8822 bool IsStrict = Op->isStrictFPOpcode();
8823 SDValue Src = Op.getOperand(i: IsStrict ? 1 : 0);
8824 SDValue Chain = IsStrict ? Op.getOperand(i: 0) : DAG.getEntryNode();
8825
8826 // TODO: Any other flags to propagate?
8827 SDNodeFlags Flags;
8828 Flags.setNoFPExcept(Op->getFlags().hasNoFPExcept());
8829
8830 EVT InVT = Src.getValueType();
8831 EVT OutVT = Op.getValueType();
8832 if (OutVT.isVector() && OutVT.isFloatingPoint() &&
8833 isOperationCustom(Op: Op.getOpcode(), VT: InVT))
8834 return LowerINT_TO_FPVector(Op, DAG, dl);
8835
8836 // Conversions to f128 are legal.
8837 if (Op.getValueType() == MVT::f128)
8838 return Subtarget.hasP9Vector() ? Op : SDValue();
8839
8840 // Don't handle ppc_fp128 here; let it be lowered to a libcall.
8841 if (Op.getValueType() != MVT::f32 && Op.getValueType() != MVT::f64)
8842 return SDValue();
8843
8844 if (Src.getValueType() == MVT::i1) {
8845 SDValue Sel = DAG.getNode(Opcode: ISD::SELECT, DL: dl, VT: Op.getValueType(), N1: Src,
8846 N2: DAG.getConstantFP(Val: 1.0, DL: dl, VT: Op.getValueType()),
8847 N3: DAG.getConstantFP(Val: 0.0, DL: dl, VT: Op.getValueType()));
8848 if (IsStrict)
8849 return DAG.getMergeValues(Ops: {Sel, Chain}, dl);
8850 else
8851 return Sel;
8852 }
8853
8854 // If we have direct moves, we can do all the conversion, skip the store/load
8855 // however, without FPCVT we can't do most conversions.
8856 if (Subtarget.hasDirectMove() && directMoveIsProfitable(Op) &&
8857 Subtarget.isPPC64() && Subtarget.hasFPCVT())
8858 return LowerINT_TO_FPDirectMove(Op, DAG, dl);
8859
8860 assert((IsSigned || Subtarget.hasFPCVT()) &&
8861 "UINT_TO_FP is supported only with FPCVT");
8862
8863 if (Src.getValueType() == MVT::i64) {
8864 SDValue SINT = Src;
8865 // When converting to single-precision, we actually need to convert
8866 // to double-precision first and then round to single-precision.
8867 // To avoid double-rounding effects during that operation, we have
8868 // to prepare the input operand. Bits that might be truncated when
8869 // converting to double-precision are replaced by a bit that won't
8870 // be lost at this stage, but is below the single-precision rounding
8871 // position.
8872 //
8873 // However, if afn is in effect, accept double
8874 // rounding to avoid the extra overhead.
8875 // FIXME: Currently INT_TO_FP can't support fast math flags because
8876 // of nneg flag, thus Op->getFlags().hasApproximateFuncs() is always
8877 // false.
8878 if (Op.getValueType() == MVT::f32 && !Subtarget.hasFPCVT() &&
8879 !Op->getFlags().hasApproximateFuncs()) {
8880
8881 // Twiddle input to make sure the low 11 bits are zero. (If this
8882 // is the case, we are guaranteed the value will fit into the 53 bit
8883 // mantissa of an IEEE double-precision value without rounding.)
8884 // If any of those low 11 bits were not zero originally, make sure
8885 // bit 12 (value 2048) is set instead, so that the final rounding
8886 // to single-precision gets the correct result.
8887 SDValue Round = DAG.getNode(Opcode: ISD::AND, DL: dl, VT: MVT::i64,
8888 N1: SINT, N2: DAG.getConstant(Val: 2047, DL: dl, VT: MVT::i64));
8889 Round = DAG.getNode(Opcode: ISD::ADD, DL: dl, VT: MVT::i64,
8890 N1: Round, N2: DAG.getConstant(Val: 2047, DL: dl, VT: MVT::i64));
8891 Round = DAG.getNode(Opcode: ISD::OR, DL: dl, VT: MVT::i64, N1: Round, N2: SINT);
8892 Round = DAG.getNode(Opcode: ISD::AND, DL: dl, VT: MVT::i64, N1: Round,
8893 N2: DAG.getSignedConstant(Val: -2048, DL: dl, VT: MVT::i64));
8894
8895 // However, we cannot use that value unconditionally: if the magnitude
8896 // of the input value is small, the bit-twiddling we did above might
8897 // end up visibly changing the output. Fortunately, in that case, we
8898 // don't need to twiddle bits since the original input will convert
8899 // exactly to double-precision floating-point already. Therefore,
8900 // construct a conditional to use the original value if the top 11
8901 // bits are all sign-bit copies, and use the rounded value computed
8902 // above otherwise.
8903 SDValue Cond = DAG.getNode(Opcode: ISD::SRA, DL: dl, VT: MVT::i64,
8904 N1: SINT, N2: DAG.getConstant(Val: 53, DL: dl, VT: MVT::i32));
8905 Cond = DAG.getNode(Opcode: ISD::ADD, DL: dl, VT: MVT::i64,
8906 N1: Cond, N2: DAG.getConstant(Val: 1, DL: dl, VT: MVT::i64));
8907 Cond = DAG.getSetCC(
8908 DL: dl,
8909 VT: getSetCCResultType(DL: DAG.getDataLayout(), C&: *DAG.getContext(), VT: MVT::i64),
8910 LHS: Cond, RHS: DAG.getConstant(Val: 1, DL: dl, VT: MVT::i64), Cond: ISD::SETUGT);
8911
8912 SINT = DAG.getNode(Opcode: ISD::SELECT, DL: dl, VT: MVT::i64, N1: Cond, N2: Round, N3: SINT);
8913 }
8914
8915 ReuseLoadInfo RLI;
8916 SDValue Bits;
8917
8918 MachineFunction &MF = DAG.getMachineFunction();
8919 if (canReuseLoadAddress(Op: SINT, MemVT: MVT::i64, RLI, DAG)) {
8920 // Drop range metadata, as this metadata becomes invalid for f64 bit
8921 // reinterpretation of i64 values.
8922 Bits = DAG.getLoad(VT: MVT::f64, dl, Chain: RLI.Chain, Ptr: RLI.Ptr, PtrInfo: RLI.MPI,
8923 Alignment: RLI.Alignment, MMOFlags: RLI.MMOFlags(), AAInfo: RLI.AAInfo, Ranges: nullptr);
8924 if (RLI.ResChain)
8925 DAG.makeEquivalentMemoryOrdering(OldChain: RLI.ResChain, NewMemOpChain: Bits.getValue(R: 1));
8926 } else if (Subtarget.hasLFIWAX() &&
8927 canReuseLoadAddress(Op: SINT, MemVT: MVT::i32, RLI, DAG, ET: ISD::SEXTLOAD)) {
8928 MachineMemOperand *MMO =
8929 MF.getMachineMemOperand(PtrInfo: RLI.MPI, F: MachineMemOperand::MOLoad, Size: 4,
8930 BaseAlignment: RLI.Alignment, AAInfo: RLI.AAInfo, Ranges: RLI.Ranges);
8931 SDValue Ops[] = { RLI.Chain, RLI.Ptr };
8932 Bits = DAG.getMemIntrinsicNode(Opcode: PPCISD::LFIWAX, dl,
8933 VTList: DAG.getVTList(VT1: MVT::f64, VT2: MVT::Other),
8934 Ops, MemVT: MVT::i32, MMO);
8935 if (RLI.ResChain)
8936 DAG.makeEquivalentMemoryOrdering(OldChain: RLI.ResChain, NewMemOpChain: Bits.getValue(R: 1));
8937 } else if (Subtarget.hasFPCVT() &&
8938 canReuseLoadAddress(Op: SINT, MemVT: MVT::i32, RLI, DAG, ET: ISD::ZEXTLOAD)) {
8939 MachineMemOperand *MMO =
8940 MF.getMachineMemOperand(PtrInfo: RLI.MPI, F: MachineMemOperand::MOLoad, Size: 4,
8941 BaseAlignment: RLI.Alignment, AAInfo: RLI.AAInfo, Ranges: RLI.Ranges);
8942 SDValue Ops[] = { RLI.Chain, RLI.Ptr };
8943 Bits = DAG.getMemIntrinsicNode(Opcode: PPCISD::LFIWZX, dl,
8944 VTList: DAG.getVTList(VT1: MVT::f64, VT2: MVT::Other),
8945 Ops, MemVT: MVT::i32, MMO);
8946 if (RLI.ResChain)
8947 DAG.makeEquivalentMemoryOrdering(OldChain: RLI.ResChain, NewMemOpChain: Bits.getValue(R: 1));
8948 } else if (((Subtarget.hasLFIWAX() &&
8949 SINT.getOpcode() == ISD::SIGN_EXTEND) ||
8950 (Subtarget.hasFPCVT() &&
8951 SINT.getOpcode() == ISD::ZERO_EXTEND)) &&
8952 SINT.getOperand(i: 0).getValueType() == MVT::i32) {
8953 MachineFrameInfo &MFI = MF.getFrameInfo();
8954 EVT PtrVT = getPointerTy(DL: DAG.getDataLayout());
8955
8956 int FrameIdx = MFI.CreateStackObject(Size: 4, Alignment: Align(4), isSpillSlot: false);
8957 SDValue FIdx = DAG.getFrameIndex(FI: FrameIdx, VT: PtrVT);
8958
8959 SDValue Store = DAG.getStore(Chain, dl, Val: SINT.getOperand(i: 0), Ptr: FIdx,
8960 PtrInfo: MachinePointerInfo::getFixedStack(
8961 MF&: DAG.getMachineFunction(), FI: FrameIdx));
8962 Chain = Store;
8963
8964 assert(cast<StoreSDNode>(Store)->getMemoryVT() == MVT::i32 &&
8965 "Expected an i32 store");
8966
8967 RLI.Ptr = FIdx;
8968 RLI.Chain = Chain;
8969 RLI.MPI =
8970 MachinePointerInfo::getFixedStack(MF&: DAG.getMachineFunction(), FI: FrameIdx);
8971 RLI.Alignment = Align(4);
8972
8973 MachineMemOperand *MMO =
8974 MF.getMachineMemOperand(PtrInfo: RLI.MPI, F: MachineMemOperand::MOLoad, Size: 4,
8975 BaseAlignment: RLI.Alignment, AAInfo: RLI.AAInfo, Ranges: RLI.Ranges);
8976 SDValue Ops[] = { RLI.Chain, RLI.Ptr };
8977 Bits = DAG.getMemIntrinsicNode(Opcode: SINT.getOpcode() == ISD::ZERO_EXTEND ?
8978 PPCISD::LFIWZX : PPCISD::LFIWAX,
8979 dl, VTList: DAG.getVTList(VT1: MVT::f64, VT2: MVT::Other),
8980 Ops, MemVT: MVT::i32, MMO);
8981 Chain = Bits.getValue(R: 1);
8982 } else
8983 Bits = DAG.getNode(Opcode: ISD::BITCAST, DL: dl, VT: MVT::f64, Operand: SINT);
8984
8985 SDValue FP = convertIntToFP(Op, Src: Bits, DAG, Subtarget, Chain);
8986 if (IsStrict)
8987 Chain = FP.getValue(R: 1);
8988
8989 if (Op.getValueType() == MVT::f32 && !Subtarget.hasFPCVT()) {
8990 if (IsStrict)
8991 FP = DAG.getNode(
8992 Opcode: ISD::STRICT_FP_ROUND, DL: dl, VTList: DAG.getVTList(VT1: MVT::f32, VT2: MVT::Other),
8993 Ops: {Chain, FP, DAG.getIntPtrConstant(Val: 0, DL: dl, /*isTarget=*/true)},
8994 Flags);
8995 else
8996 FP = DAG.getNode(Opcode: ISD::FP_ROUND, DL: dl, VT: MVT::f32, N1: FP,
8997 N2: DAG.getIntPtrConstant(Val: 0, DL: dl, /*isTarget=*/true));
8998 }
8999 return FP;
9000 }
9001
9002 assert(Src.getValueType() == MVT::i32 &&
9003 "Unhandled INT_TO_FP type in custom expander!");
9004 // Since we only generate this in 64-bit mode, we can take advantage of
9005 // 64-bit registers. In particular, sign extend the input value into the
9006 // 64-bit register with extsw, store the WHOLE 64-bit value into the stack
9007 // then lfd it and fcfid it.
9008 MachineFunction &MF = DAG.getMachineFunction();
9009 MachineFrameInfo &MFI = MF.getFrameInfo();
9010 EVT PtrVT = getPointerTy(DL: MF.getDataLayout());
9011
9012 SDValue Ld;
9013 if (Subtarget.hasLFIWAX() || Subtarget.hasFPCVT()) {
9014 ReuseLoadInfo RLI;
9015 bool ReusingLoad;
9016 if (!(ReusingLoad = canReuseLoadAddress(Op: Src, MemVT: MVT::i32, RLI, DAG))) {
9017 int FrameIdx = MFI.CreateStackObject(Size: 4, Alignment: Align(4), isSpillSlot: false);
9018 SDValue FIdx = DAG.getFrameIndex(FI: FrameIdx, VT: PtrVT);
9019
9020 SDValue Store = DAG.getStore(Chain, dl, Val: Src, Ptr: FIdx,
9021 PtrInfo: MachinePointerInfo::getFixedStack(
9022 MF&: DAG.getMachineFunction(), FI: FrameIdx));
9023 Chain = Store;
9024
9025 assert(cast<StoreSDNode>(Store)->getMemoryVT() == MVT::i32 &&
9026 "Expected an i32 store");
9027
9028 RLI.Ptr = FIdx;
9029 RLI.Chain = Chain;
9030 RLI.MPI =
9031 MachinePointerInfo::getFixedStack(MF&: DAG.getMachineFunction(), FI: FrameIdx);
9032 RLI.Alignment = Align(4);
9033 }
9034
9035 MachineMemOperand *MMO =
9036 MF.getMachineMemOperand(PtrInfo: RLI.MPI, F: MachineMemOperand::MOLoad, Size: 4,
9037 BaseAlignment: RLI.Alignment, AAInfo: RLI.AAInfo, Ranges: RLI.Ranges);
9038 SDValue Ops[] = { RLI.Chain, RLI.Ptr };
9039 Ld = DAG.getMemIntrinsicNode(Opcode: IsSigned ? PPCISD::LFIWAX : PPCISD::LFIWZX, dl,
9040 VTList: DAG.getVTList(VT1: MVT::f64, VT2: MVT::Other), Ops,
9041 MemVT: MVT::i32, MMO);
9042 Chain = Ld.getValue(R: 1);
9043 if (ReusingLoad && RLI.ResChain) {
9044 DAG.makeEquivalentMemoryOrdering(OldChain: RLI.ResChain, NewMemOpChain: Ld.getValue(R: 1));
9045 }
9046 } else {
9047 assert(Subtarget.isPPC64() &&
9048 "i32->FP without LFIWAX supported only on PPC64");
9049
9050 int FrameIdx = MFI.CreateStackObject(Size: 8, Alignment: Align(8), isSpillSlot: false);
9051 SDValue FIdx = DAG.getFrameIndex(FI: FrameIdx, VT: PtrVT);
9052
9053 SDValue Ext64 = DAG.getNode(Opcode: ISD::SIGN_EXTEND, DL: dl, VT: MVT::i64, Operand: Src);
9054
9055 // STD the extended value into the stack slot.
9056 SDValue Store = DAG.getStore(
9057 Chain, dl, Val: Ext64, Ptr: FIdx,
9058 PtrInfo: MachinePointerInfo::getFixedStack(MF&: DAG.getMachineFunction(), FI: FrameIdx));
9059 Chain = Store;
9060
9061 // Load the value as a double.
9062 Ld = DAG.getLoad(
9063 VT: MVT::f64, dl, Chain, Ptr: FIdx,
9064 PtrInfo: MachinePointerInfo::getFixedStack(MF&: DAG.getMachineFunction(), FI: FrameIdx));
9065 Chain = Ld.getValue(R: 1);
9066 }
9067
9068 // FCFID it and return it.
9069 SDValue FP = convertIntToFP(Op, Src: Ld, DAG, Subtarget, Chain);
9070 if (IsStrict)
9071 Chain = FP.getValue(R: 1);
9072 if (Op.getValueType() == MVT::f32 && !Subtarget.hasFPCVT()) {
9073 if (IsStrict)
9074 FP = DAG.getNode(
9075 Opcode: ISD::STRICT_FP_ROUND, DL: dl, VTList: DAG.getVTList(VT1: MVT::f32, VT2: MVT::Other),
9076 Ops: {Chain, FP, DAG.getIntPtrConstant(Val: 0, DL: dl, /*isTarget=*/true)}, Flags);
9077 else
9078 FP = DAG.getNode(Opcode: ISD::FP_ROUND, DL: dl, VT: MVT::f32, N1: FP,
9079 N2: DAG.getIntPtrConstant(Val: 0, DL: dl, /*isTarget=*/true));
9080 }
9081 return FP;
9082}
9083
9084SDValue PPCTargetLowering::LowerSET_ROUNDING(SDValue Op,
9085 SelectionDAG &DAG) const {
9086 SDLoc Dl(Op);
9087 MachineFunction &MF = DAG.getMachineFunction();
9088 EVT PtrVT = getPointerTy(DL: MF.getDataLayout());
9089 SDValue Chain = Op.getOperand(i: 0);
9090
9091 // If requested mode is constant, just use simpler mtfsb/mffscrni
9092 if (auto *CVal = dyn_cast<ConstantSDNode>(Val: Op.getOperand(i: 1))) {
9093 uint64_t Mode = CVal->getZExtValue();
9094 assert(Mode < 4 && "Unsupported rounding mode!");
9095 unsigned InternalRnd = Mode ^ (~(Mode >> 1) & 1);
9096 if (Subtarget.isISA3_0())
9097 return SDValue(
9098 DAG.getMachineNode(
9099 Opcode: PPC::MFFSCRNI, dl: Dl, ResultTys: {MVT::f64, MVT::Other},
9100 Ops: {DAG.getConstant(Val: InternalRnd, DL: Dl, VT: MVT::i32, isTarget: true), Chain}),
9101 1);
9102 SDNode *SetHi = DAG.getMachineNode(
9103 Opcode: (InternalRnd & 2) ? PPC::MTFSB1 : PPC::MTFSB0, dl: Dl, VT: MVT::Other,
9104 Ops: {DAG.getConstant(Val: 30, DL: Dl, VT: MVT::i32, isTarget: true), Chain});
9105 SDNode *SetLo = DAG.getMachineNode(
9106 Opcode: (InternalRnd & 1) ? PPC::MTFSB1 : PPC::MTFSB0, dl: Dl, VT: MVT::Other,
9107 Ops: {DAG.getConstant(Val: 31, DL: Dl, VT: MVT::i32, isTarget: true), SDValue(SetHi, 0)});
9108 return SDValue(SetLo, 0);
9109 }
9110
9111 // Use x ^ (~(x >> 1) & 1) to transform LLVM rounding mode to Power format.
9112 SDValue One = DAG.getConstant(Val: 1, DL: Dl, VT: MVT::i32);
9113 SDValue SrcFlag = DAG.getNode(Opcode: ISD::AND, DL: Dl, VT: MVT::i32, N1: Op.getOperand(i: 1),
9114 N2: DAG.getConstant(Val: 3, DL: Dl, VT: MVT::i32));
9115 SDValue DstFlag = DAG.getNode(
9116 Opcode: ISD::XOR, DL: Dl, VT: MVT::i32, N1: SrcFlag,
9117 N2: DAG.getNode(Opcode: ISD::AND, DL: Dl, VT: MVT::i32,
9118 N1: DAG.getNOT(DL: Dl,
9119 Val: DAG.getNode(Opcode: ISD::SRL, DL: Dl, VT: MVT::i32, N1: SrcFlag, N2: One),
9120 VT: MVT::i32),
9121 N2: One));
9122 // For Power9, there's faster mffscrn, and we don't need to read FPSCR
9123 SDValue MFFS;
9124 if (!Subtarget.isISA3_0()) {
9125 MFFS = DAG.getNode(Opcode: PPCISD::MFFS, DL: Dl, ResultTys: {MVT::f64, MVT::Other}, Ops: Chain);
9126 Chain = MFFS.getValue(R: 1);
9127 }
9128 SDValue NewFPSCR;
9129 if (Subtarget.isPPC64()) {
9130 if (Subtarget.isISA3_0()) {
9131 NewFPSCR = DAG.getAnyExtOrTrunc(Op: DstFlag, DL: Dl, VT: MVT::i64);
9132 } else {
9133 // Set the last two bits (rounding mode) of bitcasted FPSCR.
9134 SDNode *InsertRN = DAG.getMachineNode(
9135 Opcode: PPC::RLDIMI, dl: Dl, VT: MVT::i64,
9136 Ops: {DAG.getNode(Opcode: ISD::BITCAST, DL: Dl, VT: MVT::i64, Operand: MFFS),
9137 DAG.getNode(Opcode: ISD::ZERO_EXTEND, DL: Dl, VT: MVT::i64, Operand: DstFlag),
9138 DAG.getTargetConstant(Val: 0, DL: Dl, VT: MVT::i32),
9139 DAG.getTargetConstant(Val: 62, DL: Dl, VT: MVT::i32)});
9140 NewFPSCR = SDValue(InsertRN, 0);
9141 }
9142 NewFPSCR = DAG.getNode(Opcode: ISD::BITCAST, DL: Dl, VT: MVT::f64, Operand: NewFPSCR);
9143 } else {
9144 // In 32-bit mode, store f64, load and update the lower half.
9145 int SSFI = MF.getFrameInfo().CreateStackObject(Size: 8, Alignment: Align(8), isSpillSlot: false);
9146 SDValue StackSlot = DAG.getFrameIndex(FI: SSFI, VT: PtrVT);
9147 SDValue Addr = Subtarget.isLittleEndian()
9148 ? StackSlot
9149 : DAG.getNode(Opcode: ISD::ADD, DL: Dl, VT: PtrVT, N1: StackSlot,
9150 N2: DAG.getConstant(Val: 4, DL: Dl, VT: PtrVT));
9151 if (Subtarget.isISA3_0()) {
9152 Chain = DAG.getStore(Chain, dl: Dl, Val: DstFlag, Ptr: Addr, PtrInfo: MachinePointerInfo());
9153 } else {
9154 Chain = DAG.getStore(Chain, dl: Dl, Val: MFFS, Ptr: StackSlot, PtrInfo: MachinePointerInfo());
9155 SDValue Tmp =
9156 DAG.getLoad(VT: MVT::i32, dl: Dl, Chain, Ptr: Addr, PtrInfo: MachinePointerInfo());
9157 Chain = Tmp.getValue(R: 1);
9158 Tmp = SDValue(DAG.getMachineNode(
9159 Opcode: PPC::RLWIMI, dl: Dl, VT: MVT::i32,
9160 Ops: {Tmp, DstFlag, DAG.getTargetConstant(Val: 0, DL: Dl, VT: MVT::i32),
9161 DAG.getTargetConstant(Val: 30, DL: Dl, VT: MVT::i32),
9162 DAG.getTargetConstant(Val: 31, DL: Dl, VT: MVT::i32)}),
9163 0);
9164 Chain = DAG.getStore(Chain, dl: Dl, Val: Tmp, Ptr: Addr, PtrInfo: MachinePointerInfo());
9165 }
9166 NewFPSCR =
9167 DAG.getLoad(VT: MVT::f64, dl: Dl, Chain, Ptr: StackSlot, PtrInfo: MachinePointerInfo());
9168 Chain = NewFPSCR.getValue(R: 1);
9169 }
9170 if (Subtarget.isISA3_0())
9171 return SDValue(DAG.getMachineNode(Opcode: PPC::MFFSCRN, dl: Dl, ResultTys: {MVT::f64, MVT::Other},
9172 Ops: {NewFPSCR, Chain}),
9173 1);
9174 SDValue Zero = DAG.getConstant(Val: 0, DL: Dl, VT: MVT::i32, isTarget: true);
9175 SDNode *MTFSF = DAG.getMachineNode(
9176 Opcode: PPC::MTFSF, dl: Dl, VT: MVT::Other,
9177 Ops: {DAG.getConstant(Val: 255, DL: Dl, VT: MVT::i32, isTarget: true), NewFPSCR, Zero, Zero, Chain});
9178 return SDValue(MTFSF, 0);
9179}
9180
9181SDValue PPCTargetLowering::LowerGET_ROUNDING(SDValue Op,
9182 SelectionDAG &DAG) const {
9183 SDLoc dl(Op);
9184 /*
9185 The rounding mode is in bits 30:31 of FPSR, and has the following
9186 settings:
9187 00 Round to nearest
9188 01 Round to 0
9189 10 Round to +inf
9190 11 Round to -inf
9191
9192 GET_ROUNDING, on the other hand, expects the following:
9193 -1 Undefined
9194 0 Round to 0
9195 1 Round to nearest
9196 2 Round to +inf
9197 3 Round to -inf
9198
9199 To perform the conversion, we do:
9200 ((FPSCR & 0x3) ^ ((~FPSCR & 0x3) >> 1))
9201 */
9202
9203 MachineFunction &MF = DAG.getMachineFunction();
9204 EVT VT = Op.getValueType();
9205 EVT PtrVT = getPointerTy(DL: MF.getDataLayout());
9206
9207 // Save FP Control Word to register
9208 SDValue Chain = Op.getOperand(i: 0);
9209 SDValue MFFS = DAG.getNode(Opcode: PPCISD::MFFS, DL: dl, ResultTys: {MVT::f64, MVT::Other}, Ops: Chain);
9210 Chain = MFFS.getValue(R: 1);
9211
9212 SDValue CWD;
9213 if (isTypeLegal(VT: MVT::i64)) {
9214 CWD = DAG.getNode(Opcode: ISD::TRUNCATE, DL: dl, VT: MVT::i32,
9215 Operand: DAG.getNode(Opcode: ISD::BITCAST, DL: dl, VT: MVT::i64, Operand: MFFS));
9216 } else {
9217 // Save FP register to stack slot
9218 int SSFI = MF.getFrameInfo().CreateStackObject(Size: 8, Alignment: Align(8), isSpillSlot: false);
9219 SDValue StackSlot = DAG.getFrameIndex(FI: SSFI, VT: PtrVT);
9220 Chain = DAG.getStore(Chain, dl, Val: MFFS, Ptr: StackSlot, PtrInfo: MachinePointerInfo());
9221
9222 // Load FP Control Word from low 32 bits of stack slot.
9223 assert(hasBigEndianPartOrdering(MVT::i64, MF.getDataLayout()) &&
9224 "Stack slot adjustment is valid only on big endian subtargets!");
9225 SDValue Four = DAG.getConstant(Val: 4, DL: dl, VT: PtrVT);
9226 SDValue Addr = DAG.getNode(Opcode: ISD::ADD, DL: dl, VT: PtrVT, N1: StackSlot, N2: Four);
9227 CWD = DAG.getLoad(VT: MVT::i32, dl, Chain, Ptr: Addr, PtrInfo: MachinePointerInfo());
9228 Chain = CWD.getValue(R: 1);
9229 }
9230
9231 // Transform as necessary
9232 SDValue CWD1 =
9233 DAG.getNode(Opcode: ISD::AND, DL: dl, VT: MVT::i32,
9234 N1: CWD, N2: DAG.getConstant(Val: 3, DL: dl, VT: MVT::i32));
9235 SDValue CWD2 =
9236 DAG.getNode(Opcode: ISD::SRL, DL: dl, VT: MVT::i32,
9237 N1: DAG.getNode(Opcode: ISD::AND, DL: dl, VT: MVT::i32,
9238 N1: DAG.getNode(Opcode: ISD::XOR, DL: dl, VT: MVT::i32,
9239 N1: CWD, N2: DAG.getConstant(Val: 3, DL: dl, VT: MVT::i32)),
9240 N2: DAG.getConstant(Val: 3, DL: dl, VT: MVT::i32)),
9241 N2: DAG.getConstant(Val: 1, DL: dl, VT: MVT::i32));
9242
9243 SDValue RetVal =
9244 DAG.getNode(Opcode: ISD::XOR, DL: dl, VT: MVT::i32, N1: CWD1, N2: CWD2);
9245
9246 RetVal =
9247 DAG.getNode(Opcode: (VT.getSizeInBits() < 16 ? ISD::TRUNCATE : ISD::ZERO_EXTEND),
9248 DL: dl, VT, Operand: RetVal);
9249
9250 return DAG.getMergeValues(Ops: {RetVal, Chain}, dl);
9251}
9252
9253SDValue PPCTargetLowering::LowerSHL_PARTS(SDValue Op, SelectionDAG &DAG) const {
9254 EVT VT = Op.getValueType();
9255 uint64_t BitWidth = VT.getSizeInBits();
9256 SDLoc dl(Op);
9257 assert(Op.getNumOperands() == 3 &&
9258 VT == Op.getOperand(1).getValueType() &&
9259 "Unexpected SHL!");
9260
9261 // Expand into a bunch of logical ops. Note that these ops
9262 // depend on the PPC behavior for oversized shift amounts.
9263 SDValue Lo = Op.getOperand(i: 0);
9264 SDValue Hi = Op.getOperand(i: 1);
9265 SDValue Amt = Op.getOperand(i: 2);
9266 EVT AmtVT = Amt.getValueType();
9267
9268 SDValue Tmp1 = DAG.getNode(Opcode: ISD::SUB, DL: dl, VT: AmtVT,
9269 N1: DAG.getConstant(Val: BitWidth, DL: dl, VT: AmtVT), N2: Amt);
9270 SDValue Tmp2 = DAG.getNode(Opcode: PPCISD::SHL, DL: dl, VT, N1: Hi, N2: Amt);
9271 SDValue Tmp3 = DAG.getNode(Opcode: PPCISD::SRL, DL: dl, VT, N1: Lo, N2: Tmp1);
9272 SDValue Tmp4 = DAG.getNode(Opcode: ISD::OR , DL: dl, VT, N1: Tmp2, N2: Tmp3);
9273 SDValue Tmp5 = DAG.getNode(Opcode: ISD::ADD, DL: dl, VT: AmtVT, N1: Amt,
9274 N2: DAG.getSignedConstant(Val: -BitWidth, DL: dl, VT: AmtVT));
9275 SDValue Tmp6 = DAG.getNode(Opcode: PPCISD::SHL, DL: dl, VT, N1: Lo, N2: Tmp5);
9276 SDValue OutHi = DAG.getNode(Opcode: ISD::OR, DL: dl, VT, N1: Tmp4, N2: Tmp6);
9277 SDValue OutLo = DAG.getNode(Opcode: PPCISD::SHL, DL: dl, VT, N1: Lo, N2: Amt);
9278 SDValue OutOps[] = { OutLo, OutHi };
9279 return DAG.getMergeValues(Ops: OutOps, dl);
9280}
9281
9282SDValue PPCTargetLowering::LowerSRL_PARTS(SDValue Op, SelectionDAG &DAG) const {
9283 EVT VT = Op.getValueType();
9284 SDLoc dl(Op);
9285 uint64_t BitWidth = VT.getSizeInBits();
9286 assert(Op.getNumOperands() == 3 &&
9287 VT == Op.getOperand(1).getValueType() &&
9288 "Unexpected SRL!");
9289
9290 // Expand into a bunch of logical ops. Note that these ops
9291 // depend on the PPC behavior for oversized shift amounts.
9292 SDValue Lo = Op.getOperand(i: 0);
9293 SDValue Hi = Op.getOperand(i: 1);
9294 SDValue Amt = Op.getOperand(i: 2);
9295 EVT AmtVT = Amt.getValueType();
9296
9297 SDValue Tmp1 = DAG.getNode(Opcode: ISD::SUB, DL: dl, VT: AmtVT,
9298 N1: DAG.getConstant(Val: BitWidth, DL: dl, VT: AmtVT), N2: Amt);
9299 SDValue Tmp2 = DAG.getNode(Opcode: PPCISD::SRL, DL: dl, VT, N1: Lo, N2: Amt);
9300 SDValue Tmp3 = DAG.getNode(Opcode: PPCISD::SHL, DL: dl, VT, N1: Hi, N2: Tmp1);
9301 SDValue Tmp4 = DAG.getNode(Opcode: ISD::OR, DL: dl, VT, N1: Tmp2, N2: Tmp3);
9302 SDValue Tmp5 = DAG.getNode(Opcode: ISD::ADD, DL: dl, VT: AmtVT, N1: Amt,
9303 N2: DAG.getSignedConstant(Val: -BitWidth, DL: dl, VT: AmtVT));
9304 SDValue Tmp6 = DAG.getNode(Opcode: PPCISD::SRL, DL: dl, VT, N1: Hi, N2: Tmp5);
9305 SDValue OutLo = DAG.getNode(Opcode: ISD::OR, DL: dl, VT, N1: Tmp4, N2: Tmp6);
9306 SDValue OutHi = DAG.getNode(Opcode: PPCISD::SRL, DL: dl, VT, N1: Hi, N2: Amt);
9307 SDValue OutOps[] = { OutLo, OutHi };
9308 return DAG.getMergeValues(Ops: OutOps, dl);
9309}
9310
9311SDValue PPCTargetLowering::LowerSRA_PARTS(SDValue Op, SelectionDAG &DAG) const {
9312 SDLoc dl(Op);
9313 EVT VT = Op.getValueType();
9314 uint64_t BitWidth = VT.getSizeInBits();
9315 assert(Op.getNumOperands() == 3 &&
9316 VT == Op.getOperand(1).getValueType() &&
9317 "Unexpected SRA!");
9318
9319 // Expand into a bunch of logical ops, followed by a select_cc.
9320 SDValue Lo = Op.getOperand(i: 0);
9321 SDValue Hi = Op.getOperand(i: 1);
9322 SDValue Amt = Op.getOperand(i: 2);
9323 EVT AmtVT = Amt.getValueType();
9324
9325 SDValue Tmp1 = DAG.getNode(Opcode: ISD::SUB, DL: dl, VT: AmtVT,
9326 N1: DAG.getConstant(Val: BitWidth, DL: dl, VT: AmtVT), N2: Amt);
9327 SDValue Tmp2 = DAG.getNode(Opcode: PPCISD::SRL, DL: dl, VT, N1: Lo, N2: Amt);
9328 SDValue Tmp3 = DAG.getNode(Opcode: PPCISD::SHL, DL: dl, VT, N1: Hi, N2: Tmp1);
9329 SDValue Tmp4 = DAG.getNode(Opcode: ISD::OR, DL: dl, VT, N1: Tmp2, N2: Tmp3);
9330 SDValue Tmp5 = DAG.getNode(Opcode: ISD::ADD, DL: dl, VT: AmtVT, N1: Amt,
9331 N2: DAG.getSignedConstant(Val: -BitWidth, DL: dl, VT: AmtVT));
9332 SDValue Tmp6 = DAG.getNode(Opcode: PPCISD::SRA, DL: dl, VT, N1: Hi, N2: Tmp5);
9333 SDValue OutHi = DAG.getNode(Opcode: PPCISD::SRA, DL: dl, VT, N1: Hi, N2: Amt);
9334 SDValue OutLo = DAG.getSelectCC(DL: dl, LHS: Tmp5, RHS: DAG.getConstant(Val: 0, DL: dl, VT: AmtVT),
9335 True: Tmp4, False: Tmp6, Cond: ISD::SETLE);
9336 SDValue OutOps[] = { OutLo, OutHi };
9337 return DAG.getMergeValues(Ops: OutOps, dl);
9338}
9339
9340SDValue PPCTargetLowering::LowerFunnelShift(SDValue Op,
9341 SelectionDAG &DAG) const {
9342 SDLoc dl(Op);
9343 EVT VT = Op.getValueType();
9344 unsigned BitWidth = VT.getSizeInBits();
9345
9346 bool IsFSHL = Op.getOpcode() == ISD::FSHL;
9347 SDValue X = Op.getOperand(i: 0);
9348 SDValue Y = Op.getOperand(i: 1);
9349 SDValue Z = Op.getOperand(i: 2);
9350 EVT AmtVT = Z.getValueType();
9351
9352 // fshl: (X << (Z % BW)) | (Y >> (BW - (Z % BW)))
9353 // fshr: (X << (BW - (Z % BW))) | (Y >> (Z % BW))
9354 // This is simpler than TargetLowering::expandFunnelShift because we can rely
9355 // on PowerPC shift by BW being well defined.
9356 Z = DAG.getNode(Opcode: ISD::AND, DL: dl, VT: AmtVT, N1: Z,
9357 N2: DAG.getConstant(Val: BitWidth - 1, DL: dl, VT: AmtVT));
9358 SDValue SubZ =
9359 DAG.getNode(Opcode: ISD::SUB, DL: dl, VT: AmtVT, N1: DAG.getConstant(Val: BitWidth, DL: dl, VT: AmtVT), N2: Z);
9360 X = DAG.getNode(Opcode: PPCISD::SHL, DL: dl, VT, N1: X, N2: IsFSHL ? Z : SubZ);
9361 Y = DAG.getNode(Opcode: PPCISD::SRL, DL: dl, VT, N1: Y, N2: IsFSHL ? SubZ : Z);
9362 return DAG.getNode(Opcode: ISD::OR, DL: dl, VT, N1: X, N2: Y);
9363}
9364
9365//===----------------------------------------------------------------------===//
9366// Vector related lowering.
9367//
9368
9369/// getCanonicalConstSplat - Build a canonical splat immediate of Val with an
9370/// element size of SplatSize. Cast the result to VT.
9371static SDValue getCanonicalConstSplat(uint64_t Val, unsigned SplatSize, EVT VT,
9372 SelectionDAG &DAG, const SDLoc &dl) {
9373 static const MVT VTys[] = { // canonical VT to use for each size.
9374 MVT::v16i8, MVT::v8i16, MVT::Other, MVT::v4i32
9375 };
9376
9377 EVT ReqVT = VT != MVT::Other ? VT : VTys[SplatSize-1];
9378
9379 // For a splat with all ones, turn it to vspltisb 0xFF to canonicalize.
9380 if (Val == ((1LLU << (SplatSize * 8)) - 1)) {
9381 SplatSize = 1;
9382 Val = 0xFF;
9383 }
9384
9385 EVT CanonicalVT = VTys[SplatSize-1];
9386
9387 // Build a canonical splat for this value.
9388 // Explicitly truncate APInt here, as this API is used with a mix of
9389 // signed and unsigned values.
9390 return DAG.getBitcast(
9391 VT: ReqVT,
9392 V: DAG.getConstant(Val: APInt(64, Val).trunc(width: SplatSize * 8), DL: dl, VT: CanonicalVT));
9393}
9394
9395/// BuildIntrinsicOp - Return a unary operator intrinsic node with the
9396/// specified intrinsic ID.
9397static SDValue BuildIntrinsicOp(unsigned IID, SDValue Op, SelectionDAG &DAG,
9398 const SDLoc &dl, EVT DestVT = MVT::Other) {
9399 if (DestVT == MVT::Other) DestVT = Op.getValueType();
9400 return DAG.getNode(Opcode: ISD::INTRINSIC_WO_CHAIN, DL: dl, VT: DestVT,
9401 N1: DAG.getConstant(Val: IID, DL: dl, VT: MVT::i32), N2: Op);
9402}
9403
9404/// BuildIntrinsicOp - Return a binary operator intrinsic node with the
9405/// specified intrinsic ID.
9406static SDValue BuildIntrinsicOp(unsigned IID, SDValue LHS, SDValue RHS,
9407 SelectionDAG &DAG, const SDLoc &dl,
9408 EVT DestVT = MVT::Other) {
9409 if (DestVT == MVT::Other) DestVT = LHS.getValueType();
9410 return DAG.getNode(Opcode: ISD::INTRINSIC_WO_CHAIN, DL: dl, VT: DestVT,
9411 N1: DAG.getConstant(Val: IID, DL: dl, VT: MVT::i32), N2: LHS, N3: RHS);
9412}
9413
9414/// BuildIntrinsicOp - Return a ternary operator intrinsic node with the
9415/// specified intrinsic ID.
9416static SDValue BuildIntrinsicOp(unsigned IID, SDValue Op0, SDValue Op1,
9417 SDValue Op2, SelectionDAG &DAG, const SDLoc &dl,
9418 EVT DestVT = MVT::Other) {
9419 if (DestVT == MVT::Other) DestVT = Op0.getValueType();
9420 return DAG.getNode(Opcode: ISD::INTRINSIC_WO_CHAIN, DL: dl, VT: DestVT,
9421 N1: DAG.getConstant(Val: IID, DL: dl, VT: MVT::i32), N2: Op0, N3: Op1, N4: Op2);
9422}
9423
9424/// BuildVSLDOI - Return a VECTOR_SHUFFLE that is a vsldoi of the specified
9425/// amount. The result has the specified value type.
9426static SDValue BuildVSLDOI(SDValue LHS, SDValue RHS, unsigned Amt, EVT VT,
9427 SelectionDAG &DAG, const SDLoc &dl) {
9428 // Force LHS/RHS to be the right type.
9429 LHS = DAG.getNode(Opcode: ISD::BITCAST, DL: dl, VT: MVT::v16i8, Operand: LHS);
9430 RHS = DAG.getNode(Opcode: ISD::BITCAST, DL: dl, VT: MVT::v16i8, Operand: RHS);
9431
9432 int Ops[16];
9433 for (unsigned i = 0; i != 16; ++i)
9434 Ops[i] = i + Amt;
9435 SDValue T = DAG.getVectorShuffle(VT: MVT::v16i8, dl, N1: LHS, N2: RHS, Mask: Ops);
9436 return DAG.getNode(Opcode: ISD::BITCAST, DL: dl, VT, Operand: T);
9437}
9438
9439/// Do we have an efficient pattern in a .td file for this node?
9440///
9441/// \param V - pointer to the BuildVectorSDNode being matched
9442/// \param HasDirectMove - does this subtarget have VSR <-> GPR direct moves?
9443///
9444/// There are some patterns where it is beneficial to keep a BUILD_VECTOR
9445/// node as a BUILD_VECTOR node rather than expanding it. The patterns where
9446/// the opposite is true (expansion is beneficial) are:
9447/// - The node builds a vector out of integers that are not 32 or 64-bits
9448/// - The node builds a vector out of constants
9449/// - The node is a "load-and-splat"
9450/// In all other cases, we will choose to keep the BUILD_VECTOR.
9451static bool haveEfficientBuildVectorPattern(BuildVectorSDNode *V,
9452 bool HasDirectMove,
9453 bool HasP8Vector) {
9454 EVT VecVT = V->getValueType(ResNo: 0);
9455 bool RightType = VecVT == MVT::v2f64 ||
9456 (HasP8Vector && VecVT == MVT::v4f32) ||
9457 (HasDirectMove && (VecVT == MVT::v2i64 || VecVT == MVT::v4i32));
9458 if (!RightType)
9459 return false;
9460
9461 bool IsSplat = true;
9462 bool IsLoad = false;
9463 SDValue Op0 = V->getOperand(Num: 0);
9464
9465 // This function is called in a block that confirms the node is not a constant
9466 // splat. So a constant BUILD_VECTOR here means the vector is built out of
9467 // different constants.
9468 if (V->isConstant())
9469 return false;
9470 for (int i = 0, e = V->getNumOperands(); i < e; ++i) {
9471 if (V->getOperand(Num: i).isUndef())
9472 return false;
9473 // We want to expand nodes that represent load-and-splat even if the
9474 // loaded value is a floating point truncation or conversion to int.
9475 if (V->getOperand(Num: i).getOpcode() == ISD::LOAD ||
9476 (V->getOperand(Num: i).getOpcode() == ISD::FP_ROUND &&
9477 V->getOperand(Num: i).getOperand(i: 0).getOpcode() == ISD::LOAD) ||
9478 (V->getOperand(Num: i).getOpcode() == ISD::FP_TO_SINT &&
9479 V->getOperand(Num: i).getOperand(i: 0).getOpcode() == ISD::LOAD) ||
9480 (V->getOperand(Num: i).getOpcode() == ISD::FP_TO_UINT &&
9481 V->getOperand(Num: i).getOperand(i: 0).getOpcode() == ISD::LOAD))
9482 IsLoad = true;
9483 // If the operands are different or the input is not a load and has more
9484 // uses than just this BV node, then it isn't a splat.
9485 if (V->getOperand(Num: i) != Op0 ||
9486 (!IsLoad && !V->isOnlyUserOf(N: V->getOperand(Num: i).getNode())))
9487 IsSplat = false;
9488 }
9489 return !(IsSplat && IsLoad);
9490}
9491
9492// Lower BITCAST(f128, (build_pair i64, i64)) to BUILD_FP128.
9493SDValue PPCTargetLowering::LowerBITCAST(SDValue Op, SelectionDAG &DAG) const {
9494
9495 SDLoc dl(Op);
9496 SDValue Op0 = Op->getOperand(Num: 0);
9497
9498 if (!Subtarget.isPPC64() || (Op0.getOpcode() != ISD::BUILD_PAIR) ||
9499 (Op.getValueType() != MVT::f128))
9500 return SDValue();
9501
9502 SDValue Lo = Op0.getOperand(i: 0);
9503 SDValue Hi = Op0.getOperand(i: 1);
9504 if ((Lo.getValueType() != MVT::i64) || (Hi.getValueType() != MVT::i64))
9505 return SDValue();
9506
9507 if (!Subtarget.isLittleEndian())
9508 std::swap(a&: Lo, b&: Hi);
9509
9510 return DAG.getNode(Opcode: PPCISD::BUILD_FP128, DL: dl, VT: MVT::f128, N1: Lo, N2: Hi);
9511}
9512
9513static const SDValue *getNormalLoadInput(const SDValue &Op, bool &IsPermuted) {
9514 const SDValue *InputLoad = &Op;
9515 while (InputLoad->getOpcode() == ISD::BITCAST)
9516 InputLoad = &InputLoad->getOperand(i: 0);
9517 if (InputLoad->getOpcode() == ISD::SCALAR_TO_VECTOR ||
9518 InputLoad->getOpcode() == PPCISD::SCALAR_TO_VECTOR_PERMUTED) {
9519 IsPermuted = InputLoad->getOpcode() == PPCISD::SCALAR_TO_VECTOR_PERMUTED;
9520 InputLoad = &InputLoad->getOperand(i: 0);
9521 }
9522 if (InputLoad->getOpcode() != ISD::LOAD)
9523 return nullptr;
9524 LoadSDNode *LD = cast<LoadSDNode>(Val: *InputLoad);
9525 return ISD::isNormalLoad(N: LD) ? InputLoad : nullptr;
9526}
9527
9528// Convert the argument APFloat to a single precision APFloat if there is no
9529// loss in information during the conversion to single precision APFloat and the
9530// resulting number is not a denormal number. Return true if successful.
9531bool llvm::convertToNonDenormSingle(APFloat &ArgAPFloat) {
9532 APFloat APFloatToConvert = ArgAPFloat;
9533 bool LosesInfo = true;
9534 APFloatToConvert.convert(ToSemantics: APFloat::IEEEsingle(), RM: APFloat::rmNearestTiesToEven,
9535 losesInfo: &LosesInfo);
9536 bool Success = (!LosesInfo && !APFloatToConvert.isDenormal());
9537 if (Success)
9538 ArgAPFloat = APFloatToConvert;
9539 return Success;
9540}
9541
9542// Bitcast the argument APInt to a double and convert it to a single precision
9543// APFloat, bitcast the APFloat to an APInt and assign it to the original
9544// argument if there is no loss in information during the conversion from
9545// double to single precision APFloat and the resulting number is not a denormal
9546// number. Return true if successful.
9547bool llvm::convertToNonDenormSingle(APInt &ArgAPInt) {
9548 double DpValue = ArgAPInt.bitsToDouble();
9549 APFloat APFloatDp(DpValue);
9550 bool Success = convertToNonDenormSingle(ArgAPFloat&: APFloatDp);
9551 if (Success)
9552 ArgAPInt = APFloatDp.bitcastToAPInt();
9553 return Success;
9554}
9555
9556// Nondestructive check for convertTonNonDenormSingle.
9557bool llvm::checkConvertToNonDenormSingle(APFloat &ArgAPFloat) {
9558 // Only convert if it loses info, since XXSPLTIDP should
9559 // handle the other case.
9560 APFloat APFloatToConvert = ArgAPFloat;
9561 bool LosesInfo = true;
9562 APFloatToConvert.convert(ToSemantics: APFloat::IEEEsingle(), RM: APFloat::rmNearestTiesToEven,
9563 losesInfo: &LosesInfo);
9564
9565 return (!LosesInfo && !APFloatToConvert.isDenormal());
9566}
9567
9568static bool isValidSplatLoad(const PPCSubtarget &Subtarget, const SDValue &Op,
9569 unsigned &Opcode) {
9570 LoadSDNode *InputNode = dyn_cast<LoadSDNode>(Val: Op.getOperand(i: 0));
9571 if (!InputNode || !Subtarget.hasVSX() || !ISD::isUNINDEXEDLoad(N: InputNode))
9572 return false;
9573
9574 EVT Ty = Op->getValueType(ResNo: 0);
9575 // For v2f64, v4f32 and v4i32 types, we require the load to be non-extending
9576 // as we cannot handle extending loads for these types.
9577 if ((Ty == MVT::v2f64 || Ty == MVT::v4f32 || Ty == MVT::v4i32) &&
9578 ISD::isNON_EXTLoad(N: InputNode))
9579 return true;
9580
9581 EVT MemVT = InputNode->getMemoryVT();
9582 // For v8i16 and v16i8 types, extending loads can be handled as long as the
9583 // memory VT is the same vector element VT type.
9584 // The loads feeding into the v8i16 and v16i8 types will be extending because
9585 // scalar i8/i16 are not legal types.
9586 if ((Ty == MVT::v8i16 || Ty == MVT::v16i8) && ISD::isEXTLoad(N: InputNode) &&
9587 (MemVT == Ty.getVectorElementType()))
9588 return true;
9589
9590 if (Ty == MVT::v2i64) {
9591 // Check the extend type, when the input type is i32, and the output vector
9592 // type is v2i64.
9593 if (MemVT == MVT::i32) {
9594 if (ISD::isZEXTLoad(N: InputNode))
9595 Opcode = PPCISD::ZEXT_LD_SPLAT;
9596 if (ISD::isSEXTLoad(N: InputNode))
9597 Opcode = PPCISD::SEXT_LD_SPLAT;
9598 }
9599 return true;
9600 }
9601 return false;
9602}
9603
9604bool isValidMtVsrBmi(APInt &BitMask, BuildVectorSDNode &BVN,
9605 bool IsLittleEndian) {
9606 assert(BVN.getNumOperands() > 0 && "Unexpected 0-size build vector");
9607
9608 BitMask.clearAllBits();
9609 EVT VT = BVN.getValueType(ResNo: 0);
9610 unsigned VTSize = VT.getSizeInBits();
9611 APInt ConstValue(VTSize, 0);
9612
9613 unsigned EltWidth = VT.getScalarSizeInBits();
9614
9615 unsigned BitPos = 0;
9616 for (auto OpVal : BVN.op_values()) {
9617 auto *CN = dyn_cast<ConstantSDNode>(Val&: OpVal);
9618
9619 if (!CN)
9620 return false;
9621 // The elements in a vector register are ordered in reverse byte order
9622 // between little-endian and big-endian modes.
9623 ConstValue.insertBits(SubBits: CN->getAPIntValue().zextOrTrunc(width: EltWidth),
9624 bitPosition: IsLittleEndian ? BitPos : VTSize - EltWidth - BitPos);
9625 BitPos += EltWidth;
9626 }
9627
9628 for (unsigned J = 0; J < 16; ++J) {
9629 APInt ExtractValue = ConstValue.extractBits(numBits: 8, bitPosition: J * 8);
9630 if (ExtractValue != 0x00 && ExtractValue != 0xFF)
9631 return false;
9632 if (ExtractValue == 0xFF)
9633 BitMask.setBit(J);
9634 }
9635 return true;
9636}
9637
9638// If this is a case we can't handle, return null and let the default
9639// expansion code take care of it. If we CAN select this case, and if it
9640// selects to a single instruction, return Op. Otherwise, if we can codegen
9641// this case more efficiently than a constant pool load, lower it to the
9642// sequence of ops that should be used.
//
// The lowering strategies are attempted in the following order:
//  1. With P10 vector support: a byte mask materialized by MTVSRBMI, then
//     combineBVLoadsSpecialValue().
//  2. With prefixed instructions and P10 vector support: 64-bit constant
//     splats via XXSPLTI_SP_TO_DP or XXSPLTI32DX.
//  3. LowerVecSplatSmallFP().
//  4. Vectors that are not constant splats (or whose splat is wider than 32
//     bits and not a small 64-bit value): load-and-splat nodes, or keeping
//     the BUILD_VECTOR when haveEfficientBuildVectorPattern() approves.
//  5. The remaining constant splats: single instruction forms first, then
//     multi-instruction sequences.
9643SDValue PPCTargetLowering::LowerBUILD_VECTOR(SDValue Op,
9644 SelectionDAG &DAG) const {
9645 SDLoc dl(Op);
9646 BuildVectorSDNode *BVN = dyn_cast<BuildVectorSDNode>(Val: Op.getNode());
9647 assert(BVN && "Expected a BuildVectorSDNode in LowerBUILD_VECTOR");
9648
9649 if (Subtarget.hasP10Vector()) {
9650 APInt BitMask(32, 0);
9651 // If the value of the vector is all zeros or all ones,
9652 // we do not convert it to MTVSRBMI.
9653 // The xxleqv instruction sets a vector with all ones.
9654 // The xxlxor instruction sets a vector with all zeros.
9655 if (isValidMtVsrBmi(BitMask, BVN&: *BVN, IsLittleEndian: Subtarget.isLittleEndian()) &&
9656 BitMask != 0 && BitMask != 0xffff) {
9657 SDValue SDConstant = DAG.getTargetConstant(Val: BitMask, DL: dl, VT: MVT::i32);
9658 MachineSDNode *MSDNode =
9659 DAG.getMachineNode(Opcode: PPC::MTVSRBMI, dl, VT: MVT::v16i8, Op1: SDConstant);
9660 SDValue SDV = SDValue(MSDNode, 0);
9661 EVT DVT = BVN->getValueType(ResNo: 0);
9662 EVT SVT = SDV.getValueType();
 // The machine node is created as v16i8; bitcast it if the build vector has
 // a different type.
9663 if (SVT != DVT) {
9664 SDV = DAG.getNode(Opcode: ISD::BITCAST, DL: dl, VT: DVT, Operand: SDV);
9665 }
9666 return SDV;
9667 }
9668 // Recognize build vector patterns to emit VSX vector instructions
9669 // instead of loading value from memory.
9670 if (SDValue VecPat = combineBVLoadsSpecialValue(Operand: Op, DAG))
9671 return VecPat;
9672 }
9673 // Check if this is a splat of a constant value.
9674 APInt APSplatBits, APSplatUndef;
9675 unsigned SplatBitSize = 0;
9676 bool HasAnyUndefs;
9677 bool BVNIsConstantSplat =
9678 BVN->isConstantSplat(SplatValue&: APSplatBits, SplatUndef&: APSplatUndef, SplatBitSize,
9679 HasAnyUndefs, MinSplatBits: 0, isBigEndian: !Subtarget.isLittleEndian());
9680
9681 // If it is a splat of a double, check if we can shrink it to a 32 bit
9682 // non-denormal float which when converted back to double gives us the same
9683 // double. This is to exploit the XXSPLTIDP instruction.
9684 // If we lose precision, we use XXSPLTI32DX.
9685 if (BVNIsConstantSplat && (SplatBitSize == 64) &&
9686 Subtarget.hasPrefixInstrs() && Subtarget.hasP10Vector()) {
9687 // Check the type first to short-circuit so we don't modify APSplatBits if
9688 // this block isn't executed.
9689 if ((Op->getValueType(ResNo: 0) == MVT::v2f64) &&
9690 convertToNonDenormSingle(ArgAPInt&: APSplatBits)) {
9691 SDValue SplatNode = DAG.getNode(
9692 Opcode: PPCISD::XXSPLTI_SP_TO_DP, DL: dl, VT: MVT::v2f64,
9693 Operand: DAG.getTargetConstant(Val: APSplatBits.getZExtValue(), DL: dl, VT: MVT::i32));
9694 return DAG.getBitcast(VT: Op.getValueType(), V: SplatNode);
9695 } else {
9696 // We may lose precision, so we have to use XXSPLTI32DX.
9697
 // Split the 64-bit splat value into its two 32-bit halves.
9698 uint32_t Hi = Hi_32(Value: APSplatBits.getZExtValue());
9699 uint32_t Lo = Lo_32(Value: APSplatBits.getZExtValue());
9700 SDValue SplatNode = DAG.getUNDEF(VT: MVT::v2i64);
9701
9702 if (!Hi || !Lo)
9703 // If either load is 0, then we should generate XXLXOR to set to 0.
9704 SplatNode = DAG.getTargetConstant(Val: 0, DL: dl, VT: MVT::v2i64);
9705
 // A non-zero half is written with XXSPLTI32DX: index 0 carries Hi and
 // index 1 carries Lo.
9706 if (Hi)
9707 SplatNode = DAG.getNode(
9708 Opcode: PPCISD::XXSPLTI32DX, DL: dl, VT: MVT::v2i64, N1: SplatNode,
9709 N2: DAG.getTargetConstant(Val: 0, DL: dl, VT: MVT::i32),
9710 N3: DAG.getTargetConstant(Val: Hi, DL: dl, VT: MVT::i32));
9711
9712 if (Lo)
9713 SplatNode =
9714 DAG.getNode(Opcode: PPCISD::XXSPLTI32DX, DL: dl, VT: MVT::v2i64, N1: SplatNode,
9715 N2: DAG.getTargetConstant(Val: 1, DL: dl, VT: MVT::i32),
9716 N3: DAG.getTargetConstant(Val: Lo, DL: dl, VT: MVT::i32));
9717
9718 return DAG.getBitcast(VT: Op.getValueType(), V: SplatNode);
9719 }
9720 }
9721
9722 if (SDValue V =
9723 LowerVecSplatSmallFP(Op, DAG, BVNIsConstantSplat, SplatBitSize))
9724 return V;
9725
 // Compute the splat value (SplatBits) and its sign-extended 32-bit form
 // (SextVal). IsSplat64 marks a 64-bit splat whose value fits the small
 // signed range checked below ([-128,127] with P9 vector support, [-16,15]
 // otherwise); it is only considered when the subtarget has P8 Altivec.
9726 bool IsSplat64 = false;
9727 uint64_t SplatBits = 0;
9728 int32_t SextVal = 0;
9729 if (BVNIsConstantSplat && SplatBitSize <= 64) {
9730 SplatBits = APSplatBits.getZExtValue();
9731 if (SplatBitSize <= 32) {
9732 SextVal = SignExtend32(X: SplatBits, B: SplatBitSize);
9733 } else if (SplatBitSize == 64 && Subtarget.hasP8Altivec()) {
9734 int64_t Splat64Val = static_cast<int64_t>(SplatBits);
9735 bool P9Vector = Subtarget.hasP9Vector();
9736 int32_t Hi = P9Vector ? 127 : 15;
9737 int32_t Lo = P9Vector ? -128 : -16;
9738 IsSplat64 = Splat64Val >= Lo && Splat64Val <= Hi;
9739 SextVal = static_cast<int32_t>(SplatBits);
9740 }
9741 }
9742
9743 if (!BVNIsConstantSplat || (SplatBitSize > 32 && !IsSplat64)) {
9744 unsigned NewOpcode = PPCISD::LD_SPLAT;
9745
9746 // Handle load-and-splat patterns as we have instructions that will do this
9747 // in one go.
9748 if (DAG.isSplatValue(V: Op, AllowUndefs: true) &&
9749 isValidSplatLoad(Subtarget, Op, Opcode&: NewOpcode)) {
9750 const SDValue *InputLoad = &Op.getOperand(i: 0);
9751 LoadSDNode *LD = cast<LoadSDNode>(Val: *InputLoad);
9752
9753 // If the input load is an extending load, it will be an i32 -> i64
9754 // extending load and isValidSplatLoad() will update NewOpcode.
9755 unsigned MemorySize = LD->getMemoryVT().getScalarSizeInBits();
9756 unsigned ElementSize =
9757 MemorySize * ((NewOpcode == PPCISD::LD_SPLAT) ? 1 : 2);
9758
9759 assert(((ElementSize == 2 * MemorySize)
9760 ? (NewOpcode == PPCISD::ZEXT_LD_SPLAT ||
9761 NewOpcode == PPCISD::SEXT_LD_SPLAT)
9762 : (NewOpcode == PPCISD::LD_SPLAT)) &&
9763 "Unmatched element size and opcode!\n");
9764
9765 // Checking for a single use of this load, we have to check for vector
9766 // width (128 bits) / ElementSize uses (since each operand of the
9767 // BUILD_VECTOR is a separate use of the value).
9768 unsigned NumUsesOfInputLD = 128 / ElementSize;
 // Undef operands of the BUILD_VECTOR are not uses of the load.
9769 for (SDValue BVInOp : Op->ops())
9770 if (BVInOp.isUndef())
9771 NumUsesOfInputLD--;
9772
9773 // Exclude some cases where LD_SPLAT is worse than scalar_to_vector:
9774 // Below cases should also happen for "lfiwzx/lfiwax + LE target + index
9775 // 1" and "lxvrhx + BE target + index 7" and "lxvrbx + BE target + index
9776 // 15", but function isValidSplatLoad() now will only return true when
9777 // the data at index 0 is not nullptr. So we will not get into trouble for
9778 // these cases.
9779 //
9780 // case 1 - lfiwzx/lfiwax
9781 // 1.1: load result is i32 and is sign/zero extend to i64;
9782 // 1.2: build a v2i64 vector type with above loaded value;
9783 // 1.3: the vector has only one value at index 0, others are all undef;
9784 // 1.4: on BE target, so that lfiwzx/lfiwax does not need any permute.
9785 if (NumUsesOfInputLD == 1 &&
9786 (Op->getValueType(ResNo: 0) == MVT::v2i64 && NewOpcode != PPCISD::LD_SPLAT &&
9787 !Subtarget.isLittleEndian() && Subtarget.hasVSX() &&
9788 Subtarget.hasLFIWAX()))
9789 return SDValue();
9790
9791 // case 2 - lxvr[hb]x
9792 // 2.1: load result is at most i16;
9793 // 2.2: build a vector with above loaded value;
9794 // 2.3: the vector has only one value at index 0, others are all undef;
9795 // 2.4: on LE target, so that lxvr[hb]x does not need any permute.
9796 if (NumUsesOfInputLD == 1 && Subtarget.isLittleEndian() &&
9797 Subtarget.isISA3_1() && ElementSize <= 16)
9798 return SDValue();
9799
9800 assert(NumUsesOfInputLD > 0 && "No uses of input LD of a build_vector?");
 // Only replace the load when this BUILD_VECTOR accounts for every use of
 // the loaded value.
9801 if (InputLoad->getNode()->hasNUsesOfValue(NUses: NumUsesOfInputLD, Value: 0) &&
9802 Subtarget.hasVSX()) {
9803 SDValue Ops[] = {
9804 LD->getChain(), // Chain
9805 LD->getBasePtr(), // Ptr
9806 DAG.getValueType(Op.getValueType()) // VT
9807 };
9808 SDValue LdSplt = DAG.getMemIntrinsicNode(
9809 Opcode: NewOpcode, dl, VTList: DAG.getVTList(VT1: Op.getValueType(), VT2: MVT::Other), Ops,
9810 MemVT: LD->getMemoryVT(), MMO: LD->getMemOperand());
9811 // Replace all uses of the output chain of the original load with the
9812 // output chain of the new load.
9813 DAG.ReplaceAllUsesOfValueWith(From: InputLoad->getValue(R: 1),
9814 To: LdSplt.getValue(R: 1));
9815 return LdSplt;
9816 }
9817 }
9818
9819 // In 64BIT mode BUILD_VECTOR nodes that are not constant splats of up to
9820 // 32-bits can be lowered to VSX instructions under certain conditions.
9821 // Without VSX, there is no pattern more efficient than expanding the node.
9822 if (Subtarget.hasVSX() && Subtarget.isPPC64() &&
9823 haveEfficientBuildVectorPattern(V: BVN, HasDirectMove: Subtarget.hasDirectMove(),
9824 HasP8Vector: Subtarget.hasP8Vector()))
9825 return Op;
9826 return SDValue();
9827 }
9828
 // From here on the node is a constant splat of at most 32 bits, or a 64-bit
 // splat for which IsSplat64 is set.
9829 uint64_t SplatUndef = APSplatUndef.getZExtValue();
9830 unsigned SplatSize = SplatBitSize / 8;
9831
9832 // First, handle single instruction cases.
9833
9834 // All zeros?
9835 if (SplatBits == 0) {
9836 // Canonicalize all zero vectors to be v4i32.
9837 if (Op.getValueType() != MVT::v4i32 || HasAnyUndefs) {
9838 SDValue Z = DAG.getConstant(Val: 0, DL: dl, VT: MVT::v4i32);
9839 Op = DAG.getNode(Opcode: ISD::BITCAST, DL: dl, VT: Op.getValueType(), Operand: Z);
9840 }
9841 return Op;
9842 }
9843
9844 // We have XXSPLTIW for constant splats four bytes wide.
9845 // Given vector length is a multiple of 4, 2-byte splats can be replaced
9846 // with 4-byte splats. We replicate the SplatBits in case of 2-byte splat to
9847 // make a 4-byte splat element. For example: 2-byte splat of 0xABAB can be
9848 // turned into a 4-byte splat of 0xABABABAB.
9849 if (Subtarget.hasPrefixInstrs() && Subtarget.hasP10Vector() && SplatSize == 2)
9850 return getCanonicalConstSplat(Val: SplatBits | (SplatBits << 16), SplatSize: SplatSize * 2,
9851 VT: Op.getValueType(), DAG, dl);
9852
9853 if (Subtarget.hasPrefixInstrs() && Subtarget.hasP10Vector() && SplatSize == 4)
9854 return getCanonicalConstSplat(Val: SplatBits, SplatSize, VT: Op.getValueType(), DAG,
9855 dl);
9856
9857 // We have XXSPLTIB for constant splats one byte wide.
9858 if (Subtarget.hasP9Vector() && SplatSize == 1)
9859 return getCanonicalConstSplat(Val: SplatBits, SplatSize, VT: Op.getValueType(), DAG,
9860 dl);
9861
9862 // If the sign extended value is in the range [-16,15], use VSPLTI[bhw].
9863 // Use VSPLTIW/VUPKLSW for v2i64 in range [-16,15].
9864 if (SextVal >= -16 && SextVal <= 15) {
9865 // SplatSize may be 1, 2, 4, or 8. Use size 4 instead of 8 for the splat to
9866 // generate a splat word with extend for size 8.
9867 unsigned UseSize = SplatSize == 8 ? 4 : SplatSize;
9868 SDValue Res =
9869 getCanonicalConstSplat(Val: SextVal, SplatSize: UseSize, VT: Op.getValueType(), DAG, dl);
9870 if (SplatSize != 8)
9871 return Res;
9872 SDValue IntrinsicOp =
9873 BuildIntrinsicOp(IID: Intrinsic::ppc_altivec_vupklsw,
9874 Op: DAG.getBitcast(VT: MVT::v4i32, V: Res), DAG, dl, DestVT: MVT::v2i64);
9875 return DAG.getBitcast(VT: Op.getValueType(), V: IntrinsicOp);
9876 }
9877
9878 // Two instruction sequences.
9879
 // With P9 vector support, a value in [-128,127] is built as a byte splat
 // followed by an extension intrinsic chosen by the splat size.
9880 if (Subtarget.hasP9Vector() && SextVal >= -128 && SextVal <= 127) {
9881 SDValue C = DAG.getConstant(Val: (unsigned char)SextVal, DL: dl, VT: MVT::i32);
9882 SmallVector<SDValue, 16> Ops(16, C);
9883 SDValue BV = DAG.getBuildVector(VT: MVT::v16i8, DL: dl, Ops);
9884 unsigned IID;
9885 EVT VT;
9886 switch (SplatSize) {
9887 default:
9888 llvm_unreachable("Unexpected type for vector constant.");
9889 case 2:
9890 IID = Intrinsic::ppc_altivec_vupklsb;
9891 VT = MVT::v8i16;
9892 break;
9893 case 4:
9894 IID = Intrinsic::ppc_altivec_vextsb2w;
9895 VT = MVT::v4i32;
9896 break;
9897 case 8:
9898 IID = Intrinsic::ppc_altivec_vextsb2d;
9899 VT = MVT::v2i64;
9900 break;
9901 }
9902 SDValue Extend = BuildIntrinsicOp(IID, Op: BV, DAG, dl, DestVT: VT);
9903 return DAG.getBitcast(VT: Op->getValueType(ResNo: 0), V: Extend);
9904 }
9905 assert(!IsSplat64 && "Unhandled 64-bit splat pattern");
9906
9907 // If this value is in the range [-32,30] and is even, use:
9908 // VSPLTI[bhw](val/2) + VSPLTI[bhw](val/2)
9909 // If this value is in the range [17,31] and is odd, use:
9910 // VSPLTI[bhw](val-16) - VSPLTI[bhw](-16)
9911 // If this value is in the range [-31,-17] and is odd, use:
9912 // VSPLTI[bhw](val+16) + VSPLTI[bhw](-16)
9913 // Note the last two are three-instruction sequences.
9914 if (SextVal >= -32 && SextVal <= 31) {
9915 // To avoid having these optimizations undone by constant folding,
9916 // we convert to a pseudo that will be expanded later into one of
9917 // the above forms.
9918 SDValue Elt = DAG.getSignedConstant(Val: SextVal, DL: dl, VT: MVT::i32);
9919 EVT VT = (SplatSize == 1 ? MVT::v16i8 :
9920 (SplatSize == 2 ? MVT::v8i16 : MVT::v4i32));
9921 SDValue EltSize = DAG.getConstant(Val: SplatSize, DL: dl, VT: MVT::i32);
9922 SDValue RetVal = DAG.getNode(Opcode: PPCISD::VADD_SPLAT, DL: dl, VT, N1: Elt, N2: EltSize);
9923 if (VT == Op.getValueType())
9924 return RetVal;
9925 else
9926 return DAG.getNode(Opcode: ISD::BITCAST, DL: dl, VT: Op.getValueType(), Operand: RetVal);
9927 }
9928
9929 // If this is 0x7FFF_FFFF x 4, build it as not(0x8000_0000), where
9930 // 0x8000_0000 is formed with vspltisw + vslw. This is important
9931 // for fneg/fabs.
 // (A splat of 0x8000_0000 itself does not satisfy the condition below; the
 // "vsplti + shl self" case further down can produce it from -1.)
9932 if (SplatSize == 4 && SplatBits == (0x7FFFFFFF&~SplatUndef)) {
9933 // Make -1 and vspltisw -1:
9934 SDValue OnesV = getCanonicalConstSplat(Val: -1, SplatSize: 4, VT: MVT::v4i32, DAG, dl);
9935
9936 // Make the VSLW intrinsic, computing 0x8000_0000.
9937 SDValue Res = BuildIntrinsicOp(IID: Intrinsic::ppc_altivec_vslw, LHS: OnesV,
9938 RHS: OnesV, DAG, dl);
9939
9940 // xor by OnesV to invert it.
9941 Res = DAG.getNode(Opcode: ISD::XOR, DL: dl, VT: MVT::v4i32, N1: Res, N2: OnesV);
9942 return DAG.getNode(Opcode: ISD::BITCAST, DL: dl, VT: Op.getValueType(), Operand: Res);
9943 }
9944
9945 // Check to see if this is a wide variety of vsplti*, binop self cases.
9946 static const signed char SplatCsts[] = {
9947 -1, 1, -2, 2, -3, 3, -4, 4, -5, 5, -6, 6, -7, 7,
9948 -8, 8, -9, 9, -10, 10, -11, 11, -12, 12, -13, 13, 14, -14, 15, -15, -16
9949 };
9950
9951 for (unsigned idx = 0; idx < std::size(SplatCsts); ++idx) {
9952 // Indirect through the SplatCsts array so that we favor 'vsplti -1' for
9953 // cases which are ambiguous (e.g. formation of 0x8000_0000).
9954 int i = SplatCsts[idx];
9955
9956 // Figure out what shift amount will be used by altivec if shifted by i in
9957 // this splat size.
9958 unsigned TypeShiftAmt = i & (SplatBitSize-1);
9959
9960 // vsplti + shl self.
9961 if (SextVal == (int)((unsigned)i << TypeShiftAmt)) {
9962 SDValue Res = getCanonicalConstSplat(Val: i, SplatSize, VT: MVT::Other, DAG, dl);
9963 static const unsigned IIDs[] = { // Intrinsic to use for each size.
9964 Intrinsic::ppc_altivec_vslb, Intrinsic::ppc_altivec_vslh, 0,
9965 Intrinsic::ppc_altivec_vslw
9966 };
9967 Res = BuildIntrinsicOp(IID: IIDs[SplatSize-1], LHS: Res, RHS: Res, DAG, dl);
9968 return DAG.getNode(Opcode: ISD::BITCAST, DL: dl, VT: Op.getValueType(), Operand: Res);
9969 }
9970
9971 // vsplti + srl self.
9972 if (SextVal == (int)((unsigned)i >> TypeShiftAmt)) {
9973 SDValue Res = getCanonicalConstSplat(Val: i, SplatSize, VT: MVT::Other, DAG, dl);
9974 static const unsigned IIDs[] = { // Intrinsic to use for each size.
9975 Intrinsic::ppc_altivec_vsrb, Intrinsic::ppc_altivec_vsrh, 0,
9976 Intrinsic::ppc_altivec_vsrw
9977 };
9978 Res = BuildIntrinsicOp(IID: IIDs[SplatSize-1], LHS: Res, RHS: Res, DAG, dl);
9979 return DAG.getNode(Opcode: ISD::BITCAST, DL: dl, VT: Op.getValueType(), Operand: Res);
9980 }
9981
9982 // vsplti + rol self.
9983 if (SextVal == (int)(((unsigned)i << TypeShiftAmt) |
9984 ((unsigned)i >> (SplatBitSize-TypeShiftAmt)))) {
9985 SDValue Res = getCanonicalConstSplat(Val: i, SplatSize, VT: MVT::Other, DAG, dl);
9986 static const unsigned IIDs[] = { // Intrinsic to use for each size.
9987 Intrinsic::ppc_altivec_vrlb, Intrinsic::ppc_altivec_vrlh, 0,
9988 Intrinsic::ppc_altivec_vrlw
9989 };
9990 Res = BuildIntrinsicOp(IID: IIDs[SplatSize-1], LHS: Res, RHS: Res, DAG, dl);
9991 return DAG.getNode(Opcode: ISD::BITCAST, DL: dl, VT: Op.getValueType(), Operand: Res);
9992 }
9993
 // The three vsldoi forms below use a shift amount of N bytes on big-endian
 // subtargets and 16 - N bytes on little-endian subtargets.
9994 // t = vsplti c, result = vsldoi t, t, 1
9995 if (SextVal == (int)(((unsigned)i << 8) | (i < 0 ? 0xFF : 0))) {
9996 SDValue T = getCanonicalConstSplat(Val: i, SplatSize, VT: MVT::v16i8, DAG, dl);
9997 unsigned Amt = Subtarget.isLittleEndian() ? 15 : 1;
9998 return BuildVSLDOI(LHS: T, RHS: T, Amt, VT: Op.getValueType(), DAG, dl);
9999 }
10000 // t = vsplti c, result = vsldoi t, t, 2
10001 if (SextVal == (int)(((unsigned)i << 16) | (i < 0 ? 0xFFFF : 0))) {
10002 SDValue T = getCanonicalConstSplat(Val: i, SplatSize, VT: MVT::v16i8, DAG, dl);
10003 unsigned Amt = Subtarget.isLittleEndian() ? 14 : 2;
10004 return BuildVSLDOI(LHS: T, RHS: T, Amt, VT: Op.getValueType(), DAG, dl);
10005 }
10006 // t = vsplti c, result = vsldoi t, t, 3
10007 if (SextVal == (int)(((unsigned)i << 24) | (i < 0 ? 0xFFFFFF : 0))) {
10008 SDValue T = getCanonicalConstSplat(Val: i, SplatSize, VT: MVT::v16i8, DAG, dl);
10009 unsigned Amt = Subtarget.isLittleEndian() ? 13 : 3;
10010 return BuildVSLDOI(LHS: T, RHS: T, Amt, VT: Op.getValueType(), DAG, dl);
10011 }
10012 }
10013
 // No profitable sequence was found; let the default expansion handle it.
10014 return SDValue();
10015}
10016
10017/// GeneratePerfectShuffle - Given an entry in the perfect-shuffle table, emit
10018/// the specified operations to build the shuffle.
10019static SDValue GeneratePerfectShuffle(unsigned PFEntry, SDValue LHS,
10020 SDValue RHS, SelectionDAG &DAG,
10021 const SDLoc &dl) {
10022 unsigned OpNum = (PFEntry >> 26) & 0x0F;
10023 unsigned LHSID = (PFEntry >> 13) & ((1 << 13)-1);
10024 unsigned RHSID = (PFEntry >> 0) & ((1 << 13)-1);
10025
10026 enum {
10027 OP_COPY = 0, // Copy, used for things like <u,u,u,3> to say it is <0,1,2,3>
10028 OP_VMRGHW,
10029 OP_VMRGLW,
10030 OP_VSPLTISW0,
10031 OP_VSPLTISW1,
10032 OP_VSPLTISW2,
10033 OP_VSPLTISW3,
10034 OP_VSLDOI4,
10035 OP_VSLDOI8,
10036 OP_VSLDOI12
10037 };
10038
10039 if (OpNum == OP_COPY) {
10040 if (LHSID == (1*9+2)*9+3) return LHS;
10041 assert(LHSID == ((4*9+5)*9+6)*9+7 && "Illegal OP_COPY!");
10042 return RHS;
10043 }
10044
10045 SDValue OpLHS, OpRHS;
10046 OpLHS = GeneratePerfectShuffle(PFEntry: PerfectShuffleTable[LHSID], LHS, RHS, DAG, dl);
10047 OpRHS = GeneratePerfectShuffle(PFEntry: PerfectShuffleTable[RHSID], LHS, RHS, DAG, dl);
10048
10049 int ShufIdxs[16];
10050 switch (OpNum) {
10051 default: llvm_unreachable("Unknown i32 permute!");
10052 case OP_VMRGHW:
10053 ShufIdxs[ 0] = 0; ShufIdxs[ 1] = 1; ShufIdxs[ 2] = 2; ShufIdxs[ 3] = 3;
10054 ShufIdxs[ 4] = 16; ShufIdxs[ 5] = 17; ShufIdxs[ 6] = 18; ShufIdxs[ 7] = 19;
10055 ShufIdxs[ 8] = 4; ShufIdxs[ 9] = 5; ShufIdxs[10] = 6; ShufIdxs[11] = 7;
10056 ShufIdxs[12] = 20; ShufIdxs[13] = 21; ShufIdxs[14] = 22; ShufIdxs[15] = 23;
10057 break;
10058 case OP_VMRGLW:
10059 ShufIdxs[ 0] = 8; ShufIdxs[ 1] = 9; ShufIdxs[ 2] = 10; ShufIdxs[ 3] = 11;
10060 ShufIdxs[ 4] = 24; ShufIdxs[ 5] = 25; ShufIdxs[ 6] = 26; ShufIdxs[ 7] = 27;
10061 ShufIdxs[ 8] = 12; ShufIdxs[ 9] = 13; ShufIdxs[10] = 14; ShufIdxs[11] = 15;
10062 ShufIdxs[12] = 28; ShufIdxs[13] = 29; ShufIdxs[14] = 30; ShufIdxs[15] = 31;
10063 break;
10064 case OP_VSPLTISW0:
10065 for (unsigned i = 0; i != 16; ++i)
10066 ShufIdxs[i] = (i&3)+0;
10067 break;
10068 case OP_VSPLTISW1:
10069 for (unsigned i = 0; i != 16; ++i)
10070 ShufIdxs[i] = (i&3)+4;
10071 break;
10072 case OP_VSPLTISW2:
10073 for (unsigned i = 0; i != 16; ++i)
10074 ShufIdxs[i] = (i&3)+8;
10075 break;
10076 case OP_VSPLTISW3:
10077 for (unsigned i = 0; i != 16; ++i)
10078 ShufIdxs[i] = (i&3)+12;
10079 break;
10080 case OP_VSLDOI4:
10081 return BuildVSLDOI(LHS: OpLHS, RHS: OpRHS, Amt: 4, VT: OpLHS.getValueType(), DAG, dl);
10082 case OP_VSLDOI8:
10083 return BuildVSLDOI(LHS: OpLHS, RHS: OpRHS, Amt: 8, VT: OpLHS.getValueType(), DAG, dl);
10084 case OP_VSLDOI12:
10085 return BuildVSLDOI(LHS: OpLHS, RHS: OpRHS, Amt: 12, VT: OpLHS.getValueType(), DAG, dl);
10086 }
10087 EVT VT = OpLHS.getValueType();
10088 OpLHS = DAG.getNode(Opcode: ISD::BITCAST, DL: dl, VT: MVT::v16i8, Operand: OpLHS);
10089 OpRHS = DAG.getNode(Opcode: ISD::BITCAST, DL: dl, VT: MVT::v16i8, Operand: OpRHS);
10090 SDValue T = DAG.getVectorShuffle(VT: MVT::v16i8, dl, N1: OpLHS, N2: OpRHS, Mask: ShufIdxs);
10091 return DAG.getNode(Opcode: ISD::BITCAST, DL: dl, VT, Operand: T);
10092}
10093
10094/// lowerToVINSERTB - Return the SDValue if this VECTOR_SHUFFLE can be handled
10095/// by the VINSERTB instruction introduced in ISA 3.0, else just return default
10096/// SDValue.
10097SDValue PPCTargetLowering::lowerToVINSERTB(ShuffleVectorSDNode *N,
10098 SelectionDAG &DAG) const {
10099 const unsigned BytesInVector = 16;
10100 bool IsLE = Subtarget.isLittleEndian();
10101 SDLoc dl(N);
10102 SDValue V1 = N->getOperand(Num: 0);
10103 SDValue V2 = N->getOperand(Num: 1);
10104 unsigned ShiftElts = 0, InsertAtByte = 0;
10105 bool Swap = false;
10106
10107 // Shifts required to get the byte we want at element 7.
10108 unsigned LittleEndianShifts[] = {8, 7, 6, 5, 4, 3, 2, 1,
10109 0, 15, 14, 13, 12, 11, 10, 9};
10110 unsigned BigEndianShifts[] = {9, 10, 11, 12, 13, 14, 15, 0,
10111 1, 2, 3, 4, 5, 6, 7, 8};
10112
10113 ArrayRef<int> Mask = N->getMask();
10114 int OriginalOrder[] = {0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15};
10115
10116 // For each mask element, find out if we're just inserting something
10117 // from V2 into V1 or vice versa.
10118 // Possible permutations inserting an element from V2 into V1:
10119 // X, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15
10120 // 0, X, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15
10121 // ...
10122 // 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, X
10123 // Inserting from V1 into V2 will be similar, except mask range will be
10124 // [16,31].
10125
10126 bool FoundCandidate = false;
10127 // If both vector operands for the shuffle are the same vector, the mask
10128 // will contain only elements from the first one and the second one will be
10129 // undef.
10130 unsigned VINSERTBSrcElem = IsLE ? 8 : 7;
10131 // Go through the mask of half-words to find an element that's being moved
10132 // from one vector to the other.
10133 for (unsigned i = 0; i < BytesInVector; ++i) {
10134 unsigned CurrentElement = Mask[i];
10135 // If 2nd operand is undefined, we should only look for element 7 in the
10136 // Mask.
10137 if (V2.isUndef() && CurrentElement != VINSERTBSrcElem)
10138 continue;
10139
10140 bool OtherElementsInOrder = true;
10141 // Examine the other elements in the Mask to see if they're in original
10142 // order.
10143 for (unsigned j = 0; j < BytesInVector; ++j) {
10144 if (j == i)
10145 continue;
10146 // If CurrentElement is from V1 [0,15], then we the rest of the Mask to be
10147 // from V2 [16,31] and vice versa. Unless the 2nd operand is undefined,
10148 // in which we always assume we're always picking from the 1st operand.
10149 int MaskOffset =
10150 (!V2.isUndef() && CurrentElement < BytesInVector) ? BytesInVector : 0;
10151 if (Mask[j] != OriginalOrder[j] + MaskOffset) {
10152 OtherElementsInOrder = false;
10153 break;
10154 }
10155 }
10156 // If other elements are in original order, we record the number of shifts
10157 // we need to get the element we want into element 7. Also record which byte
10158 // in the vector we should insert into.
10159 if (OtherElementsInOrder) {
10160 // If 2nd operand is undefined, we assume no shifts and no swapping.
10161 if (V2.isUndef()) {
10162 ShiftElts = 0;
10163 Swap = false;
10164 } else {
10165 // Only need the last 4-bits for shifts because operands will be swapped if CurrentElement is >= 2^4.
10166 ShiftElts = IsLE ? LittleEndianShifts[CurrentElement & 0xF]
10167 : BigEndianShifts[CurrentElement & 0xF];
10168 Swap = CurrentElement < BytesInVector;
10169 }
10170 InsertAtByte = IsLE ? BytesInVector - (i + 1) : i;
10171 FoundCandidate = true;
10172 break;
10173 }
10174 }
10175
10176 if (!FoundCandidate)
10177 return SDValue();
10178
10179 // Candidate found, construct the proper SDAG sequence with VINSERTB,
10180 // optionally with VECSHL if shift is required.
10181 if (Swap)
10182 std::swap(a&: V1, b&: V2);
10183 if (V2.isUndef())
10184 V2 = V1;
10185 if (ShiftElts) {
10186 SDValue Shl = DAG.getNode(Opcode: PPCISD::VECSHL, DL: dl, VT: MVT::v16i8, N1: V2, N2: V2,
10187 N3: DAG.getConstant(Val: ShiftElts, DL: dl, VT: MVT::i32));
10188 return DAG.getNode(Opcode: PPCISD::VECINSERT, DL: dl, VT: MVT::v16i8, N1: V1, N2: Shl,
10189 N3: DAG.getConstant(Val: InsertAtByte, DL: dl, VT: MVT::i32));
10190 }
10191 return DAG.getNode(Opcode: PPCISD::VECINSERT, DL: dl, VT: MVT::v16i8, N1: V1, N2: V2,
10192 N3: DAG.getConstant(Val: InsertAtByte, DL: dl, VT: MVT::i32));
10193}
10194
10195/// lowerToVINSERTH - Return the SDValue if this VECTOR_SHUFFLE can be handled
10196/// by the VINSERTH instruction introduced in ISA 3.0, else just return default
10197/// SDValue.
10198SDValue PPCTargetLowering::lowerToVINSERTH(ShuffleVectorSDNode *N,
10199 SelectionDAG &DAG) const {
10200 const unsigned NumHalfWords = 8;
10201 const unsigned BytesInVector = NumHalfWords * 2;
10202 // Check that the shuffle is on half-words.
10203 if (!isNByteElemShuffleMask(N, Width: 2, StepLen: 1))
10204 return SDValue();
10205
10206 bool IsLE = Subtarget.isLittleEndian();
10207 SDLoc dl(N);
10208 SDValue V1 = N->getOperand(Num: 0);
10209 SDValue V2 = N->getOperand(Num: 1);
10210 unsigned ShiftElts = 0, InsertAtByte = 0;
10211 bool Swap = false;
10212
10213 // Shifts required to get the half-word we want at element 3.
10214 unsigned LittleEndianShifts[] = {4, 3, 2, 1, 0, 7, 6, 5};
10215 unsigned BigEndianShifts[] = {5, 6, 7, 0, 1, 2, 3, 4};
10216
10217 uint32_t Mask = 0;
10218 uint32_t OriginalOrderLow = 0x1234567;
10219 uint32_t OriginalOrderHigh = 0x89ABCDEF;
10220 // Now we look at mask elements 0,2,4,6,8,10,12,14. Pack the mask into a
10221 // 32-bit space, only need 4-bit nibbles per element.
10222 for (unsigned i = 0; i < NumHalfWords; ++i) {
10223 unsigned MaskShift = (NumHalfWords - 1 - i) * 4;
10224 Mask |= ((uint32_t)(N->getMaskElt(Idx: i * 2) / 2) << MaskShift);
10225 }
10226
10227 // For each mask element, find out if we're just inserting something
10228 // from V2 into V1 or vice versa. Possible permutations inserting an element
10229 // from V2 into V1:
10230 // X, 1, 2, 3, 4, 5, 6, 7
10231 // 0, X, 2, 3, 4, 5, 6, 7
10232 // 0, 1, X, 3, 4, 5, 6, 7
10233 // 0, 1, 2, X, 4, 5, 6, 7
10234 // 0, 1, 2, 3, X, 5, 6, 7
10235 // 0, 1, 2, 3, 4, X, 6, 7
10236 // 0, 1, 2, 3, 4, 5, X, 7
10237 // 0, 1, 2, 3, 4, 5, 6, X
10238 // Inserting from V1 into V2 will be similar, except mask range will be [8,15].
10239
10240 bool FoundCandidate = false;
10241 // Go through the mask of half-words to find an element that's being moved
10242 // from one vector to the other.
10243 for (unsigned i = 0; i < NumHalfWords; ++i) {
10244 unsigned MaskShift = (NumHalfWords - 1 - i) * 4;
10245 uint32_t MaskOneElt = (Mask >> MaskShift) & 0xF;
10246 uint32_t MaskOtherElts = ~(0xF << MaskShift);
10247 uint32_t TargetOrder = 0x0;
10248
10249 // If both vector operands for the shuffle are the same vector, the mask
10250 // will contain only elements from the first one and the second one will be
10251 // undef.
10252 if (V2.isUndef()) {
10253 ShiftElts = 0;
10254 unsigned VINSERTHSrcElem = IsLE ? 4 : 3;
10255 TargetOrder = OriginalOrderLow;
10256 Swap = false;
10257 // Skip if not the correct element or mask of other elements don't equal
10258 // to our expected order.
10259 if (MaskOneElt == VINSERTHSrcElem &&
10260 (Mask & MaskOtherElts) == (TargetOrder & MaskOtherElts)) {
10261 InsertAtByte = IsLE ? BytesInVector - (i + 1) * 2 : i * 2;
10262 FoundCandidate = true;
10263 break;
10264 }
10265 } else { // If both operands are defined.
10266 // Target order is [8,15] if the current mask is between [0,7].
10267 TargetOrder =
10268 (MaskOneElt < NumHalfWords) ? OriginalOrderHigh : OriginalOrderLow;
10269 // Skip if mask of other elements don't equal our expected order.
10270 if ((Mask & MaskOtherElts) == (TargetOrder & MaskOtherElts)) {
10271 // We only need the last 3 bits for the number of shifts.
10272 ShiftElts = IsLE ? LittleEndianShifts[MaskOneElt & 0x7]
10273 : BigEndianShifts[MaskOneElt & 0x7];
10274 InsertAtByte = IsLE ? BytesInVector - (i + 1) * 2 : i * 2;
10275 Swap = MaskOneElt < NumHalfWords;
10276 FoundCandidate = true;
10277 break;
10278 }
10279 }
10280 }
10281
10282 if (!FoundCandidate)
10283 return SDValue();
10284
10285 // Candidate found, construct the proper SDAG sequence with VINSERTH,
10286 // optionally with VECSHL if shift is required.
10287 if (Swap)
10288 std::swap(a&: V1, b&: V2);
10289 if (V2.isUndef())
10290 V2 = V1;
10291 SDValue Conv1 = DAG.getNode(Opcode: ISD::BITCAST, DL: dl, VT: MVT::v8i16, Operand: V1);
10292 if (ShiftElts) {
10293 // Double ShiftElts because we're left shifting on v16i8 type.
10294 SDValue Shl = DAG.getNode(Opcode: PPCISD::VECSHL, DL: dl, VT: MVT::v16i8, N1: V2, N2: V2,
10295 N3: DAG.getConstant(Val: 2 * ShiftElts, DL: dl, VT: MVT::i32));
10296 SDValue Conv2 = DAG.getNode(Opcode: ISD::BITCAST, DL: dl, VT: MVT::v8i16, Operand: Shl);
10297 SDValue Ins = DAG.getNode(Opcode: PPCISD::VECINSERT, DL: dl, VT: MVT::v8i16, N1: Conv1, N2: Conv2,
10298 N3: DAG.getConstant(Val: InsertAtByte, DL: dl, VT: MVT::i32));
10299 return DAG.getNode(Opcode: ISD::BITCAST, DL: dl, VT: MVT::v16i8, Operand: Ins);
10300 }
10301 SDValue Conv2 = DAG.getNode(Opcode: ISD::BITCAST, DL: dl, VT: MVT::v8i16, Operand: V2);
10302 SDValue Ins = DAG.getNode(Opcode: PPCISD::VECINSERT, DL: dl, VT: MVT::v8i16, N1: Conv1, N2: Conv2,
10303 N3: DAG.getConstant(Val: InsertAtByte, DL: dl, VT: MVT::i32));
10304 return DAG.getNode(Opcode: ISD::BITCAST, DL: dl, VT: MVT::v16i8, Operand: Ins);
10305}
10306
10307/// lowerToXXSPLTI32DX - Return the SDValue if this VECTOR_SHUFFLE can be
10308/// handled by the XXSPLTI32DX instruction introduced in ISA 3.1, otherwise
10309/// return the default SDValue.
10310SDValue PPCTargetLowering::lowerToXXSPLTI32DX(ShuffleVectorSDNode *SVN,
10311 SelectionDAG &DAG) const {
10312 // The LHS and RHS may be bitcasts to v16i8 as we canonicalize shuffles
10313 // to v16i8. Peek through the bitcasts to get the actual operands.
10314 SDValue LHS = peekThroughBitcasts(V: SVN->getOperand(Num: 0));
10315 SDValue RHS = peekThroughBitcasts(V: SVN->getOperand(Num: 1));
10316
10317 auto ShuffleMask = SVN->getMask();
10318 SDValue VecShuffle(SVN, 0);
10319 SDLoc DL(SVN);
10320
10321 // Check that we have a four byte shuffle.
10322 if (!isNByteElemShuffleMask(N: SVN, Width: 4, StepLen: 1))
10323 return SDValue();
10324
10325 // Canonicalize the RHS being a BUILD_VECTOR when lowering to xxsplti32dx.
10326 if (RHS->getOpcode() != ISD::BUILD_VECTOR) {
10327 std::swap(a&: LHS, b&: RHS);
10328 VecShuffle = peekThroughBitcasts(V: DAG.getCommutedVectorShuffle(SV: *SVN));
10329 ShuffleVectorSDNode *CommutedSV = dyn_cast<ShuffleVectorSDNode>(Val&: VecShuffle);
10330 if (!CommutedSV)
10331 return SDValue();
10332 ShuffleMask = CommutedSV->getMask();
10333 }
10334
10335 // Ensure that the RHS is a vector of constants.
10336 BuildVectorSDNode *BVN = dyn_cast<BuildVectorSDNode>(Val: RHS.getNode());
10337 if (!BVN)
10338 return SDValue();
10339
10340 // Check if RHS is a splat of 4-bytes (or smaller).
10341 APInt APSplatValue, APSplatUndef;
10342 unsigned SplatBitSize;
10343 bool HasAnyUndefs;
10344 if (!BVN->isConstantSplat(SplatValue&: APSplatValue, SplatUndef&: APSplatUndef, SplatBitSize,
10345 HasAnyUndefs, MinSplatBits: 0, isBigEndian: !Subtarget.isLittleEndian()) ||
10346 SplatBitSize > 32)
10347 return SDValue();
10348
10349 // Check that the shuffle mask matches the semantics of XXSPLTI32DX.
10350 // The instruction splats a constant C into two words of the source vector
10351 // producing { C, Unchanged, C, Unchanged } or { Unchanged, C, Unchanged, C }.
10352 // Thus we check that the shuffle mask is the equivalent of
10353 // <0, [4-7], 2, [4-7]> or <[4-7], 1, [4-7], 3> respectively.
10354 // Note: the check above of isNByteElemShuffleMask() ensures that the bytes
10355 // within each word are consecutive, so we only need to check the first byte.
10356 SDValue Index;
10357 bool IsLE = Subtarget.isLittleEndian();
10358 if ((ShuffleMask[0] == 0 && ShuffleMask[8] == 8) &&
10359 (ShuffleMask[4] % 4 == 0 && ShuffleMask[12] % 4 == 0 &&
10360 ShuffleMask[4] > 15 && ShuffleMask[12] > 15))
10361 Index = DAG.getTargetConstant(Val: IsLE ? 0 : 1, DL, VT: MVT::i32);
10362 else if ((ShuffleMask[4] == 4 && ShuffleMask[12] == 12) &&
10363 (ShuffleMask[0] % 4 == 0 && ShuffleMask[8] % 4 == 0 &&
10364 ShuffleMask[0] > 15 && ShuffleMask[8] > 15))
10365 Index = DAG.getTargetConstant(Val: IsLE ? 1 : 0, DL, VT: MVT::i32);
10366 else
10367 return SDValue();
10368
10369 // If the splat is narrower than 32-bits, we need to get the 32-bit value
10370 // for XXSPLTI32DX.
10371 unsigned SplatVal = APSplatValue.getZExtValue();
10372 for (; SplatBitSize < 32; SplatBitSize <<= 1)
10373 SplatVal |= (SplatVal << SplatBitSize);
10374
10375 SDValue SplatNode = DAG.getNode(
10376 Opcode: PPCISD::XXSPLTI32DX, DL, VT: MVT::v2i64, N1: DAG.getBitcast(VT: MVT::v2i64, V: LHS),
10377 N2: Index, N3: DAG.getTargetConstant(Val: SplatVal, DL, VT: MVT::i32));
10378 return DAG.getNode(Opcode: ISD::BITCAST, DL, VT: MVT::v16i8, Operand: SplatNode);
10379}
10380
10381/// LowerROTL - Custom lowering for ROTL(v1i128) to vector_shuffle(v16i8).
10382/// We lower ROTL(v1i128) to vector_shuffle(v16i8) only if shift amount is
10383/// a multiple of 8. Otherwise convert it to a scalar rotation(i128)
10384/// i.e (or (shl x, C1), (srl x, 128-C1)).
10385SDValue PPCTargetLowering::LowerROTL(SDValue Op, SelectionDAG &DAG) const {
10386 assert(Op.getOpcode() == ISD::ROTL && "Should only be called for ISD::ROTL");
10387 assert(Op.getValueType() == MVT::v1i128 &&
10388 "Only set v1i128 as custom, other type shouldn't reach here!");
10389 SDLoc dl(Op);
10390 SDValue N0 = peekThroughBitcasts(V: Op.getOperand(i: 0));
10391 SDValue N1 = peekThroughBitcasts(V: Op.getOperand(i: 1));
10392 unsigned SHLAmt = N1.getConstantOperandVal(i: 0);
10393 if (SHLAmt % 8 == 0) {
10394 std::array<int, 16> Mask;
10395 std::iota(first: Mask.begin(), last: Mask.end(), value: 0);
10396 std::rotate(first: Mask.begin(), middle: Mask.begin() + SHLAmt / 8, last: Mask.end());
10397 if (SDValue Shuffle =
10398 DAG.getVectorShuffle(VT: MVT::v16i8, dl, N1: DAG.getBitcast(VT: MVT::v16i8, V: N0),
10399 N2: DAG.getUNDEF(VT: MVT::v16i8), Mask))
10400 return DAG.getNode(Opcode: ISD::BITCAST, DL: dl, VT: MVT::v1i128, Operand: Shuffle);
10401 }
10402 SDValue ArgVal = DAG.getBitcast(VT: MVT::i128, V: N0);
10403 SDValue SHLOp = DAG.getNode(Opcode: ISD::SHL, DL: dl, VT: MVT::i128, N1: ArgVal,
10404 N2: DAG.getConstant(Val: SHLAmt, DL: dl, VT: MVT::i32));
10405 SDValue SRLOp = DAG.getNode(Opcode: ISD::SRL, DL: dl, VT: MVT::i128, N1: ArgVal,
10406 N2: DAG.getConstant(Val: 128 - SHLAmt, DL: dl, VT: MVT::i32));
10407 SDValue OROp = DAG.getNode(Opcode: ISD::OR, DL: dl, VT: MVT::i128, N1: SHLOp, N2: SRLOp);
10408 return DAG.getNode(Opcode: ISD::BITCAST, DL: dl, VT: MVT::v1i128, Operand: OROp);
10409}
10410
10411/// LowerVECTOR_SHUFFLE - Return the code we lower for VECTOR_SHUFFLE. If this
10412/// is a shuffle we can handle in a single instruction, return it. Otherwise,
10413/// return the code it can be lowered into. Worst case, it can always be
10414/// lowered into a vperm.
10415SDValue PPCTargetLowering::LowerVECTOR_SHUFFLE(SDValue Op,
10416 SelectionDAG &DAG) const {
10417 SDLoc dl(Op);
10418 SDValue V1 = Op.getOperand(i: 0);
10419 SDValue V2 = Op.getOperand(i: 1);
10420 ShuffleVectorSDNode *SVOp = cast<ShuffleVectorSDNode>(Val&: Op);
10421
10422 // Any nodes that were combined in the target-independent combiner prior
10423 // to vector legalization will not be sent to the target combine. Try to
10424 // combine it here.
10425 if (SDValue NewShuffle = combineVectorShuffle(SVN: SVOp, DAG)) {
10426 if (!isa<ShuffleVectorSDNode>(Val: NewShuffle))
10427 return NewShuffle;
10428 Op = NewShuffle;
10429 SVOp = cast<ShuffleVectorSDNode>(Val&: Op);
10430 V1 = Op.getOperand(i: 0);
10431 V2 = Op.getOperand(i: 1);
10432 }
10433 EVT VT = Op.getValueType();
10434 bool isLittleEndian = Subtarget.isLittleEndian();
10435
10436 unsigned ShiftElts, InsertAtByte;
10437 bool Swap = false;
10438
10439 // If this is a load-and-splat, we can do that with a single instruction
10440 // in some cases. However if the load has multiple uses, we don't want to
10441 // combine it because that will just produce multiple loads.
10442 bool IsPermutedLoad = false;
10443 const SDValue *InputLoad = getNormalLoadInput(Op: V1, IsPermuted&: IsPermutedLoad);
10444 if (InputLoad && Subtarget.hasVSX() && V2.isUndef() &&
10445 (PPC::isSplatShuffleMask(N: SVOp, EltSize: 4) || PPC::isSplatShuffleMask(N: SVOp, EltSize: 8)) &&
10446 InputLoad->hasOneUse()) {
10447 bool IsFourByte = PPC::isSplatShuffleMask(N: SVOp, EltSize: 4);
10448 int SplatIdx =
10449 PPC::getSplatIdxForPPCMnemonics(N: SVOp, EltSize: IsFourByte ? 4 : 8, DAG);
10450
10451 // The splat index for permuted loads will be in the left half of the vector
10452 // which is strictly wider than the loaded value by 8 bytes. So we need to
10453 // adjust the splat index to point to the correct address in memory.
10454 if (IsPermutedLoad) {
10455 assert((isLittleEndian || IsFourByte) &&
10456 "Unexpected size for permuted load on big endian target");
10457 SplatIdx += IsFourByte ? 2 : 1;
10458 assert((SplatIdx < (IsFourByte ? 4 : 2)) &&
10459 "Splat of a value outside of the loaded memory");
10460 }
10461
10462 LoadSDNode *LD = cast<LoadSDNode>(Val: *InputLoad);
10463 // For 4-byte load-and-splat, we need Power9.
10464 if ((IsFourByte && Subtarget.hasP9Vector()) || !IsFourByte) {
10465 uint64_t Offset = 0;
10466 if (IsFourByte)
10467 Offset = isLittleEndian ? (3 - SplatIdx) * 4 : SplatIdx * 4;
10468 else
10469 Offset = isLittleEndian ? (1 - SplatIdx) * 8 : SplatIdx * 8;
10470
10471 // If the width of the load is the same as the width of the splat,
10472 // loading with an offset would load the wrong memory.
10473 if (LD->getValueType(ResNo: 0).getSizeInBits() == (IsFourByte ? 32 : 64))
10474 Offset = 0;
10475
10476 SDValue BasePtr = LD->getBasePtr();
10477 if (Offset != 0)
10478 BasePtr = DAG.getNode(Opcode: ISD::ADD, DL: dl, VT: getPointerTy(DL: DAG.getDataLayout()),
10479 N1: BasePtr, N2: DAG.getIntPtrConstant(Val: Offset, DL: dl));
10480 SDValue Ops[] = {
10481 LD->getChain(), // Chain
10482 BasePtr, // BasePtr
10483 DAG.getValueType(Op.getValueType()) // VT
10484 };
10485 SDVTList VTL =
10486 DAG.getVTList(VT1: IsFourByte ? MVT::v4i32 : MVT::v2i64, VT2: MVT::Other);
10487 SDValue LdSplt =
10488 DAG.getMemIntrinsicNode(Opcode: PPCISD::LD_SPLAT, dl, VTList: VTL,
10489 Ops, MemVT: LD->getMemoryVT(), MMO: LD->getMemOperand());
10490 DAG.ReplaceAllUsesOfValueWith(From: InputLoad->getValue(R: 1), To: LdSplt.getValue(R: 1));
10491 if (LdSplt.getValueType() != SVOp->getValueType(ResNo: 0))
10492 LdSplt = DAG.getBitcast(VT: SVOp->getValueType(ResNo: 0), V: LdSplt);
10493 return LdSplt;
10494 }
10495 }
10496
10497 // All v2i64 and v2f64 shuffles are legal
10498 if (VT == MVT::v2i64 || VT == MVT::v2f64)
10499 return Op;
10500
10501 if (Subtarget.hasP9Vector() &&
10502 PPC::isXXINSERTWMask(N: SVOp, ShiftElts, InsertAtByte, Swap,
10503 IsLE: isLittleEndian)) {
10504 if (V2.isUndef())
10505 V2 = V1;
10506 else if (Swap)
10507 std::swap(a&: V1, b&: V2);
10508 SDValue Conv1 = DAG.getNode(Opcode: ISD::BITCAST, DL: dl, VT: MVT::v4i32, Operand: V1);
10509 SDValue Conv2 = DAG.getNode(Opcode: ISD::BITCAST, DL: dl, VT: MVT::v4i32, Operand: V2);
10510 if (ShiftElts) {
10511 SDValue Shl = DAG.getNode(Opcode: PPCISD::VECSHL, DL: dl, VT: MVT::v4i32, N1: Conv2, N2: Conv2,
10512 N3: DAG.getConstant(Val: ShiftElts, DL: dl, VT: MVT::i32));
10513 SDValue Ins = DAG.getNode(Opcode: PPCISD::VECINSERT, DL: dl, VT: MVT::v4i32, N1: Conv1, N2: Shl,
10514 N3: DAG.getConstant(Val: InsertAtByte, DL: dl, VT: MVT::i32));
10515 return DAG.getNode(Opcode: ISD::BITCAST, DL: dl, VT: MVT::v16i8, Operand: Ins);
10516 }
10517 SDValue Ins = DAG.getNode(Opcode: PPCISD::VECINSERT, DL: dl, VT: MVT::v4i32, N1: Conv1, N2: Conv2,
10518 N3: DAG.getConstant(Val: InsertAtByte, DL: dl, VT: MVT::i32));
10519 return DAG.getNode(Opcode: ISD::BITCAST, DL: dl, VT: MVT::v16i8, Operand: Ins);
10520 }
10521
10522 if (Subtarget.hasPrefixInstrs() && Subtarget.hasP10Vector()) {
10523 SDValue SplatInsertNode;
10524 if ((SplatInsertNode = lowerToXXSPLTI32DX(SVN: SVOp, DAG)))
10525 return SplatInsertNode;
10526 }
10527
10528 if (Subtarget.hasP9Altivec()) {
10529 SDValue NewISDNode;
10530 if ((NewISDNode = lowerToVINSERTH(N: SVOp, DAG)))
10531 return NewISDNode;
10532
10533 if ((NewISDNode = lowerToVINSERTB(N: SVOp, DAG)))
10534 return NewISDNode;
10535 }
10536
10537 if (Subtarget.hasVSX() &&
10538 PPC::isXXSLDWIShuffleMask(N: SVOp, ShiftElts, Swap, IsLE: isLittleEndian)) {
10539 if (Swap)
10540 std::swap(a&: V1, b&: V2);
10541 SDValue Conv1 = DAG.getNode(Opcode: ISD::BITCAST, DL: dl, VT: MVT::v4i32, Operand: V1);
10542 SDValue Conv2 =
10543 DAG.getNode(Opcode: ISD::BITCAST, DL: dl, VT: MVT::v4i32, Operand: V2.isUndef() ? V1 : V2);
10544
10545 SDValue Shl = DAG.getNode(Opcode: PPCISD::VECSHL, DL: dl, VT: MVT::v4i32, N1: Conv1, N2: Conv2,
10546 N3: DAG.getConstant(Val: ShiftElts, DL: dl, VT: MVT::i32));
10547 return DAG.getNode(Opcode: ISD::BITCAST, DL: dl, VT: MVT::v16i8, Operand: Shl);
10548 }
10549
10550 if (Subtarget.hasVSX() &&
10551 PPC::isXXPERMDIShuffleMask(N: SVOp, DM&: ShiftElts, Swap, IsLE: isLittleEndian)) {
10552 if (Swap)
10553 std::swap(a&: V1, b&: V2);
10554 SDValue Conv1 = DAG.getNode(Opcode: ISD::BITCAST, DL: dl, VT: MVT::v2i64, Operand: V1);
10555 SDValue Conv2 =
10556 DAG.getNode(Opcode: ISD::BITCAST, DL: dl, VT: MVT::v2i64, Operand: V2.isUndef() ? V1 : V2);
10557
10558 SDValue PermDI = DAG.getNode(Opcode: PPCISD::XXPERMDI, DL: dl, VT: MVT::v2i64, N1: Conv1, N2: Conv2,
10559 N3: DAG.getConstant(Val: ShiftElts, DL: dl, VT: MVT::i32));
10560 return DAG.getNode(Opcode: ISD::BITCAST, DL: dl, VT: MVT::v16i8, Operand: PermDI);
10561 }
10562
10563 if (Subtarget.hasP9Vector()) {
10564 if (PPC::isXXBRHShuffleMask(N: SVOp)) {
10565 SDValue Conv = DAG.getNode(Opcode: ISD::BITCAST, DL: dl, VT: MVT::v8i16, Operand: V1);
10566 SDValue ReveHWord = DAG.getNode(Opcode: ISD::BSWAP, DL: dl, VT: MVT::v8i16, Operand: Conv);
10567 return DAG.getNode(Opcode: ISD::BITCAST, DL: dl, VT: MVT::v16i8, Operand: ReveHWord);
10568 } else if (PPC::isXXBRWShuffleMask(N: SVOp)) {
10569 SDValue Conv = DAG.getNode(Opcode: ISD::BITCAST, DL: dl, VT: MVT::v4i32, Operand: V1);
10570 SDValue ReveWord = DAG.getNode(Opcode: ISD::BSWAP, DL: dl, VT: MVT::v4i32, Operand: Conv);
10571 return DAG.getNode(Opcode: ISD::BITCAST, DL: dl, VT: MVT::v16i8, Operand: ReveWord);
10572 } else if (PPC::isXXBRDShuffleMask(N: SVOp)) {
10573 SDValue Conv = DAG.getNode(Opcode: ISD::BITCAST, DL: dl, VT: MVT::v2i64, Operand: V1);
10574 SDValue ReveDWord = DAG.getNode(Opcode: ISD::BSWAP, DL: dl, VT: MVT::v2i64, Operand: Conv);
10575 return DAG.getNode(Opcode: ISD::BITCAST, DL: dl, VT: MVT::v16i8, Operand: ReveDWord);
10576 } else if (PPC::isXXBRQShuffleMask(N: SVOp)) {
10577 SDValue Conv = DAG.getNode(Opcode: ISD::BITCAST, DL: dl, VT: MVT::v1i128, Operand: V1);
10578 SDValue ReveQWord = DAG.getNode(Opcode: ISD::BSWAP, DL: dl, VT: MVT::v1i128, Operand: Conv);
10579 return DAG.getNode(Opcode: ISD::BITCAST, DL: dl, VT: MVT::v16i8, Operand: ReveQWord);
10580 }
10581 }
10582
10583 if (Subtarget.hasVSX()) {
10584 if (V2.isUndef() && PPC::isSplatShuffleMask(N: SVOp, EltSize: 4)) {
10585 int SplatIdx = PPC::getSplatIdxForPPCMnemonics(N: SVOp, EltSize: 4, DAG);
10586
10587 SDValue Conv = DAG.getNode(Opcode: ISD::BITCAST, DL: dl, VT: MVT::v4i32, Operand: V1);
10588 SDValue Splat = DAG.getNode(Opcode: PPCISD::XXSPLT, DL: dl, VT: MVT::v4i32, N1: Conv,
10589 N2: DAG.getConstant(Val: SplatIdx, DL: dl, VT: MVT::i32));
10590 return DAG.getNode(Opcode: ISD::BITCAST, DL: dl, VT: MVT::v16i8, Operand: Splat);
10591 }
10592
10593 // Left shifts of 8 bytes are actually swaps. Convert accordingly.
10594 if (V2.isUndef() && PPC::isVSLDOIShuffleMask(N: SVOp, ShuffleKind: 1, DAG) == 8) {
10595 SDValue Conv = DAG.getNode(Opcode: ISD::BITCAST, DL: dl, VT: MVT::v2f64, Operand: V1);
10596 SDValue Swap = DAG.getNode(Opcode: PPCISD::SWAP_NO_CHAIN, DL: dl, VT: MVT::v2f64, Operand: Conv);
10597 return DAG.getNode(Opcode: ISD::BITCAST, DL: dl, VT: MVT::v16i8, Operand: Swap);
10598 }
10599 }
10600
10601 // Cases that are handled by instructions that take permute immediates
10602 // (such as vsplt*) should be left as VECTOR_SHUFFLE nodes so they can be
10603 // selected by the instruction selector.
10604 if (V2.isUndef()) {
10605 if (PPC::isSplatShuffleMask(N: SVOp, EltSize: 1) ||
10606 PPC::isSplatShuffleMask(N: SVOp, EltSize: 2) ||
10607 PPC::isSplatShuffleMask(N: SVOp, EltSize: 4) ||
10608 PPC::isVPKUWUMShuffleMask(N: SVOp, ShuffleKind: 1, DAG) ||
10609 PPC::isVPKUHUMShuffleMask(N: SVOp, ShuffleKind: 1, DAG) ||
10610 PPC::isVSLDOIShuffleMask(N: SVOp, ShuffleKind: 1, DAG) != -1 ||
10611 PPC::isVMRGLShuffleMask(N: SVOp, UnitSize: 1, ShuffleKind: 1, DAG) ||
10612 PPC::isVMRGLShuffleMask(N: SVOp, UnitSize: 2, ShuffleKind: 1, DAG) ||
10613 PPC::isVMRGLShuffleMask(N: SVOp, UnitSize: 4, ShuffleKind: 1, DAG) ||
10614 PPC::isVMRGHShuffleMask(N: SVOp, UnitSize: 1, ShuffleKind: 1, DAG) ||
10615 PPC::isVMRGHShuffleMask(N: SVOp, UnitSize: 2, ShuffleKind: 1, DAG) ||
10616 PPC::isVMRGHShuffleMask(N: SVOp, UnitSize: 4, ShuffleKind: 1, DAG) ||
10617 (Subtarget.hasP8Altivec() && (
10618 PPC::isVPKUDUMShuffleMask(N: SVOp, ShuffleKind: 1, DAG) ||
10619 PPC::isVMRGEOShuffleMask(N: SVOp, CheckEven: true, ShuffleKind: 1, DAG) ||
10620 PPC::isVMRGEOShuffleMask(N: SVOp, CheckEven: false, ShuffleKind: 1, DAG)))) {
10621 return Op;
10622 }
10623 }
10624
10625 // Altivec has a variety of "shuffle immediates" that take two vector inputs
10626 // and produce a fixed permutation. If any of these match, do not lower to
10627 // VPERM.
10628 unsigned int ShuffleKind = isLittleEndian ? 2 : 0;
10629 if (PPC::isVPKUWUMShuffleMask(N: SVOp, ShuffleKind, DAG) ||
10630 PPC::isVPKUHUMShuffleMask(N: SVOp, ShuffleKind, DAG) ||
10631 PPC::isVSLDOIShuffleMask(N: SVOp, ShuffleKind, DAG) != -1 ||
10632 PPC::isVMRGLShuffleMask(N: SVOp, UnitSize: 1, ShuffleKind, DAG) ||
10633 PPC::isVMRGLShuffleMask(N: SVOp, UnitSize: 2, ShuffleKind, DAG) ||
10634 PPC::isVMRGLShuffleMask(N: SVOp, UnitSize: 4, ShuffleKind, DAG) ||
10635 PPC::isVMRGHShuffleMask(N: SVOp, UnitSize: 1, ShuffleKind, DAG) ||
10636 PPC::isVMRGHShuffleMask(N: SVOp, UnitSize: 2, ShuffleKind, DAG) ||
10637 PPC::isVMRGHShuffleMask(N: SVOp, UnitSize: 4, ShuffleKind, DAG) ||
10638 (Subtarget.hasP8Altivec() && (
10639 PPC::isVPKUDUMShuffleMask(N: SVOp, ShuffleKind, DAG) ||
10640 PPC::isVMRGEOShuffleMask(N: SVOp, CheckEven: true, ShuffleKind, DAG) ||
10641 PPC::isVMRGEOShuffleMask(N: SVOp, CheckEven: false, ShuffleKind, DAG))))
10642 return Op;
10643
10644 // Check to see if this is a shuffle of 4-byte values. If so, we can use our
10645 // perfect shuffle table to emit an optimal matching sequence.
10646 ArrayRef<int> PermMask = SVOp->getMask();
10647
10648 if (!DisablePerfectShuffle && !isLittleEndian) {
10649 unsigned PFIndexes[4];
10650 bool isFourElementShuffle = true;
10651 for (unsigned i = 0; i != 4 && isFourElementShuffle;
10652 ++i) { // Element number
10653 unsigned EltNo = 8; // Start out undef.
10654 for (unsigned j = 0; j != 4; ++j) { // Intra-element byte.
10655 if (PermMask[i * 4 + j] < 0)
10656 continue; // Undef, ignore it.
10657
10658 unsigned ByteSource = PermMask[i * 4 + j];
10659 if ((ByteSource & 3) != j) {
10660 isFourElementShuffle = false;
10661 break;
10662 }
10663
10664 if (EltNo == 8) {
10665 EltNo = ByteSource / 4;
10666 } else if (EltNo != ByteSource / 4) {
10667 isFourElementShuffle = false;
10668 break;
10669 }
10670 }
10671 PFIndexes[i] = EltNo;
10672 }
10673
10674 // If this shuffle can be expressed as a shuffle of 4-byte elements, use the
10675 // perfect shuffle vector to determine if it is cost effective to do this as
10676 // discrete instructions, or whether we should use a vperm.
10677 // For now, we skip this for little endian until such time as we have a
10678 // little-endian perfect shuffle table.
10679 if (isFourElementShuffle) {
10680 // Compute the index in the perfect shuffle table.
10681 unsigned PFTableIndex = PFIndexes[0] * 9 * 9 * 9 + PFIndexes[1] * 9 * 9 +
10682 PFIndexes[2] * 9 + PFIndexes[3];
10683
10684 unsigned PFEntry = PerfectShuffleTable[PFTableIndex];
10685 unsigned Cost = (PFEntry >> 30);
10686
10687 // Determining when to avoid vperm is tricky. Many things affect the cost
10688 // of vperm, particularly how many times the perm mask needs to be
10689 // computed. For example, if the perm mask can be hoisted out of a loop or
10690 // is already used (perhaps because there are multiple permutes with the
10691 // same shuffle mask?) the vperm has a cost of 1. OTOH, hoisting the
10692 // permute mask out of the loop requires an extra register.
10693 //
10694 // As a compromise, we only emit discrete instructions if the shuffle can
10695 // be generated in 3 or fewer operations. When we have loop information
10696 // available, if this block is within a loop, we should avoid using vperm
10697 // for 3-operation perms and use a constant pool load instead.
10698 if (Cost < 3)
10699 return GeneratePerfectShuffle(PFEntry, LHS: V1, RHS: V2, DAG, dl);
10700 }
10701 }
10702
10703 // Lower this to a VPERM(V1, V2, V3) expression, where V3 is a constant
10704 // vector that will get spilled to the constant pool.
10705 if (V2.isUndef()) V2 = V1;
10706
10707 return LowerVPERM(Op, DAG, PermMask, VT, V1, V2);
10708}
10709
10710SDValue PPCTargetLowering::LowerVPERM(SDValue Op, SelectionDAG &DAG,
10711 ArrayRef<int> PermMask, EVT VT,
10712 SDValue V1, SDValue V2) const {
10713 unsigned Opcode = PPCISD::VPERM;
10714 EVT ValType = V1.getValueType();
10715 SDLoc dl(Op);
10716 bool NeedSwap = false;
10717 bool isLittleEndian = Subtarget.isLittleEndian();
10718 bool isPPC64 = Subtarget.isPPC64();
10719
10720 if (Subtarget.hasVSX() && Subtarget.hasP9Vector() &&
10721 (V1->hasOneUse() || V2->hasOneUse())) {
10722 LLVM_DEBUG(dbgs() << "At least one of two input vectors are dead - using "
10723 "XXPERM instead\n");
10724 Opcode = PPCISD::XXPERM;
10725
10726 // The second input to XXPERM is also an output so if the second input has
10727 // multiple uses then copying is necessary, as a result we want the
10728 // single-use operand to be used as the second input to prevent copying.
10729 if ((!isLittleEndian && !V2->hasOneUse() && V1->hasOneUse()) ||
10730 (isLittleEndian && !V1->hasOneUse() && V2->hasOneUse())) {
10731 std::swap(a&: V1, b&: V2);
10732 NeedSwap = !NeedSwap;
10733 }
10734 }
10735
10736 // The SHUFFLE_VECTOR mask is almost exactly what we want for vperm, except
10737 // that it is in input element units, not in bytes. Convert now.
10738
10739 // For little endian, the order of the input vectors is reversed, and
10740 // the permutation mask is complemented with respect to 31. This is
10741 // necessary to produce proper semantics with the big-endian-based vperm
10742 // instruction.
10743 EVT EltVT = V1.getValueType().getVectorElementType();
10744 unsigned BytesPerElement = EltVT.getSizeInBits() / 8;
10745
10746 bool V1HasXXSWAPD = V1->getOperand(Num: 0)->getOpcode() == PPCISD::XXSWAPD;
10747 bool V2HasXXSWAPD = V2->getOperand(Num: 0)->getOpcode() == PPCISD::XXSWAPD;
10748
10749 /*
10750 Vectors will be appended like so: [ V1 | v2 ]
10751 XXSWAPD on V1:
10752 [ A | B | C | D ] -> [ C | D | A | B ]
10753 0-3 4-7 8-11 12-15 0-3 4-7 8-11 12-15
10754 i.e. index of A, B += 8, and index of C, D -= 8.
10755 XXSWAPD on V2:
10756 [ E | F | G | H ] -> [ G | H | E | F ]
10757 16-19 20-23 24-27 28-31 16-19 20-23 24-27 28-31
10758 i.e. index of E, F += 8, index of G, H -= 8
10759 Swap V1 and V2:
10760 [ V1 | V2 ] -> [ V2 | V1 ]
10761 0-15 16-31 0-15 16-31
10762 i.e. index of V1 += 16, index of V2 -= 16
10763 */
10764
10765 SmallVector<SDValue, 16> ResultMask;
10766 for (unsigned i = 0, e = VT.getVectorNumElements(); i != e; ++i) {
10767 unsigned SrcElt = PermMask[i] < 0 ? 0 : PermMask[i];
10768
10769 if (V1HasXXSWAPD) {
10770 if (SrcElt < 8)
10771 SrcElt += 8;
10772 else if (SrcElt < 16)
10773 SrcElt -= 8;
10774 }
10775 if (V2HasXXSWAPD) {
10776 if (SrcElt > 23)
10777 SrcElt -= 8;
10778 else if (SrcElt > 15)
10779 SrcElt += 8;
10780 }
10781 if (NeedSwap) {
10782 if (SrcElt < 16)
10783 SrcElt += 16;
10784 else
10785 SrcElt -= 16;
10786 }
10787 for (unsigned j = 0; j != BytesPerElement; ++j)
10788 if (isLittleEndian)
10789 ResultMask.push_back(
10790 Elt: DAG.getConstant(Val: 31 - (SrcElt * BytesPerElement + j), DL: dl, VT: MVT::i32));
10791 else
10792 ResultMask.push_back(
10793 Elt: DAG.getConstant(Val: SrcElt * BytesPerElement + j, DL: dl, VT: MVT::i32));
10794 }
10795
10796 if (V1HasXXSWAPD) {
10797 dl = SDLoc(V1->getOperand(Num: 0));
10798 V1 = V1->getOperand(Num: 0)->getOperand(Num: 1);
10799 }
10800 if (V2HasXXSWAPD) {
10801 dl = SDLoc(V2->getOperand(Num: 0));
10802 V2 = V2->getOperand(Num: 0)->getOperand(Num: 1);
10803 }
10804
10805 if (isPPC64 && (V1HasXXSWAPD || V2HasXXSWAPD)) {
10806 if (ValType != MVT::v2f64)
10807 V1 = DAG.getBitcast(VT: MVT::v2f64, V: V1);
10808 if (V2.getValueType() != MVT::v2f64)
10809 V2 = DAG.getBitcast(VT: MVT::v2f64, V: V2);
10810 }
10811
10812 ShufflesHandledWithVPERM++;
10813 SDValue VPermMask = DAG.getBuildVector(VT: MVT::v16i8, DL: dl, Ops: ResultMask);
10814 LLVM_DEBUG({
10815 ShuffleVectorSDNode *SVOp = cast<ShuffleVectorSDNode>(Op);
10816 if (Opcode == PPCISD::XXPERM) {
10817 dbgs() << "Emitting a XXPERM for the following shuffle:\n";
10818 } else {
10819 dbgs() << "Emitting a VPERM for the following shuffle:\n";
10820 }
10821 SVOp->dump();
10822 dbgs() << "With the following permute control vector:\n";
10823 VPermMask.dump();
10824 });
10825
10826 if (Opcode == PPCISD::XXPERM)
10827 VPermMask = DAG.getBitcast(VT: MVT::v4i32, V: VPermMask);
10828
10829 // Only need to place items backwards in LE,
10830 // the mask was properly calculated.
10831 if (isLittleEndian)
10832 std::swap(a&: V1, b&: V2);
10833
10834 SDValue VPERMNode =
10835 DAG.getNode(Opcode, DL: dl, VT: V1.getValueType(), N1: V1, N2: V2, N3: VPermMask);
10836
10837 VPERMNode = DAG.getBitcast(VT: ValType, V: VPERMNode);
10838 return VPERMNode;
10839}
10840
10841/// getVectorCompareInfo - Given an intrinsic, return false if it is not a
10842/// vector comparison. If it is, return true and fill in Opc/isDot with
10843/// information about the intrinsic.
10844static bool getVectorCompareInfo(SDValue Intrin, int &CompareOpc,
10845 bool &isDot, const PPCSubtarget &Subtarget) {
10846 unsigned IntrinsicID = Intrin.getConstantOperandVal(i: 0);
10847 CompareOpc = -1;
10848 isDot = false;
10849 switch (IntrinsicID) {
10850 default:
10851 return false;
10852 // Comparison predicates.
10853 case Intrinsic::ppc_altivec_vcmpbfp_p:
10854 CompareOpc = 966;
10855 isDot = true;
10856 break;
10857 case Intrinsic::ppc_altivec_vcmpeqfp_p:
10858 CompareOpc = 198;
10859 isDot = true;
10860 break;
10861 case Intrinsic::ppc_altivec_vcmpequb_p:
10862 CompareOpc = 6;
10863 isDot = true;
10864 break;
10865 case Intrinsic::ppc_altivec_vcmpequh_p:
10866 CompareOpc = 70;
10867 isDot = true;
10868 break;
10869 case Intrinsic::ppc_altivec_vcmpequw_p:
10870 CompareOpc = 134;
10871 isDot = true;
10872 break;
10873 case Intrinsic::ppc_altivec_vcmpequd_p:
10874 if (Subtarget.hasVSX() || Subtarget.hasP8Altivec()) {
10875 CompareOpc = 199;
10876 isDot = true;
10877 } else
10878 return false;
10879 break;
10880 case Intrinsic::ppc_altivec_vcmpneb_p:
10881 case Intrinsic::ppc_altivec_vcmpneh_p:
10882 case Intrinsic::ppc_altivec_vcmpnew_p:
10883 case Intrinsic::ppc_altivec_vcmpnezb_p:
10884 case Intrinsic::ppc_altivec_vcmpnezh_p:
10885 case Intrinsic::ppc_altivec_vcmpnezw_p:
10886 if (Subtarget.hasP9Altivec()) {
10887 switch (IntrinsicID) {
10888 default:
10889 llvm_unreachable("Unknown comparison intrinsic.");
10890 case Intrinsic::ppc_altivec_vcmpneb_p:
10891 CompareOpc = 7;
10892 break;
10893 case Intrinsic::ppc_altivec_vcmpneh_p:
10894 CompareOpc = 71;
10895 break;
10896 case Intrinsic::ppc_altivec_vcmpnew_p:
10897 CompareOpc = 135;
10898 break;
10899 case Intrinsic::ppc_altivec_vcmpnezb_p:
10900 CompareOpc = 263;
10901 break;
10902 case Intrinsic::ppc_altivec_vcmpnezh_p:
10903 CompareOpc = 327;
10904 break;
10905 case Intrinsic::ppc_altivec_vcmpnezw_p:
10906 CompareOpc = 391;
10907 break;
10908 }
10909 isDot = true;
10910 } else
10911 return false;
10912 break;
10913 case Intrinsic::ppc_altivec_vcmpgefp_p:
10914 CompareOpc = 454;
10915 isDot = true;
10916 break;
10917 case Intrinsic::ppc_altivec_vcmpgtfp_p:
10918 CompareOpc = 710;
10919 isDot = true;
10920 break;
10921 case Intrinsic::ppc_altivec_vcmpgtsb_p:
10922 CompareOpc = 774;
10923 isDot = true;
10924 break;
10925 case Intrinsic::ppc_altivec_vcmpgtsh_p:
10926 CompareOpc = 838;
10927 isDot = true;
10928 break;
10929 case Intrinsic::ppc_altivec_vcmpgtsw_p:
10930 CompareOpc = 902;
10931 isDot = true;
10932 break;
10933 case Intrinsic::ppc_altivec_vcmpgtsd_p:
10934 if (Subtarget.hasVSX() || Subtarget.hasP8Altivec()) {
10935 CompareOpc = 967;
10936 isDot = true;
10937 } else
10938 return false;
10939 break;
10940 case Intrinsic::ppc_altivec_vcmpgtub_p:
10941 CompareOpc = 518;
10942 isDot = true;
10943 break;
10944 case Intrinsic::ppc_altivec_vcmpgtuh_p:
10945 CompareOpc = 582;
10946 isDot = true;
10947 break;
10948 case Intrinsic::ppc_altivec_vcmpgtuw_p:
10949 CompareOpc = 646;
10950 isDot = true;
10951 break;
10952 case Intrinsic::ppc_altivec_vcmpgtud_p:
10953 if (Subtarget.hasVSX() || Subtarget.hasP8Altivec()) {
10954 CompareOpc = 711;
10955 isDot = true;
10956 } else
10957 return false;
10958 break;
10959
10960 case Intrinsic::ppc_altivec_vcmpequq:
10961 case Intrinsic::ppc_altivec_vcmpgtsq:
10962 case Intrinsic::ppc_altivec_vcmpgtuq:
10963 if (!Subtarget.isISA3_1())
10964 return false;
10965 switch (IntrinsicID) {
10966 default:
10967 llvm_unreachable("Unknown comparison intrinsic.");
10968 case Intrinsic::ppc_altivec_vcmpequq:
10969 CompareOpc = 455;
10970 break;
10971 case Intrinsic::ppc_altivec_vcmpgtsq:
10972 CompareOpc = 903;
10973 break;
10974 case Intrinsic::ppc_altivec_vcmpgtuq:
10975 CompareOpc = 647;
10976 break;
10977 }
10978 break;
10979
10980 // VSX predicate comparisons use the same infrastructure
10981 case Intrinsic::ppc_vsx_xvcmpeqdp_p:
10982 case Intrinsic::ppc_vsx_xvcmpgedp_p:
10983 case Intrinsic::ppc_vsx_xvcmpgtdp_p:
10984 case Intrinsic::ppc_vsx_xvcmpeqsp_p:
10985 case Intrinsic::ppc_vsx_xvcmpgesp_p:
10986 case Intrinsic::ppc_vsx_xvcmpgtsp_p:
10987 if (Subtarget.hasVSX()) {
10988 switch (IntrinsicID) {
10989 case Intrinsic::ppc_vsx_xvcmpeqdp_p:
10990 CompareOpc = 99;
10991 break;
10992 case Intrinsic::ppc_vsx_xvcmpgedp_p:
10993 CompareOpc = 115;
10994 break;
10995 case Intrinsic::ppc_vsx_xvcmpgtdp_p:
10996 CompareOpc = 107;
10997 break;
10998 case Intrinsic::ppc_vsx_xvcmpeqsp_p:
10999 CompareOpc = 67;
11000 break;
11001 case Intrinsic::ppc_vsx_xvcmpgesp_p:
11002 CompareOpc = 83;
11003 break;
11004 case Intrinsic::ppc_vsx_xvcmpgtsp_p:
11005 CompareOpc = 75;
11006 break;
11007 }
11008 isDot = true;
11009 } else
11010 return false;
11011 break;
11012
11013 // Normal Comparisons.
11014 case Intrinsic::ppc_altivec_vcmpbfp:
11015 CompareOpc = 966;
11016 break;
11017 case Intrinsic::ppc_altivec_vcmpeqfp:
11018 CompareOpc = 198;
11019 break;
11020 case Intrinsic::ppc_altivec_vcmpequb:
11021 CompareOpc = 6;
11022 break;
11023 case Intrinsic::ppc_altivec_vcmpequh:
11024 CompareOpc = 70;
11025 break;
11026 case Intrinsic::ppc_altivec_vcmpequw:
11027 CompareOpc = 134;
11028 break;
11029 case Intrinsic::ppc_altivec_vcmpequd:
11030 if (Subtarget.hasP8Altivec())
11031 CompareOpc = 199;
11032 else
11033 return false;
11034 break;
11035 case Intrinsic::ppc_altivec_vcmpneb:
11036 case Intrinsic::ppc_altivec_vcmpneh:
11037 case Intrinsic::ppc_altivec_vcmpnew:
11038 case Intrinsic::ppc_altivec_vcmpnezb:
11039 case Intrinsic::ppc_altivec_vcmpnezh:
11040 case Intrinsic::ppc_altivec_vcmpnezw:
11041 if (Subtarget.hasP9Altivec())
11042 switch (IntrinsicID) {
11043 default:
11044 llvm_unreachable("Unknown comparison intrinsic.");
11045 case Intrinsic::ppc_altivec_vcmpneb:
11046 CompareOpc = 7;
11047 break;
11048 case Intrinsic::ppc_altivec_vcmpneh:
11049 CompareOpc = 71;
11050 break;
11051 case Intrinsic::ppc_altivec_vcmpnew:
11052 CompareOpc = 135;
11053 break;
11054 case Intrinsic::ppc_altivec_vcmpnezb:
11055 CompareOpc = 263;
11056 break;
11057 case Intrinsic::ppc_altivec_vcmpnezh:
11058 CompareOpc = 327;
11059 break;
11060 case Intrinsic::ppc_altivec_vcmpnezw:
11061 CompareOpc = 391;
11062 break;
11063 }
11064 else
11065 return false;
11066 break;
11067 case Intrinsic::ppc_altivec_vcmpgefp:
11068 CompareOpc = 454;
11069 break;
11070 case Intrinsic::ppc_altivec_vcmpgtfp:
11071 CompareOpc = 710;
11072 break;
11073 case Intrinsic::ppc_altivec_vcmpgtsb:
11074 CompareOpc = 774;
11075 break;
11076 case Intrinsic::ppc_altivec_vcmpgtsh:
11077 CompareOpc = 838;
11078 break;
11079 case Intrinsic::ppc_altivec_vcmpgtsw:
11080 CompareOpc = 902;
11081 break;
11082 case Intrinsic::ppc_altivec_vcmpgtsd:
11083 if (Subtarget.hasP8Altivec())
11084 CompareOpc = 967;
11085 else
11086 return false;
11087 break;
11088 case Intrinsic::ppc_altivec_vcmpgtub:
11089 CompareOpc = 518;
11090 break;
11091 case Intrinsic::ppc_altivec_vcmpgtuh:
11092 CompareOpc = 582;
11093 break;
11094 case Intrinsic::ppc_altivec_vcmpgtuw:
11095 CompareOpc = 646;
11096 break;
11097 case Intrinsic::ppc_altivec_vcmpgtud:
11098 if (Subtarget.hasP8Altivec())
11099 CompareOpc = 711;
11100 else
11101 return false;
11102 break;
11103 case Intrinsic::ppc_altivec_vcmpequq_p:
11104 case Intrinsic::ppc_altivec_vcmpgtsq_p:
11105 case Intrinsic::ppc_altivec_vcmpgtuq_p:
11106 if (!Subtarget.isISA3_1())
11107 return false;
11108 switch (IntrinsicID) {
11109 default:
11110 llvm_unreachable("Unknown comparison intrinsic.");
11111 case Intrinsic::ppc_altivec_vcmpequq_p:
11112 CompareOpc = 455;
11113 break;
11114 case Intrinsic::ppc_altivec_vcmpgtsq_p:
11115 CompareOpc = 903;
11116 break;
11117 case Intrinsic::ppc_altivec_vcmpgtuq_p:
11118 CompareOpc = 647;
11119 break;
11120 }
11121 isDot = true;
11122 break;
11123 }
11124 return true;
11125}
11126
11127/// LowerINTRINSIC_WO_CHAIN - If this is an intrinsic that we want to custom
11128/// lower, do it, otherwise return null.
11129SDValue PPCTargetLowering::LowerINTRINSIC_WO_CHAIN(SDValue Op,
11130 SelectionDAG &DAG) const {
11131 unsigned IntrinsicID = Op.getConstantOperandVal(i: 0);
11132
11133 SDLoc dl(Op);
11134 // Note: BCD instructions expect the immediate operand in vector form (v4i32),
11135 // but the builtin provides it as a scalar. To satisfy the instruction
11136 // encoding, we splat the scalar across all lanes using SPLAT_VECTOR.
11137 auto MapNodeWithSplatVector =
11138 [&](unsigned Opcode,
11139 std::initializer_list<SDValue> ExtraOps = {}) -> SDValue {
11140 SDValue SplatVal =
11141 DAG.getNode(Opcode: ISD::SPLAT_VECTOR, DL: dl, VT: MVT::v4i32, Operand: Op.getOperand(i: 2));
11142
11143 SmallVector<SDValue, 4> Ops{SplatVal, Op.getOperand(i: 1)};
11144 Ops.append(in_start: ExtraOps.begin(), in_end: ExtraOps.end());
11145 return DAG.getNode(Opcode, DL: dl, VT: MVT::v16i8, Ops);
11146 };
11147
11148 switch (IntrinsicID) {
11149 case Intrinsic::thread_pointer:
11150 // Reads the thread pointer register, used for __builtin_thread_pointer.
11151 if (Subtarget.isPPC64())
11152 return DAG.getRegister(Reg: PPC::X13, VT: MVT::i64);
11153 return DAG.getRegister(Reg: PPC::R2, VT: MVT::i32);
11154
11155 case Intrinsic::ppc_rldimi: {
11156 assert(Subtarget.isPPC64() && "rldimi is only available in 64-bit!");
11157 SDValue Src = Op.getOperand(i: 1);
11158 APInt Mask = Op.getConstantOperandAPInt(i: 4);
11159 if (Mask.isZero())
11160 return Op.getOperand(i: 2);
11161 if (Mask.isAllOnes())
11162 return DAG.getNode(Opcode: ISD::ROTL, DL: dl, VT: MVT::i64, N1: Src, N2: Op.getOperand(i: 3));
11163 uint64_t SH = Op.getConstantOperandVal(i: 3);
11164 unsigned MB = 0, ME = 0;
11165 if (!isRunOfOnes64(Val: Mask.getZExtValue(), MB, ME))
11166 report_fatal_error(reason: "invalid rldimi mask!");
11167 // rldimi requires ME=63-SH, otherwise rotation is needed before rldimi.
11168 if (ME < 63 - SH) {
11169 Src = DAG.getNode(Opcode: ISD::ROTL, DL: dl, VT: MVT::i64, N1: Src,
11170 N2: DAG.getConstant(Val: ME + SH + 1, DL: dl, VT: MVT::i32));
11171 } else if (ME > 63 - SH) {
11172 Src = DAG.getNode(Opcode: ISD::ROTL, DL: dl, VT: MVT::i64, N1: Src,
11173 N2: DAG.getConstant(Val: ME + SH - 63, DL: dl, VT: MVT::i32));
11174 }
11175 return SDValue(
11176 DAG.getMachineNode(Opcode: PPC::RLDIMI, dl, VT: MVT::i64,
11177 Ops: {Op.getOperand(i: 2), Src,
11178 DAG.getTargetConstant(Val: 63 - ME, DL: dl, VT: MVT::i32),
11179 DAG.getTargetConstant(Val: MB, DL: dl, VT: MVT::i32)}),
11180 0);
11181 }
11182
11183 case Intrinsic::ppc_rlwimi: {
11184 APInt Mask = Op.getConstantOperandAPInt(i: 4);
11185 if (Mask.isZero())
11186 return Op.getOperand(i: 2);
11187 if (Mask.isAllOnes())
11188 return DAG.getNode(Opcode: ISD::ROTL, DL: dl, VT: MVT::i32, N1: Op.getOperand(i: 1),
11189 N2: Op.getOperand(i: 3));
11190 unsigned MB = 0, ME = 0;
11191 if (!isRunOfOnes(Val: Mask.getZExtValue(), MB, ME))
11192 report_fatal_error(reason: "invalid rlwimi mask!");
11193 return SDValue(DAG.getMachineNode(
11194 Opcode: PPC::RLWIMI, dl, VT: MVT::i32,
11195 Ops: {Op.getOperand(i: 2), Op.getOperand(i: 1), Op.getOperand(i: 3),
11196 DAG.getTargetConstant(Val: MB, DL: dl, VT: MVT::i32),
11197 DAG.getTargetConstant(Val: ME, DL: dl, VT: MVT::i32)}),
11198 0);
11199 }
11200
11201 case Intrinsic::ppc_bcdshift:
11202 return MapNodeWithSplatVector(PPCISD::BCDSHIFT, {Op.getOperand(i: 3)});
11203 case Intrinsic::ppc_bcdshiftround:
11204 return MapNodeWithSplatVector(PPCISD::BCDSHIFTROUND, {Op.getOperand(i: 3)});
11205 case Intrinsic::ppc_bcdtruncate:
11206 return MapNodeWithSplatVector(PPCISD::BCDTRUNC, {Op.getOperand(i: 3)});
11207 case Intrinsic::ppc_bcdunsignedtruncate:
11208 return MapNodeWithSplatVector(PPCISD::BCDUTRUNC);
11209 case Intrinsic::ppc_bcdunsignedshift:
11210 return MapNodeWithSplatVector(PPCISD::BCDUSHIFT);
11211
11212 case Intrinsic::ppc_rlwnm: {
11213 if (Op.getConstantOperandVal(i: 3) == 0)
11214 return DAG.getConstant(Val: 0, DL: dl, VT: MVT::i32);
11215 unsigned MB = 0, ME = 0;
11216 if (!isRunOfOnes(Val: Op.getConstantOperandVal(i: 3), MB, ME))
11217 report_fatal_error(reason: "invalid rlwnm mask!");
11218 return SDValue(
11219 DAG.getMachineNode(Opcode: PPC::RLWNM, dl, VT: MVT::i32,
11220 Ops: {Op.getOperand(i: 1), Op.getOperand(i: 2),
11221 DAG.getTargetConstant(Val: MB, DL: dl, VT: MVT::i32),
11222 DAG.getTargetConstant(Val: ME, DL: dl, VT: MVT::i32)}),
11223 0);
11224 }
11225
11226 case Intrinsic::ppc_mma_disassemble_acc: {
11227 if (Subtarget.isISAFuture()) {
11228 EVT ReturnTypes[] = {MVT::v256i1, MVT::v256i1};
11229 SDValue WideVec =
11230 SDValue(DAG.getMachineNode(Opcode: PPC::DMXXEXTFDMR512, dl, ResultTys: ReturnTypes,
11231 Ops: Op.getOperand(i: 1)),
11232 0);
11233 SmallVector<SDValue, 4> RetOps;
11234 SDValue Value = SDValue(WideVec.getNode(), 0);
11235 SDValue Value2 = SDValue(WideVec.getNode(), 1);
11236
11237 SDValue Extract;
11238 Extract = DAG.getNode(
11239 Opcode: PPCISD::EXTRACT_VSX_REG, DL: dl, VT: MVT::v16i8,
11240 N1: Subtarget.isLittleEndian() ? Value2 : Value,
11241 N2: DAG.getConstant(Val: Subtarget.isLittleEndian() ? 1 : 0,
11242 DL: dl, VT: getPointerTy(DL: DAG.getDataLayout())));
11243 RetOps.push_back(Elt: Extract);
11244 Extract = DAG.getNode(
11245 Opcode: PPCISD::EXTRACT_VSX_REG, DL: dl, VT: MVT::v16i8,
11246 N1: Subtarget.isLittleEndian() ? Value2 : Value,
11247 N2: DAG.getConstant(Val: Subtarget.isLittleEndian() ? 0 : 1,
11248 DL: dl, VT: getPointerTy(DL: DAG.getDataLayout())));
11249 RetOps.push_back(Elt: Extract);
11250 Extract = DAG.getNode(
11251 Opcode: PPCISD::EXTRACT_VSX_REG, DL: dl, VT: MVT::v16i8,
11252 N1: Subtarget.isLittleEndian() ? Value : Value2,
11253 N2: DAG.getConstant(Val: Subtarget.isLittleEndian() ? 1 : 0,
11254 DL: dl, VT: getPointerTy(DL: DAG.getDataLayout())));
11255 RetOps.push_back(Elt: Extract);
11256 Extract = DAG.getNode(
11257 Opcode: PPCISD::EXTRACT_VSX_REG, DL: dl, VT: MVT::v16i8,
11258 N1: Subtarget.isLittleEndian() ? Value : Value2,
11259 N2: DAG.getConstant(Val: Subtarget.isLittleEndian() ? 0 : 1,
11260 DL: dl, VT: getPointerTy(DL: DAG.getDataLayout())));
11261 RetOps.push_back(Elt: Extract);
11262 return DAG.getMergeValues(Ops: RetOps, dl);
11263 }
11264 [[fallthrough]];
11265 }
11266 case Intrinsic::ppc_vsx_disassemble_pair: {
11267 int NumVecs = 2;
11268 SDValue WideVec = Op.getOperand(i: 1);
11269 if (IntrinsicID == Intrinsic::ppc_mma_disassemble_acc) {
11270 NumVecs = 4;
11271 WideVec = DAG.getNode(Opcode: PPCISD::XXMFACC, DL: dl, VT: MVT::v512i1, Operand: WideVec);
11272 }
11273 SmallVector<SDValue, 4> RetOps;
11274 for (int VecNo = 0; VecNo < NumVecs; VecNo++) {
11275 SDValue Extract = DAG.getNode(
11276 Opcode: PPCISD::EXTRACT_VSX_REG, DL: dl, VT: MVT::v16i8, N1: WideVec,
11277 N2: DAG.getConstant(Val: Subtarget.isLittleEndian() ? NumVecs - 1 - VecNo
11278 : VecNo,
11279 DL: dl, VT: getPointerTy(DL: DAG.getDataLayout())));
11280 RetOps.push_back(Elt: Extract);
11281 }
11282 return DAG.getMergeValues(Ops: RetOps, dl);
11283 }
11284
11285 case Intrinsic::ppc_build_dmr: {
11286 SmallVector<SDValue, 8> Pairs;
11287 SmallVector<SDValue, 8> Chains;
11288 for (int i = 1; i < 9; i += 2) {
11289 SDValue Hi = Op.getOperand(i);
11290 SDValue Lo = Op.getOperand(i: i + 1);
11291 if (Hi->getOpcode() == ISD::LOAD)
11292 Chains.push_back(Elt: Hi.getValue(R: 1));
11293 if (Lo->getOpcode() == ISD::LOAD)
11294 Chains.push_back(Elt: Lo.getValue(R: 1));
11295 Pairs.push_back(
11296 Elt: DAG.getNode(Opcode: PPCISD::PAIR_BUILD, DL: dl, VT: MVT::v256i1, Ops: {Hi, Lo}));
11297 }
11298 SDValue TF = DAG.getNode(Opcode: ISD::TokenFactor, DL: dl, VT: MVT::Other, Ops: Chains);
11299 SDValue Value = DMFInsert1024(Pairs, dl: SDLoc(Op), DAG);
11300 return DAG.getMergeValues(Ops: {Value, TF}, dl);
11301 }
11302
11303 case Intrinsic::ppc_mma_dmxxextfdmr512: {
11304 assert(Subtarget.isISAFuture() && "dmxxextfdmr512 requires ISA Future");
11305 auto *Idx = dyn_cast<ConstantSDNode>(Val: Op.getOperand(i: 2));
11306 assert(Idx && (Idx->getSExtValue() == 0 || Idx->getSExtValue() == 1) &&
11307 "Specify P of 0 or 1 for lower or upper 512 bytes");
11308 unsigned HiLo = Idx->getSExtValue();
11309 unsigned Opcode;
11310 unsigned Subx;
11311 if (HiLo == 0) {
11312 Opcode = PPC::DMXXEXTFDMR512;
11313 Subx = PPC::sub_wacc_lo;
11314 } else {
11315 Opcode = PPC::DMXXEXTFDMR512_HI;
11316 Subx = PPC::sub_wacc_hi;
11317 }
11318 SDValue Subreg(
11319 DAG.getMachineNode(Opcode: TargetOpcode::EXTRACT_SUBREG, dl, VT: MVT::v512i1,
11320 Op1: Op.getOperand(i: 1),
11321 Op2: DAG.getTargetConstant(Val: Subx, DL: dl, VT: MVT::i32)),
11322 0);
11323 EVT ReturnTypes[] = {MVT::v256i1, MVT::v256i1};
11324 return SDValue(DAG.getMachineNode(Opcode, dl, ResultTys: ReturnTypes, Ops: Subreg), 0);
11325 }
11326
11327 case Intrinsic::ppc_mma_dmxxextfdmr256: {
11328 assert(Subtarget.isISAFuture() && "dmxxextfdmr256 requires ISA Future");
11329 auto *Idx = dyn_cast<ConstantSDNode>(Val: Op.getOperand(i: 2));
11330 assert(Idx && (Idx->getSExtValue() >= 0 || Idx->getSExtValue() <= 3) &&
11331 "Specify a dmr row pair 0-3");
11332 unsigned IdxVal = Idx->getSExtValue();
11333 unsigned Subx;
11334 switch (IdxVal) {
11335 case 0:
11336 Subx = PPC::sub_dmrrowp0;
11337 break;
11338 case 1:
11339 Subx = PPC::sub_dmrrowp1;
11340 break;
11341 case 2:
11342 Subx = PPC::sub_wacc_hi_then_sub_dmrrowp0;
11343 break;
11344 case 3:
11345 Subx = PPC::sub_wacc_hi_then_sub_dmrrowp1;
11346 break;
11347 }
11348 SDValue Subreg(
11349 DAG.getMachineNode(Opcode: TargetOpcode::EXTRACT_SUBREG, dl, VT: MVT::v256i1,
11350 Op1: Op.getOperand(i: 1),
11351 Op2: DAG.getTargetConstant(Val: Subx, DL: dl, VT: MVT::i32)),
11352 0);
11353 SDValue P = DAG.getTargetConstant(Val: IdxVal, DL: dl, VT: MVT::i32);
11354 return SDValue(
11355 DAG.getMachineNode(Opcode: PPC::DMXXEXTFDMR256, dl, VT: MVT::v256i1, Ops: {Subreg, P}),
11356 0);
11357 }
11358
11359 case Intrinsic::ppc_mma_dmxxinstdmr512: {
11360 assert(Subtarget.isISAFuture() && "dmxxinstdmr512 requires ISA Future");
11361 auto *Idx = dyn_cast<ConstantSDNode>(Val: Op.getOperand(i: 4));
11362 assert(Idx && (Idx->getSExtValue() == 0 || Idx->getSExtValue() == 1) &&
11363 "Specify P of 0 or 1 for lower or upper 512 bytes");
11364 unsigned HiLo = Idx->getSExtValue();
11365 unsigned Opcode;
11366 unsigned Subx;
11367 if (HiLo == 0) {
11368 Opcode = PPCISD::INST512;
11369 Subx = PPC::sub_wacc_lo;
11370 } else {
11371 Opcode = PPCISD::INST512HI;
11372 Subx = PPC::sub_wacc_hi;
11373 }
11374 SDValue Wacc = DAG.getNode(Opcode, DL: dl, VT: MVT::v512i1, N1: Op.getOperand(i: 2),
11375 N2: Op.getOperand(i: 3));
11376 SDValue SubReg = DAG.getTargetConstant(Val: Subx, DL: dl, VT: MVT::i32);
11377 return SDValue(DAG.getMachineNode(Opcode: PPC::INSERT_SUBREG, dl, VT: MVT::v1024i1,
11378 Op1: Op.getOperand(i: 1), Op2: Wacc, Op3: SubReg),
11379 0);
11380 }
11381
11382 case Intrinsic::ppc_mma_dmxxinstdmr256: {
11383 assert(Subtarget.isISAFuture() && "dmxxinstdmr256 requires ISA Future");
11384 auto *Idx = dyn_cast<ConstantSDNode>(Val: Op.getOperand(i: 3));
11385 assert(Idx && (Idx->getSExtValue() >= 0 || Idx->getSExtValue() <= 3) &&
11386 "Specify a dmr row pair 0-3");
11387 unsigned IdxVal = Idx->getSExtValue();
11388 unsigned Subx;
11389 switch (IdxVal) {
11390 case 0:
11391 Subx = PPC::sub_dmrrowp0;
11392 break;
11393 case 1:
11394 Subx = PPC::sub_dmrrowp1;
11395 break;
11396 case 2:
11397 Subx = PPC::sub_wacc_hi_then_sub_dmrrowp0;
11398 break;
11399 case 3:
11400 Subx = PPC::sub_wacc_hi_then_sub_dmrrowp1;
11401 break;
11402 }
11403 SDValue SubReg = DAG.getTargetConstant(Val: Subx, DL: dl, VT: MVT::i32);
11404 SDValue P = DAG.getTargetConstant(Val: IdxVal, DL: dl, VT: MVT::i32);
11405 SDValue DMRRowp =
11406 DAG.getNode(Opcode: PPCISD::INST256, DL: dl, VT: MVT::v256i1, N1: Op.getOperand(i: 2), N2: P);
11407 return SDValue(DAG.getMachineNode(Opcode: PPC::INSERT_SUBREG, dl, VT: MVT::v1024i1,
11408 Op1: Op.getOperand(i: 1), Op2: DMRRowp, Op3: SubReg),
11409 0);
11410 }
11411
11412 case Intrinsic::ppc_mma_xxmfacc:
11413 case Intrinsic::ppc_mma_xxmtacc: {
11414 // Allow pre-isa-future subtargets to lower as normal.
11415 if (!Subtarget.isISAFuture())
11416 return SDValue();
11417 // The intrinsics for xxmtacc and xxmfacc take one argument of
11418 // type v512i1, for future cpu the corresponding wacc instruction
11419 // dmxx[inst|extf]dmr512 is always generated for type v512i1, negating
11420 // the need to produce the xxm[t|f]acc.
11421 SDValue WideVec = Op.getOperand(i: 1);
11422 DAG.ReplaceAllUsesWith(From: Op, To: WideVec);
11423 return SDValue();
11424 }
11425
11426 case Intrinsic::ppc_unpack_longdouble: {
11427 auto *Idx = dyn_cast<ConstantSDNode>(Val: Op.getOperand(i: 2));
11428 assert(Idx && (Idx->getSExtValue() == 0 || Idx->getSExtValue() == 1) &&
11429 "Argument of long double unpack must be 0 or 1!");
11430 return DAG.getNode(Opcode: ISD::EXTRACT_ELEMENT, DL: dl, VT: MVT::f64, N1: Op.getOperand(i: 1),
11431 N2: DAG.getConstant(Val: !!(Idx->getSExtValue()), DL: dl,
11432 VT: Idx->getValueType(ResNo: 0)));
11433 }
11434
11435 case Intrinsic::ppc_compare_exp_lt:
11436 case Intrinsic::ppc_compare_exp_gt:
11437 case Intrinsic::ppc_compare_exp_eq:
11438 case Intrinsic::ppc_compare_exp_uo: {
11439 unsigned Pred;
11440 switch (IntrinsicID) {
11441 case Intrinsic::ppc_compare_exp_lt:
11442 Pred = PPC::PRED_LT;
11443 break;
11444 case Intrinsic::ppc_compare_exp_gt:
11445 Pred = PPC::PRED_GT;
11446 break;
11447 case Intrinsic::ppc_compare_exp_eq:
11448 Pred = PPC::PRED_EQ;
11449 break;
11450 case Intrinsic::ppc_compare_exp_uo:
11451 Pred = PPC::PRED_UN;
11452 break;
11453 }
11454 return SDValue(
11455 DAG.getMachineNode(
11456 Opcode: PPC::SELECT_CC_I4, dl, VT: MVT::i32,
11457 Ops: {SDValue(DAG.getMachineNode(Opcode: PPC::XSCMPEXPDP, dl, VT: MVT::i32,
11458 Op1: Op.getOperand(i: 1), Op2: Op.getOperand(i: 2)),
11459 0),
11460 DAG.getConstant(Val: 1, DL: dl, VT: MVT::i32), DAG.getConstant(Val: 0, DL: dl, VT: MVT::i32),
11461 DAG.getTargetConstant(Val: Pred, DL: dl, VT: MVT::i32)}),
11462 0);
11463 }
11464 case Intrinsic::ppc_test_data_class: {
11465 EVT OpVT = Op.getOperand(i: 1).getValueType();
11466 unsigned CmprOpc = OpVT == MVT::f128 ? PPC::XSTSTDCQP
11467 : (OpVT == MVT::f64 ? PPC::XSTSTDCDP
11468 : PPC::XSTSTDCSP);
11469 // Lower __builtin_ppc_test_data_class(value, mask) to XSTSTDC* instruction.
11470 // The XSTSTDC* instructions test if a floating-point value matches any of
11471 // the data classes specified in the mask, setting CR field bits
11472 // accordingly. We need to extract the EQ bit (bit 2) from the CR field and
11473 // convert it to an integer result (1 if match, 0 if no match).
11474 //
11475 // Note: Operands are swapped because XSTSTDC* expects (mask, value) but the
11476 // intrinsic provides (value, mask) as Op.getOperand(1) and
11477 // Op.getOperand(2).
11478 SDValue TestDataClass =
11479 SDValue(DAG.getMachineNode(Opcode: CmprOpc, dl, VT: MVT::i32,
11480 Ops: {Op.getOperand(i: 2), Op.getOperand(i: 1)}),
11481 0);
11482 if (Subtarget.isISA3_1()) {
11483 // ISA 3.1+: Use SETBC instruction to directly convert CR bit to integer.
11484 // This is more efficient than the SELECT_CC approach used in earlier
11485 // ISAs.
11486 SDValue SubRegIdx = DAG.getTargetConstant(Val: PPC::sub_eq, DL: dl, VT: MVT::i32);
11487 SDValue CRBit =
11488 SDValue(DAG.getMachineNode(Opcode: TargetOpcode::EXTRACT_SUBREG, dl, VT: MVT::i1,
11489 Op1: TestDataClass, Op2: SubRegIdx),
11490 0);
11491
11492 return DAG.getNode(Opcode: PPCISD::SETBC, DL: dl, VT: MVT::i32, Operand: CRBit);
11493 }
11494
11495 // Pre-ISA 3.1: Use SELECT_CC to convert CR field to integer (1 or 0).
11496 return SDValue(
11497 DAG.getMachineNode(Opcode: PPC::SELECT_CC_I4, dl, VT: MVT::i32,
11498 Ops: {TestDataClass, DAG.getConstant(Val: 1, DL: dl, VT: MVT::i32),
11499 DAG.getConstant(Val: 0, DL: dl, VT: MVT::i32),
11500 DAG.getTargetConstant(Val: PPC::PRED_EQ, DL: dl, VT: MVT::i32)}),
11501 0);
11502 }
11503 case Intrinsic::ppc_fnmsub: {
11504 EVT VT = Op.getOperand(i: 1).getValueType();
11505 if (!Subtarget.hasVSX() || (!Subtarget.hasFloat128() && VT == MVT::f128))
11506 return DAG.getNode(
11507 Opcode: ISD::FNEG, DL: dl, VT,
11508 Operand: DAG.getNode(Opcode: ISD::FMA, DL: dl, VT, N1: Op.getOperand(i: 1), N2: Op.getOperand(i: 2),
11509 N3: DAG.getNode(Opcode: ISD::FNEG, DL: dl, VT, Operand: Op.getOperand(i: 3))));
11510 return DAG.getNode(Opcode: PPCISD::FNMSUB, DL: dl, VT, N1: Op.getOperand(i: 1),
11511 N2: Op.getOperand(i: 2), N3: Op.getOperand(i: 3));
11512 }
11513 case Intrinsic::ppc_convert_f128_to_ppcf128:
11514 case Intrinsic::ppc_convert_ppcf128_to_f128: {
11515 RTLIB::Libcall LC = IntrinsicID == Intrinsic::ppc_convert_ppcf128_to_f128
11516 ? RTLIB::CONVERT_PPCF128_F128
11517 : RTLIB::CONVERT_F128_PPCF128;
11518 MakeLibCallOptions CallOptions;
11519 std::pair<SDValue, SDValue> Result =
11520 makeLibCall(DAG, LC, RetVT: Op.getValueType(), Ops: Op.getOperand(i: 1), CallOptions,
11521 dl, Chain: SDValue());
11522 return Result.first;
11523 }
11524 case Intrinsic::ppc_maxfe:
11525 case Intrinsic::ppc_maxfl:
11526 case Intrinsic::ppc_maxfs:
11527 case Intrinsic::ppc_minfe:
11528 case Intrinsic::ppc_minfl:
11529 case Intrinsic::ppc_minfs: {
11530 EVT VT = Op.getValueType();
11531 assert(
11532 all_of(Op->ops().drop_front(4),
11533 [VT](const SDUse &Use) { return Use.getValueType() == VT; }) &&
11534 "ppc_[max|min]f[e|l|s] must have uniform type arguments");
11535 (void)VT;
11536 ISD::CondCode CC = ISD::SETGT;
11537 if (IntrinsicID == Intrinsic::ppc_minfe ||
11538 IntrinsicID == Intrinsic::ppc_minfl ||
11539 IntrinsicID == Intrinsic::ppc_minfs)
11540 CC = ISD::SETLT;
11541 unsigned I = Op.getNumOperands() - 2, Cnt = I;
11542 SDValue Res = Op.getOperand(i: I);
11543 for (--I; Cnt != 0; --Cnt, I = (--I == 0 ? (Op.getNumOperands() - 1) : I)) {
11544 Res =
11545 DAG.getSelectCC(DL: dl, LHS: Res, RHS: Op.getOperand(i: I), True: Res, False: Op.getOperand(i: I), Cond: CC);
11546 }
11547 return Res;
11548 }
11549 }
11550
11551 // If this is a lowered altivec predicate compare, CompareOpc is set to the
11552 // opcode number of the comparison.
11553 int CompareOpc;
11554 bool isDot;
11555 if (!getVectorCompareInfo(Intrin: Op, CompareOpc, isDot, Subtarget))
11556 return SDValue(); // Don't custom lower most intrinsics.
11557
11558 // If this is a non-dot comparison, make the VCMP node and we are done.
11559 if (!isDot) {
11560 SDValue Tmp = DAG.getNode(Opcode: PPCISD::VCMP, DL: dl, VT: Op.getOperand(i: 2).getValueType(),
11561 N1: Op.getOperand(i: 1), N2: Op.getOperand(i: 2),
11562 N3: DAG.getConstant(Val: CompareOpc, DL: dl, VT: MVT::i32));
11563 return DAG.getNode(Opcode: ISD::BITCAST, DL: dl, VT: Op.getValueType(), Operand: Tmp);
11564 }
11565
11566 // Create the PPCISD altivec 'dot' comparison node.
11567 SDValue Ops[] = {
11568 Op.getOperand(i: 2), // LHS
11569 Op.getOperand(i: 3), // RHS
11570 DAG.getConstant(Val: CompareOpc, DL: dl, VT: MVT::i32)
11571 };
11572 EVT VTs[] = { Op.getOperand(i: 2).getValueType(), MVT::Glue };
11573 SDValue CompNode = DAG.getNode(Opcode: PPCISD::VCMP_rec, DL: dl, ResultTys: VTs, Ops);
11574
11575 // Unpack the result based on how the target uses it.
11576 unsigned BitNo; // Bit # of CR6.
11577 bool InvertBit; // Invert result?
11578 unsigned Bitx;
11579 unsigned SetOp;
11580 switch (Op.getConstantOperandVal(i: 1)) {
11581 default: // Can't happen, don't crash on invalid number though.
11582 case 0: // Return the value of the EQ bit of CR6.
11583 BitNo = 0;
11584 InvertBit = false;
11585 Bitx = PPC::sub_eq;
11586 SetOp = PPCISD::SETBC;
11587 break;
11588 case 1: // Return the inverted value of the EQ bit of CR6.
11589 BitNo = 0;
11590 InvertBit = true;
11591 Bitx = PPC::sub_eq;
11592 SetOp = PPCISD::SETBCR;
11593 break;
11594 case 2: // Return the value of the LT bit of CR6.
11595 BitNo = 2;
11596 InvertBit = false;
11597 Bitx = PPC::sub_lt;
11598 SetOp = PPCISD::SETBC;
11599 break;
11600 case 3: // Return the inverted value of the LT bit of CR6.
11601 BitNo = 2;
11602 InvertBit = true;
11603 Bitx = PPC::sub_lt;
11604 SetOp = PPCISD::SETBCR;
11605 break;
11606 }
11607
11608 SDValue GlueOp = CompNode.getValue(R: 1);
11609 if (Subtarget.isISA3_1()) {
11610 SDValue SubRegIdx = DAG.getTargetConstant(Val: Bitx, DL: dl, VT: MVT::i32);
11611 SDValue CR6Reg = DAG.getRegister(Reg: PPC::CR6, VT: MVT::i32);
11612 SDValue CRBit =
11613 SDValue(DAG.getMachineNode(Opcode: TargetOpcode::EXTRACT_SUBREG, dl, VT: MVT::i1,
11614 Op1: CR6Reg, Op2: SubRegIdx, Op3: GlueOp),
11615 0);
11616 return DAG.getNode(Opcode: SetOp, DL: dl, VT: MVT::i32, Operand: CRBit);
11617 }
11618
11619 // Now that we have the comparison, emit a copy from the CR to a GPR.
11620 // This is flagged to the above dot comparison.
11621 SDValue Flags = DAG.getNode(Opcode: PPCISD::MFOCRF, DL: dl, VT: MVT::i32,
11622 N1: DAG.getRegister(Reg: PPC::CR6, VT: MVT::i32), N2: GlueOp);
11623
11624 // Shift the bit into the low position.
11625 Flags = DAG.getNode(Opcode: ISD::SRL, DL: dl, VT: MVT::i32, N1: Flags,
11626 N2: DAG.getConstant(Val: 8 - (3 - BitNo), DL: dl, VT: MVT::i32));
11627 // Isolate the bit.
11628 Flags = DAG.getNode(Opcode: ISD::AND, DL: dl, VT: MVT::i32, N1: Flags,
11629 N2: DAG.getConstant(Val: 1, DL: dl, VT: MVT::i32));
11630
11631 // If we are supposed to, toggle the bit.
11632 if (InvertBit)
11633 Flags = DAG.getNode(Opcode: ISD::XOR, DL: dl, VT: MVT::i32, N1: Flags,
11634 N2: DAG.getConstant(Val: 1, DL: dl, VT: MVT::i32));
11635 return Flags;
11636}
11637
11638SDValue PPCTargetLowering::LowerINTRINSIC_VOID(SDValue Op,
11639 SelectionDAG &DAG) const {
11640 // SelectionDAGBuilder::visitTargetIntrinsic may insert one extra chain to
11641 // the beginning of the argument list.
11642 int ArgStart = isa<ConstantSDNode>(Val: Op.getOperand(i: 0)) ? 0 : 1;
11643 SDLoc DL(Op);
11644 switch (Op.getConstantOperandVal(i: ArgStart)) {
11645 case Intrinsic::ppc_cfence: {
11646 assert(ArgStart == 1 && "llvm.ppc.cfence must carry a chain argument.");
11647 SDValue Val = Op.getOperand(i: ArgStart + 1);
11648 EVT Ty = Val.getValueType();
11649 if (Ty == MVT::i128) {
11650 // FIXME: Testing one of two paired registers is sufficient to guarantee
11651 // ordering?
11652 Val = DAG.getNode(Opcode: ISD::TRUNCATE, DL, VT: MVT::i64, Operand: Val);
11653 }
11654 unsigned Opcode = Subtarget.isPPC64() ? PPC::CFENCE8 : PPC::CFENCE;
11655 return SDValue(
11656 DAG.getMachineNode(
11657 Opcode, dl: DL, VT: MVT::Other,
11658 Op1: DAG.getNode(Opcode: ISD::ANY_EXTEND, DL, VT: Subtarget.getScalarIntVT(), Operand: Val),
11659 Op2: Op.getOperand(i: 0)),
11660 0);
11661 }
11662 case Intrinsic::ppc_disassemble_dmr: {
11663 assert(ArgStart == 1 &&
11664 "llvm.ppc.disassemble.dmr must carry a chain argument.");
11665 return DAG.getStore(Chain: Op.getOperand(i: 0), dl: DL, Val: Op.getOperand(i: ArgStart + 2),
11666 Ptr: Op.getOperand(i: ArgStart + 1), PtrInfo: MachinePointerInfo());
11667 }
11668 default:
11669 break;
11670 }
11671 return SDValue();
11672}
11673
11674// Lower scalar BSWAP64 to xxbrd.
11675SDValue PPCTargetLowering::LowerBSWAP(SDValue Op, SelectionDAG &DAG) const {
11676 SDLoc dl(Op);
11677 if (!Subtarget.isPPC64())
11678 return Op;
11679
11680 if (Subtarget.hasP9Vector()) {
11681 // MTVSRDD
11682 Op = DAG.getNode(Opcode: ISD::BUILD_VECTOR, DL: dl, VT: MVT::v2i64, N1: Op.getOperand(i: 0),
11683 N2: Op.getOperand(i: 0));
11684 // XXBRD
11685 Op = DAG.getNode(Opcode: ISD::BSWAP, DL: dl, VT: MVT::v2i64, Operand: Op);
11686 // MFVSRD
11687 int VectorIndex = 0;
11688 if (Subtarget.isLittleEndian())
11689 VectorIndex = 1;
11690 Op = DAG.getNode(Opcode: ISD::EXTRACT_VECTOR_ELT, DL: dl, VT: MVT::i64, N1: Op,
11691 N2: DAG.getTargetConstant(Val: VectorIndex, DL: dl, VT: MVT::i32));
11692 return Op;
11693 }
11694
11695 // For Power8, use parallel rotate instructions for faster bswap64.
11696 SDValue Input = Op.getOperand(i: 0);
11697 // Helper to create rotate-and-insert operations (RLWIMI/RLDIMI).
11698 auto CreateRotateInsert =
11699 [&](unsigned Opcode, MVT VT, SDValue Dest, SDValue Src, unsigned RotAmt,
11700 unsigned MaskBegin,
11701 std::optional<unsigned> MaskEnd = std::nullopt) -> SDValue {
11702 SmallVector<SDValue, 5> Ops = {
11703 Dest, Src, DAG.getTargetConstant(Val: RotAmt, DL: dl, VT: MVT::i32),
11704 DAG.getTargetConstant(Val: MaskBegin, DL: dl, VT: MVT::i32)};
11705 if (MaskEnd.has_value())
11706 Ops.push_back(Elt: DAG.getTargetConstant(Val: *MaskEnd, DL: dl, VT: MVT::i32));
11707
11708 return SDValue(DAG.getMachineNode(Opcode, dl, VT, Ops), 0);
11709 };
11710
11711 // Helper to perform 32-bit byte swap using rotl(8) + 2x rlwimi.
11712 auto Swap32 = [&](SDValue Val32) -> SDValue {
11713 SDValue Rot = DAG.getNode(Opcode: ISD::ROTL, DL: dl, VT: MVT::i32, N1: Val32,
11714 N2: DAG.getConstant(Val: 8, DL: dl, VT: MVT::i32));
11715 // Insert bits [24:31] from Val32 into Rot at position [0:7].
11716 SDValue Swap =
11717 CreateRotateInsert(PPC::RLWIMI, MVT::i32, Rot, Val32, 24, 0, 7);
11718 // Insert bits [16:23] from Val32 into Swap at position [16:23].
11719 return CreateRotateInsert(PPC::RLWIMI, MVT::i32, Swap, Val32, 24, 16, 23);
11720 };
11721 // Extract and swap high and low 32-bit halves independently for parallelism.
11722 SDValue Hi32 = DAG.getNode(Opcode: ISD::TRUNCATE, DL: dl, VT: MVT::i32,
11723 Operand: DAG.getNode(Opcode: ISD::SRL, DL: dl, VT: MVT::i64, N1: Input,
11724 N2: DAG.getConstant(Val: 32, DL: dl, VT: MVT::i64)));
11725 SDValue Lo32 = DAG.getNode(Opcode: ISD::TRUNCATE, DL: dl, VT: MVT::i32, Operand: Input);
11726
11727 // Combine swapped halves: rotate LoSwap left by 32 bits and insert into
11728 // HiSwap to swap their positions, completing the 64-bit byte reversal.
11729 SDValue HiSwap = DAG.getNode(Opcode: ISD::ZERO_EXTEND, DL: dl, VT: MVT::i64, Operand: Swap32(Hi32));
11730 SDValue LoSwap = DAG.getNode(Opcode: ISD::ZERO_EXTEND, DL: dl, VT: MVT::i64, Operand: Swap32(Lo32));
11731
11732 return CreateRotateInsert(PPC::RLDIMI, MVT::i64, HiSwap, LoSwap, 32, 0);
11733}
11734
11735// ATOMIC_CMP_SWAP for i8/i16 needs to zero-extend its input since it will be
11736// compared to a value that is atomically loaded (atomic loads zero-extend).
11737SDValue PPCTargetLowering::LowerATOMIC_CMP_SWAP(SDValue Op,
11738 SelectionDAG &DAG) const {
11739 assert(Op.getOpcode() == ISD::ATOMIC_CMP_SWAP &&
11740 "Expecting an atomic compare-and-swap here.");
11741 SDLoc dl(Op);
11742 auto *AtomicNode = cast<AtomicSDNode>(Val: Op.getNode());
11743 EVT MemVT = AtomicNode->getMemoryVT();
11744 if (MemVT.getSizeInBits() >= 32)
11745 return Op;
11746
11747 SDValue CmpOp = Op.getOperand(i: 2);
11748 // If this is already correctly zero-extended, leave it alone.
11749 auto HighBits = APInt::getHighBitsSet(numBits: 32, hiBitsSet: 32 - MemVT.getSizeInBits());
11750 if (DAG.MaskedValueIsZero(Op: CmpOp, Mask: HighBits))
11751 return Op;
11752
11753 // Clear the high bits of the compare operand.
11754 unsigned MaskVal = (1 << MemVT.getSizeInBits()) - 1;
11755 SDValue NewCmpOp =
11756 DAG.getNode(Opcode: ISD::AND, DL: dl, VT: MVT::i32, N1: CmpOp,
11757 N2: DAG.getConstant(Val: MaskVal, DL: dl, VT: MVT::i32));
11758
11759 // Replace the existing compare operand with the properly zero-extended one.
11760 SmallVector<SDValue, 4> Ops;
11761 for (int i = 0, e = AtomicNode->getNumOperands(); i < e; i++)
11762 Ops.push_back(Elt: AtomicNode->getOperand(Num: i));
11763 Ops[2] = NewCmpOp;
11764 MachineMemOperand *MMO = AtomicNode->getMemOperand();
11765 SDVTList Tys = DAG.getVTList(VT1: MVT::i32, VT2: MVT::Other);
11766 auto NodeTy =
11767 (MemVT == MVT::i8) ? PPCISD::ATOMIC_CMP_SWAP_8 : PPCISD::ATOMIC_CMP_SWAP_16;
11768 return DAG.getMemIntrinsicNode(Opcode: NodeTy, dl, VTList: Tys, Ops, MemVT, MMO);
11769}
11770
11771SDValue PPCTargetLowering::LowerATOMIC_LOAD_STORE(SDValue Op,
11772 SelectionDAG &DAG) const {
11773 AtomicSDNode *N = cast<AtomicSDNode>(Val: Op.getNode());
11774 EVT MemVT = N->getMemoryVT();
11775 assert(MemVT.getSimpleVT() == MVT::i128 &&
11776 "Expect quadword atomic operations");
11777 SDLoc dl(N);
11778 unsigned Opc = N->getOpcode();
11779 switch (Opc) {
11780 case ISD::ATOMIC_LOAD: {
11781 // Lower quadword atomic load to int_ppc_atomic_load_i128 which will be
11782 // lowered to ppc instructions by pattern matching instruction selector.
11783 SDVTList Tys = DAG.getVTList(VT1: MVT::i64, VT2: MVT::i64, VT3: MVT::Other);
11784 SmallVector<SDValue, 4> Ops{
11785 N->getOperand(Num: 0),
11786 DAG.getConstant(Val: Intrinsic::ppc_atomic_load_i128, DL: dl, VT: MVT::i32)};
11787 for (int I = 1, E = N->getNumOperands(); I < E; ++I)
11788 Ops.push_back(Elt: N->getOperand(Num: I));
11789 SDValue LoadedVal = DAG.getMemIntrinsicNode(Opcode: ISD::INTRINSIC_W_CHAIN, dl, VTList: Tys,
11790 Ops, MemVT, MMO: N->getMemOperand());
11791 SDValue ValLo = DAG.getNode(Opcode: ISD::ZERO_EXTEND, DL: dl, VT: MVT::i128, Operand: LoadedVal);
11792 SDValue ValHi =
11793 DAG.getNode(Opcode: ISD::ZERO_EXTEND, DL: dl, VT: MVT::i128, Operand: LoadedVal.getValue(R: 1));
11794 ValHi = DAG.getNode(Opcode: ISD::SHL, DL: dl, VT: MVT::i128, N1: ValHi,
11795 N2: DAG.getConstant(Val: 64, DL: dl, VT: MVT::i32));
11796 SDValue Val =
11797 DAG.getNode(Opcode: ISD::OR, DL: dl, ResultTys: {MVT::i128, MVT::Other}, Ops: {ValLo, ValHi});
11798 return DAG.getNode(Opcode: ISD::MERGE_VALUES, DL: dl, ResultTys: {MVT::i128, MVT::Other},
11799 Ops: {Val, LoadedVal.getValue(R: 2)});
11800 }
11801 case ISD::ATOMIC_STORE: {
11802 // Lower quadword atomic store to int_ppc_atomic_store_i128 which will be
11803 // lowered to ppc instructions by pattern matching instruction selector.
11804 SDVTList Tys = DAG.getVTList(VT: MVT::Other);
11805 SmallVector<SDValue, 4> Ops{
11806 N->getOperand(Num: 0),
11807 DAG.getConstant(Val: Intrinsic::ppc_atomic_store_i128, DL: dl, VT: MVT::i32)};
11808 SDValue Val = N->getOperand(Num: 1);
11809 SDValue ValLo = DAG.getNode(Opcode: ISD::TRUNCATE, DL: dl, VT: MVT::i64, Operand: Val);
11810 SDValue ValHi = DAG.getNode(Opcode: ISD::SRL, DL: dl, VT: MVT::i128, N1: Val,
11811 N2: DAG.getConstant(Val: 64, DL: dl, VT: MVT::i32));
11812 ValHi = DAG.getNode(Opcode: ISD::TRUNCATE, DL: dl, VT: MVT::i64, Operand: ValHi);
11813 Ops.push_back(Elt: ValLo);
11814 Ops.push_back(Elt: ValHi);
11815 Ops.push_back(Elt: N->getOperand(Num: 2));
11816 return DAG.getMemIntrinsicNode(Opcode: ISD::INTRINSIC_VOID, dl, VTList: Tys, Ops, MemVT,
11817 MMO: N->getMemOperand());
11818 }
11819 default:
11820 llvm_unreachable("Unexpected atomic opcode");
11821 }
11822}
11823
11824static SDValue getDataClassTest(SDValue Op, FPClassTest Mask, const SDLoc &Dl,
11825 SelectionDAG &DAG,
11826 const PPCSubtarget &Subtarget) {
11827 assert(Mask <= fcAllFlags && "Invalid fp_class flags!");
11828
11829 enum DataClassMask {
11830 DC_NAN = 1 << 6,
11831 DC_NEG_INF = 1 << 4,
11832 DC_POS_INF = 1 << 5,
11833 DC_NEG_ZERO = 1 << 2,
11834 DC_POS_ZERO = 1 << 3,
11835 DC_NEG_SUBNORM = 1,
11836 DC_POS_SUBNORM = 1 << 1,
11837 };
11838
11839 EVT VT = Op.getValueType();
11840
11841 unsigned TestOp = VT == MVT::f128 ? PPC::XSTSTDCQP
11842 : VT == MVT::f64 ? PPC::XSTSTDCDP
11843 : PPC::XSTSTDCSP;
11844
11845 if (Mask == fcAllFlags)
11846 return DAG.getBoolConstant(V: true, DL: Dl, VT: MVT::i1, OpVT: VT);
11847 if (Mask == 0)
11848 return DAG.getBoolConstant(V: false, DL: Dl, VT: MVT::i1, OpVT: VT);
11849
11850 // When it's cheaper or necessary to test reverse flags.
11851 if ((Mask & fcNormal) == fcNormal || Mask == ~fcQNan || Mask == ~fcSNan) {
11852 SDValue Rev = getDataClassTest(Op, Mask: ~Mask, Dl, DAG, Subtarget);
11853 return DAG.getNOT(DL: Dl, Val: Rev, VT: MVT::i1);
11854 }
11855
11856 // Power doesn't support testing whether a value is 'normal'. Test the rest
11857 // first, and test if it's 'not not-normal' with expected sign.
11858 if (Mask & fcNormal) {
11859 SDValue Rev(DAG.getMachineNode(
11860 Opcode: TestOp, dl: Dl, VT: MVT::i32,
11861 Op1: DAG.getTargetConstant(Val: DC_NAN | DC_NEG_INF | DC_POS_INF |
11862 DC_NEG_ZERO | DC_POS_ZERO |
11863 DC_NEG_SUBNORM | DC_POS_SUBNORM,
11864 DL: Dl, VT: MVT::i32),
11865 Op2: Op),
11866 0);
11867 // Sign are stored in CR bit 0, result are in CR bit 2.
11868 SDValue Sign(
11869 DAG.getMachineNode(Opcode: TargetOpcode::EXTRACT_SUBREG, dl: Dl, VT: MVT::i1, Op1: Rev,
11870 Op2: DAG.getTargetConstant(Val: PPC::sub_lt, DL: Dl, VT: MVT::i32)),
11871 0);
11872 SDValue Normal(DAG.getNOT(
11873 DL: Dl,
11874 Val: SDValue(DAG.getMachineNode(
11875 Opcode: TargetOpcode::EXTRACT_SUBREG, dl: Dl, VT: MVT::i1, Op1: Rev,
11876 Op2: DAG.getTargetConstant(Val: PPC::sub_eq, DL: Dl, VT: MVT::i32)),
11877 0),
11878 VT: MVT::i1));
11879 if (Mask & fcPosNormal)
11880 Sign = DAG.getNOT(DL: Dl, Val: Sign, VT: MVT::i1);
11881 SDValue Result = DAG.getNode(Opcode: ISD::AND, DL: Dl, VT: MVT::i1, N1: Sign, N2: Normal);
11882 if (Mask == fcPosNormal || Mask == fcNegNormal)
11883 return Result;
11884
11885 return DAG.getNode(
11886 Opcode: ISD::OR, DL: Dl, VT: MVT::i1,
11887 N1: getDataClassTest(Op, Mask: Mask & ~fcNormal, Dl, DAG, Subtarget), N2: Result);
11888 }
11889
11890 // The instruction doesn't differentiate between signaling or quiet NaN. Test
11891 // the rest first, and test if it 'is NaN and is signaling/quiet'.
11892 if ((Mask & fcNan) == fcQNan || (Mask & fcNan) == fcSNan) {
11893 bool IsQuiet = Mask & fcQNan;
11894 SDValue NanCheck = getDataClassTest(Op, Mask: fcNan, Dl, DAG, Subtarget);
11895
11896 // Quietness is determined by the first bit in fraction field.
11897 uint64_t QuietMask = 0;
11898 SDValue HighWord;
11899 if (VT == MVT::f128) {
11900 HighWord = DAG.getNode(
11901 Opcode: ISD::EXTRACT_VECTOR_ELT, DL: Dl, VT: MVT::i32, N1: DAG.getBitcast(VT: MVT::v4i32, V: Op),
11902 N2: DAG.getVectorIdxConstant(Val: Subtarget.isLittleEndian() ? 3 : 0, DL: Dl));
11903 QuietMask = 0x8000;
11904 } else if (VT == MVT::f64) {
11905 if (Subtarget.isPPC64()) {
11906 HighWord = DAG.getNode(Opcode: ISD::EXTRACT_ELEMENT, DL: Dl, VT: MVT::i32,
11907 N1: DAG.getBitcast(VT: MVT::i64, V: Op),
11908 N2: DAG.getConstant(Val: 1, DL: Dl, VT: MVT::i32));
11909 } else {
11910 SDValue Vec = DAG.getBitcast(
11911 VT: MVT::v4i32, V: DAG.getNode(Opcode: ISD::SCALAR_TO_VECTOR, DL: Dl, VT: MVT::v2f64, Operand: Op));
11912 HighWord = DAG.getNode(
11913 Opcode: ISD::EXTRACT_VECTOR_ELT, DL: Dl, VT: MVT::i32, N1: Vec,
11914 N2: DAG.getVectorIdxConstant(Val: Subtarget.isLittleEndian() ? 1 : 0, DL: Dl));
11915 }
11916 QuietMask = 0x80000;
11917 } else if (VT == MVT::f32) {
11918 HighWord = DAG.getBitcast(VT: MVT::i32, V: Op);
11919 QuietMask = 0x400000;
11920 }
11921 SDValue NanRes = DAG.getSetCC(
11922 DL: Dl, VT: MVT::i1,
11923 LHS: DAG.getNode(Opcode: ISD::AND, DL: Dl, VT: MVT::i32, N1: HighWord,
11924 N2: DAG.getConstant(Val: QuietMask, DL: Dl, VT: MVT::i32)),
11925 RHS: DAG.getConstant(Val: 0, DL: Dl, VT: MVT::i32), Cond: IsQuiet ? ISD::SETNE : ISD::SETEQ);
11926 NanRes = DAG.getNode(Opcode: ISD::AND, DL: Dl, VT: MVT::i1, N1: NanCheck, N2: NanRes);
11927 if (Mask == fcQNan || Mask == fcSNan)
11928 return NanRes;
11929
11930 return DAG.getNode(Opcode: ISD::OR, DL: Dl, VT: MVT::i1,
11931 N1: getDataClassTest(Op, Mask: Mask & ~fcNan, Dl, DAG, Subtarget),
11932 N2: NanRes);
11933 }
11934
11935 unsigned NativeMask = 0;
11936 if ((Mask & fcNan) == fcNan)
11937 NativeMask |= DC_NAN;
11938 if (Mask & fcNegInf)
11939 NativeMask |= DC_NEG_INF;
11940 if (Mask & fcPosInf)
11941 NativeMask |= DC_POS_INF;
11942 if (Mask & fcNegZero)
11943 NativeMask |= DC_NEG_ZERO;
11944 if (Mask & fcPosZero)
11945 NativeMask |= DC_POS_ZERO;
11946 if (Mask & fcNegSubnormal)
11947 NativeMask |= DC_NEG_SUBNORM;
11948 if (Mask & fcPosSubnormal)
11949 NativeMask |= DC_POS_SUBNORM;
11950 return SDValue(
11951 DAG.getMachineNode(
11952 Opcode: TargetOpcode::EXTRACT_SUBREG, dl: Dl, VT: MVT::i1,
11953 Op1: SDValue(DAG.getMachineNode(
11954 Opcode: TestOp, dl: Dl, VT: MVT::i32,
11955 Op1: DAG.getTargetConstant(Val: NativeMask, DL: Dl, VT: MVT::i32), Op2: Op),
11956 0),
11957 Op2: DAG.getTargetConstant(Val: PPC::sub_eq, DL: Dl, VT: MVT::i32)),
11958 0);
11959}
11960
11961SDValue PPCTargetLowering::LowerIS_FPCLASS(SDValue Op,
11962 SelectionDAG &DAG) const {
11963 assert(Subtarget.hasP9Vector() && "Test data class requires Power9");
11964 SDValue LHS = Op.getOperand(i: 0);
11965 uint64_t RHSC = Op.getConstantOperandVal(i: 1);
11966 SDLoc Dl(Op);
11967 FPClassTest Category = static_cast<FPClassTest>(RHSC);
11968 if (LHS.getValueType() == MVT::ppcf128) {
11969 // The higher part determines the value class.
11970 LHS = DAG.getNode(Opcode: ISD::EXTRACT_ELEMENT, DL: Dl, VT: MVT::f64, N1: LHS,
11971 N2: DAG.getConstant(Val: 1, DL: Dl, VT: MVT::i32));
11972 }
11973
11974 return getDataClassTest(Op: LHS, Mask: Category, Dl, DAG, Subtarget);
11975}
11976
11977// Adjust the length value for a load/store with length to account for the
11978// instructions requiring a left justified length, and for non-byte element
11979// types requiring scaling by element size.
11980static SDValue AdjustLength(SDValue Val, unsigned Bits, bool Left,
11981 SelectionDAG &DAG) {
11982 SDLoc dl(Val);
11983 EVT VT = Val->getValueType(ResNo: 0);
11984 unsigned LeftAdj = Left ? VT.getSizeInBits() - 8 : 0;
11985 unsigned TypeAdj = llvm::countr_zero<uint32_t>(Val: Bits / 8);
11986 SDValue SHLAmt = DAG.getConstant(Val: LeftAdj + TypeAdj, DL: dl, VT);
11987 return DAG.getNode(Opcode: ISD::SHL, DL: dl, VT, N1: Val, N2: SHLAmt);
11988}
11989
11990SDValue PPCTargetLowering::LowerVP_LOAD(SDValue Op, SelectionDAG &DAG) const {
11991 auto VPLD = cast<VPLoadSDNode>(Val&: Op);
11992 bool Future = Subtarget.isISAFuture();
11993 SDLoc dl(Op);
11994 assert(ISD::isConstantSplatVectorAllOnes(Op->getOperand(3).getNode(), true) &&
11995 "Mask predication not supported");
11996 EVT PtrVT = getPointerTy(DL: DAG.getDataLayout());
11997 SDValue Len = DAG.getNode(Opcode: ISD::ANY_EXTEND, DL: dl, VT: PtrVT, Operand: VPLD->getOperand(Num: 4));
11998 unsigned IID = Future ? Intrinsic::ppc_vsx_lxvrl : Intrinsic::ppc_vsx_lxvl;
11999 unsigned EltBits = Op->getValueType(ResNo: 0).getScalarType().getSizeInBits();
12000 Len = AdjustLength(Val: Len, Bits: EltBits, Left: !Future, DAG);
12001 SDValue Ops[] = {VPLD->getChain(), DAG.getConstant(Val: IID, DL: dl, VT: MVT::i32),
12002 VPLD->getOperand(Num: 1), Len};
12003 SDVTList Tys = DAG.getVTList(VT1: Op->getValueType(ResNo: 0), VT2: MVT::Other);
12004 SDValue VPL =
12005 DAG.getMemIntrinsicNode(Opcode: ISD::INTRINSIC_W_CHAIN, dl, VTList: Tys, Ops,
12006 MemVT: VPLD->getMemoryVT(), MMO: VPLD->getMemOperand());
12007 return VPL;
12008}
12009
12010SDValue PPCTargetLowering::LowerVP_STORE(SDValue Op, SelectionDAG &DAG) const {
12011 auto VPST = cast<VPStoreSDNode>(Val&: Op);
12012 assert(ISD::isConstantSplatVectorAllOnes(Op->getOperand(4).getNode(), true) &&
12013 "Mask predication not supported");
12014 EVT PtrVT = getPointerTy(DL: DAG.getDataLayout());
12015 SDLoc dl(Op);
12016 SDValue Len = DAG.getNode(Opcode: ISD::ANY_EXTEND, DL: dl, VT: PtrVT, Operand: VPST->getOperand(Num: 5));
12017 unsigned EltBits =
12018 Op->getOperand(Num: 1).getValueType().getScalarType().getSizeInBits();
12019 bool Future = Subtarget.isISAFuture();
12020 unsigned IID = Future ? Intrinsic::ppc_vsx_stxvrl : Intrinsic::ppc_vsx_stxvl;
12021 Len = AdjustLength(Val: Len, Bits: EltBits, Left: !Future, DAG);
12022 SDValue Ops[] = {
12023 VPST->getChain(), DAG.getConstant(Val: IID, DL: dl, VT: MVT::i32),
12024 DAG.getNode(Opcode: ISD::BITCAST, DL: dl, VT: MVT::v4i32, Operand: VPST->getOperand(Num: 1)),
12025 VPST->getOperand(Num: 2), Len};
12026 SDVTList Tys = DAG.getVTList(VT: MVT::Other);
12027 SDValue VPS =
12028 DAG.getMemIntrinsicNode(Opcode: ISD::INTRINSIC_VOID, dl, VTList: Tys, Ops,
12029 MemVT: VPST->getMemoryVT(), MMO: VPST->getMemOperand());
12030 return VPS;
12031}
12032
12033SDValue PPCTargetLowering::LowerPartialReduce(SDValue Op,
12034 SelectionDAG &DAG) const {
12035 SDValue Acc = Op.getOperand(i: 0);
12036 SDValue Op1 = Op.getOperand(i: 1);
12037 SDValue Op2 = Op.getOperand(i: 2);
12038
12039 assert(Op.getOpcode() == ISD::PARTIAL_REDUCE_UMLA &&
12040 "Unexpected partial reduction");
12041
12042 if (Acc.getValueType() != MVT::v4i32)
12043 return SDValue();
12044 if (Op1.getValueType() != MVT::v16i32 || Op1.getOpcode() != ISD::SIGN_EXTEND)
12045 return SDValue();
12046 SDValue Op1Input = Op1.getOperand(i: 0);
12047 if (Op1Input.getValueType() != MVT::v16i8 || !llvm::isOneOrOneSplat(V: Op2))
12048 return SDValue();
12049
12050 SDLoc dl(Op);
12051 SDValue Ones = DAG.getConstant(Val: 1, DL: dl, VT: MVT::v16i8);
12052 return DAG.getNode(Opcode: ISD::PARTIAL_REDUCE_SUMLA, DL: dl, VT: MVT::v4i32, N1: Acc, N2: Op1Input,
12053 N3: Ones);
12054}
12055
12056SDValue PPCTargetLowering::LowerSCALAR_TO_VECTOR(SDValue Op,
12057 SelectionDAG &DAG) const {
12058 SDLoc dl(Op);
12059
12060 MachineFunction &MF = DAG.getMachineFunction();
12061 SDValue Op0 = Op.getOperand(i: 0);
12062 EVT ValVT = Op0.getValueType();
12063 unsigned EltSize = Op.getValueType().getScalarSizeInBits();
12064 if (isa<ConstantSDNode>(Val: Op0) && EltSize <= 32) {
12065 int64_t IntVal = Op.getConstantOperandVal(i: 0);
12066 if (IntVal >= -16 && IntVal <= 15)
12067 return getCanonicalConstSplat(Val: IntVal, SplatSize: EltSize / 8, VT: Op.getValueType(), DAG,
12068 dl);
12069 }
12070
12071 ReuseLoadInfo RLI;
12072 if (Subtarget.hasLFIWAX() && Subtarget.hasVSX() &&
12073 Op.getValueType() == MVT::v4i32 && Op0.getOpcode() == ISD::LOAD &&
12074 Op0.getValueType() == MVT::i32 && Op0.hasOneUse() &&
12075 canReuseLoadAddress(Op: Op0, MemVT: MVT::i32, RLI, DAG, ET: ISD::NON_EXTLOAD)) {
12076
12077 MachineMemOperand *MMO =
12078 MF.getMachineMemOperand(PtrInfo: RLI.MPI, F: MachineMemOperand::MOLoad, Size: 4,
12079 BaseAlignment: RLI.Alignment, AAInfo: RLI.AAInfo, Ranges: RLI.Ranges);
12080 SDValue Ops[] = {RLI.Chain, RLI.Ptr, DAG.getValueType(Op.getValueType())};
12081 SDValue Bits = DAG.getMemIntrinsicNode(
12082 Opcode: PPCISD::LD_SPLAT, dl, VTList: DAG.getVTList(VT1: MVT::v4i32, VT2: MVT::Other), Ops,
12083 MemVT: MVT::i32, MMO);
12084 if (RLI.ResChain)
12085 DAG.makeEquivalentMemoryOrdering(OldChain: RLI.ResChain, NewMemOpChain: Bits.getValue(R: 1));
12086 return Bits.getValue(R: 0);
12087 }
12088
12089 // Create a stack slot that is 16-byte aligned.
12090 MachineFrameInfo &MFI = MF.getFrameInfo();
12091 int FrameIdx = MFI.CreateStackObject(Size: 16, Alignment: Align(16), isSpillSlot: false);
12092 EVT PtrVT = getPointerTy(DL: DAG.getDataLayout());
12093 SDValue FIdx = DAG.getFrameIndex(FI: FrameIdx, VT: PtrVT);
12094
12095 SDValue Val = Op0;
12096 // P10 hardware store forwarding requires that a single store contains all
12097 // the data for the load. P10 is able to merge a pair of adjacent stores. Try
12098 // to avoid load hit store on P10 when running binaries compiled for older
12099 // processors by generating two mergeable scalar stores to forward with the
12100 // vector load.
12101 if (!DisableP10StoreForward && Subtarget.isPPC64() &&
12102 !Subtarget.isLittleEndian() && ValVT.isInteger() &&
12103 ValVT.getSizeInBits() <= 64) {
12104 Val = DAG.getNode(Opcode: ISD::ANY_EXTEND, DL: dl, VT: MVT::i64, Operand: Val);
12105 EVT ShiftAmountTy = getShiftAmountTy(LHSTy: MVT::i64, DL: DAG.getDataLayout());
12106 SDValue ShiftBy = DAG.getConstant(
12107 Val: 64 - Op.getValueType().getScalarSizeInBits(), DL: dl, VT: ShiftAmountTy);
12108 Val = DAG.getNode(Opcode: ISD::SHL, DL: dl, VT: MVT::i64, N1: Val, N2: ShiftBy);
12109 SDValue Plus8 =
12110 DAG.getNode(Opcode: ISD::ADD, DL: dl, VT: PtrVT, N1: FIdx, N2: DAG.getConstant(Val: 8, DL: dl, VT: PtrVT));
12111 SDValue Store2 =
12112 DAG.getStore(Chain: DAG.getEntryNode(), dl, Val, Ptr: Plus8, PtrInfo: MachinePointerInfo());
12113 SDValue Store = DAG.getStore(Chain: Store2, dl, Val, Ptr: FIdx, PtrInfo: MachinePointerInfo());
12114 return DAG.getLoad(VT: Op.getValueType(), dl, Chain: Store, Ptr: FIdx,
12115 PtrInfo: MachinePointerInfo());
12116 }
12117
12118 // Store the input value into Value#0 of the stack slot.
12119 SDValue Store =
12120 DAG.getStore(Chain: DAG.getEntryNode(), dl, Val, Ptr: FIdx, PtrInfo: MachinePointerInfo());
12121 // Load it out.
12122 return DAG.getLoad(VT: Op.getValueType(), dl, Chain: Store, Ptr: FIdx, PtrInfo: MachinePointerInfo());
12123}
12124
12125SDValue PPCTargetLowering::LowerINSERT_VECTOR_ELT(SDValue Op,
12126 SelectionDAG &DAG) const {
12127 assert(Op.getOpcode() == ISD::INSERT_VECTOR_ELT &&
12128 "Should only be called for ISD::INSERT_VECTOR_ELT");
12129
12130 ConstantSDNode *C = dyn_cast<ConstantSDNode>(Val: Op.getOperand(i: 2));
12131
12132 EVT VT = Op.getValueType();
12133 SDLoc dl(Op);
12134 SDValue V1 = Op.getOperand(i: 0);
12135 SDValue V2 = Op.getOperand(i: 1);
12136
12137 if (VT == MVT::v2f64 && C)
12138 return Op;
12139
12140 if (Subtarget.hasP9Vector()) {
12141 // A f32 load feeding into a v4f32 insert_vector_elt is handled in this way
12142 // because on P10, it allows this specific insert_vector_elt load pattern to
12143 // utilize the refactored load and store infrastructure in order to exploit
12144 // prefixed loads.
12145 // On targets with inexpensive direct moves (Power9 and up), a
12146 // (insert_vector_elt v4f32:$vec, (f32 load)) is always better as an integer
12147 // load since a single precision load will involve conversion to double
12148 // precision on the load followed by another conversion to single precision.
12149 if ((VT == MVT::v4f32) && (V2.getValueType() == MVT::f32) &&
12150 (isa<LoadSDNode>(Val: V2))) {
12151 SDValue BitcastVector = DAG.getBitcast(VT: MVT::v4i32, V: V1);
12152 SDValue BitcastLoad = DAG.getBitcast(VT: MVT::i32, V: V2);
12153 SDValue InsVecElt =
12154 DAG.getNode(Opcode: ISD::INSERT_VECTOR_ELT, DL: dl, VT: MVT::v4i32, N1: BitcastVector,
12155 N2: BitcastLoad, N3: Op.getOperand(i: 2));
12156 return DAG.getBitcast(VT: MVT::v4f32, V: InsVecElt);
12157 }
12158 }
12159
12160 if (Subtarget.isISA3_1()) {
12161 if ((VT == MVT::v2i64 || VT == MVT::v2f64) && !Subtarget.isPPC64())
12162 return SDValue();
12163 // On P10, we have legal lowering for constant and variable indices for
12164 // all vectors.
12165 if (VT == MVT::v16i8 || VT == MVT::v8i16 || VT == MVT::v4i32 ||
12166 VT == MVT::v2i64 || VT == MVT::v4f32 || VT == MVT::v2f64)
12167 return Op;
12168 }
12169
12170 // Before P10, we have legal lowering for constant indices but not for
12171 // variable ones.
12172 if (!C)
12173 return SDValue();
12174
12175 // We can use MTVSRZ + VECINSERT for v8i16 and v16i8 types.
12176 if (VT == MVT::v8i16 || VT == MVT::v16i8) {
12177 SDValue Mtvsrz = DAG.getNode(Opcode: PPCISD::MTVSRZ, DL: dl, VT, Operand: V2);
12178 unsigned BytesInEachElement = VT.getVectorElementType().getSizeInBits() / 8;
12179 unsigned InsertAtElement = C->getZExtValue();
12180 unsigned InsertAtByte = InsertAtElement * BytesInEachElement;
12181 if (Subtarget.isLittleEndian()) {
12182 InsertAtByte = (16 - BytesInEachElement) - InsertAtByte;
12183 }
12184 return DAG.getNode(Opcode: PPCISD::VECINSERT, DL: dl, VT, N1: V1, N2: Mtvsrz,
12185 N3: DAG.getConstant(Val: InsertAtByte, DL: dl, VT: MVT::i32));
12186 }
12187 return Op;
12188}
12189
12190SDValue PPCTargetLowering::LowerDMFVectorLoad(SDValue Op,
12191 SelectionDAG &DAG) const {
12192 SDLoc dl(Op);
12193 LoadSDNode *LN = cast<LoadSDNode>(Val: Op.getNode());
12194 SDValue LoadChain = LN->getChain();
12195 SDValue BasePtr = LN->getBasePtr();
12196 EVT VT = Op.getValueType();
12197 bool IsV1024i1 = VT == MVT::v1024i1;
12198 bool IsV2048i1 = VT == MVT::v2048i1;
12199
12200 // The types v1024i1 and v2048i1 are used for Dense Math dmr registers and
12201 // Dense Math dmr pair registers, respectively.
12202 assert((IsV1024i1 || IsV2048i1) && "Unsupported type.");
12203 (void)IsV2048i1;
12204 assert((Subtarget.hasMMA() && Subtarget.isISAFuture()) &&
12205 "Dense Math support required.");
12206 assert(Subtarget.pairedVectorMemops() && "Vector pair support required.");
12207
12208 SmallVector<SDValue, 8> Loads;
12209 SmallVector<SDValue, 8> LoadChains;
12210
12211 SDValue IntrinID = DAG.getConstant(Val: Intrinsic::ppc_vsx_lxvp, DL: dl, VT: MVT::i32);
12212 SDValue LoadOps[] = {LoadChain, IntrinID, BasePtr};
12213 MachineMemOperand *MMO = LN->getMemOperand();
12214 unsigned NumVecs = VT.getSizeInBits() / 256;
12215 for (unsigned Idx = 0; Idx < NumVecs; ++Idx) {
12216 MachineMemOperand *NewMMO =
12217 DAG.getMachineFunction().getMachineMemOperand(MMO, Offset: Idx * 32, Size: 32);
12218 if (Idx > 0) {
12219 BasePtr = DAG.getNode(Opcode: ISD::ADD, DL: dl, VT: BasePtr.getValueType(), N1: BasePtr,
12220 N2: DAG.getConstant(Val: 32, DL: dl, VT: BasePtr.getValueType()));
12221 LoadOps[2] = BasePtr;
12222 }
12223 SDValue Ld = DAG.getMemIntrinsicNode(Opcode: ISD::INTRINSIC_W_CHAIN, dl,
12224 VTList: DAG.getVTList(VT1: MVT::v256i1, VT2: MVT::Other),
12225 Ops: LoadOps, MemVT: MVT::v256i1, MMO: NewMMO);
12226 LoadChains.push_back(Elt: Ld.getValue(R: 1));
12227 Loads.push_back(Elt: Ld);
12228 }
12229
12230 if (Subtarget.isLittleEndian()) {
12231 std::reverse(first: Loads.begin(), last: Loads.end());
12232 std::reverse(first: LoadChains.begin(), last: LoadChains.end());
12233 }
12234
12235 SDValue TF = DAG.getNode(Opcode: ISD::TokenFactor, DL: dl, VT: MVT::Other, Ops: LoadChains);
12236 SDValue Value = DMFInsert1024(Pairs: Loads, dl, DAG);
12237
12238 if (IsV1024i1) {
12239 return DAG.getMergeValues(Ops: {Value, TF}, dl);
12240 }
12241
12242 // Handle Loads for V2048i1 which represents a dmr pair.
12243 SmallVector<SDValue, 4> MoreLoads{Loads[4], Loads[5], Loads[6], Loads[7]};
12244 SDValue Dmr1Value = DMFInsert1024(Pairs: MoreLoads, dl, DAG);
12245
12246 SDValue Dmr0Sub = DAG.getTargetConstant(Val: PPC::sub_dmr0, DL: dl, VT: MVT::i32);
12247 SDValue Dmr1Sub = DAG.getTargetConstant(Val: PPC::sub_dmr1, DL: dl, VT: MVT::i32);
12248
12249 SDValue DmrPRC = DAG.getTargetConstant(Val: PPC::DMRpRCRegClassID, DL: dl, VT: MVT::i32);
12250 const SDValue DmrPOps[] = {DmrPRC, Value, Dmr0Sub, Dmr1Value, Dmr1Sub};
12251
12252 SDValue DmrPValue = SDValue(
12253 DAG.getMachineNode(Opcode: PPC::REG_SEQUENCE, dl, VT: MVT::v2048i1, Ops: DmrPOps), 0);
12254
12255 return DAG.getMergeValues(Ops: {DmrPValue, TF}, dl);
12256}
12257
12258SDValue PPCTargetLowering::DMFInsert1024(const SmallVectorImpl<SDValue> &Pairs,
12259 const SDLoc &dl,
12260 SelectionDAG &DAG) const {
12261 SDValue Lo =
12262 DAG.getNode(Opcode: PPCISD::INST512, DL: dl, VT: MVT::v512i1, N1: Pairs[0], N2: Pairs[1]);
12263 SDValue LoSub = DAG.getTargetConstant(Val: PPC::sub_wacc_lo, DL: dl, VT: MVT::i32);
12264 SDValue Hi =
12265 DAG.getNode(Opcode: PPCISD::INST512HI, DL: dl, VT: MVT::v512i1, N1: Pairs[2], N2: Pairs[3]);
12266 SDValue HiSub = DAG.getTargetConstant(Val: PPC::sub_wacc_hi, DL: dl, VT: MVT::i32);
12267 SDValue RC = DAG.getTargetConstant(Val: PPC::DMRRCRegClassID, DL: dl, VT: MVT::i32);
12268
12269 return SDValue(DAG.getMachineNode(Opcode: PPC::REG_SEQUENCE, dl, VT: MVT::v1024i1,
12270 Ops: {RC, Lo, LoSub, Hi, HiSub}),
12271 0);
12272}
12273
12274SDValue PPCTargetLowering::LowerVectorLoad(SDValue Op,
12275 SelectionDAG &DAG) const {
12276 SDLoc dl(Op);
12277 LoadSDNode *LN = cast<LoadSDNode>(Val: Op.getNode());
12278 SDValue LoadChain = LN->getChain();
12279 SDValue BasePtr = LN->getBasePtr();
12280 EVT VT = Op.getValueType();
12281
12282 if (VT == MVT::v1024i1 || VT == MVT::v2048i1)
12283 return LowerDMFVectorLoad(Op, DAG);
12284
12285 if (VT != MVT::v256i1 && VT != MVT::v512i1)
12286 return Op;
12287
12288 // Type v256i1 is used for pairs and v512i1 is used for accumulators.
12289 assert((VT != MVT::v512i1 || Subtarget.hasMMA()) &&
12290 "Type unsupported without MMA");
12291 assert((VT != MVT::v256i1 || Subtarget.pairedVectorMemops()) &&
12292 "Type unsupported without paired vector support");
12293
12294 // For v256i1 on ISA Future, let the load go through to instruction selection
12295 // where it will be matched to lxvp/plxvp by the instruction patterns.
12296 if (VT == MVT::v256i1 && Subtarget.isISAFuture())
12297 return Op;
12298
12299 // For other cases, create 2 or 4 v16i8 loads to load the pair or accumulator
12300 // value in 2 or 4 vsx registers.
12301 Align Alignment = LN->getAlign();
12302 SmallVector<SDValue, 4> Loads;
12303 SmallVector<SDValue, 4> LoadChains;
12304 unsigned NumVecs = VT.getSizeInBits() / 128;
12305 for (unsigned Idx = 0; Idx < NumVecs; ++Idx) {
12306 SDValue Load =
12307 DAG.getLoad(VT: MVT::v16i8, dl, Chain: LoadChain, Ptr: BasePtr,
12308 PtrInfo: LN->getPointerInfo().getWithOffset(O: Idx * 16),
12309 Alignment: commonAlignment(A: Alignment, Offset: Idx * 16),
12310 MMOFlags: LN->getMemOperand()->getFlags(), AAInfo: LN->getAAInfo());
12311 BasePtr = DAG.getNode(Opcode: ISD::ADD, DL: dl, VT: BasePtr.getValueType(), N1: BasePtr,
12312 N2: DAG.getConstant(Val: 16, DL: dl, VT: BasePtr.getValueType()));
12313 Loads.push_back(Elt: Load);
12314 LoadChains.push_back(Elt: Load.getValue(R: 1));
12315 }
12316 if (Subtarget.isLittleEndian()) {
12317 std::reverse(first: Loads.begin(), last: Loads.end());
12318 std::reverse(first: LoadChains.begin(), last: LoadChains.end());
12319 }
12320 SDValue TF = DAG.getNode(Opcode: ISD::TokenFactor, DL: dl, VT: MVT::Other, Ops: LoadChains);
12321 SDValue Value =
12322 DAG.getNode(Opcode: VT == MVT::v512i1 ? PPCISD::ACC_BUILD : PPCISD::PAIR_BUILD,
12323 DL: dl, VT, Ops: Loads);
12324 SDValue RetOps[] = {Value, TF};
12325 return DAG.getMergeValues(Ops: RetOps, dl);
12326}
12327
12328SDValue PPCTargetLowering::LowerDMFVectorStore(SDValue Op,
12329 SelectionDAG &DAG) const {
12330
12331 SDLoc dl(Op);
12332 StoreSDNode *SN = cast<StoreSDNode>(Val: Op.getNode());
12333 SDValue StoreChain = SN->getChain();
12334 SDValue BasePtr = SN->getBasePtr();
12335 SmallVector<SDValue, 8> Values;
12336 SmallVector<SDValue, 8> Stores;
12337 EVT VT = SN->getValue().getValueType();
12338 bool IsV1024i1 = VT == MVT::v1024i1;
12339 bool IsV2048i1 = VT == MVT::v2048i1;
12340
12341 // The types v1024i1 and v2048i1 are used for Dense Math dmr registers and
12342 // Dense Math dmr pair registers, respectively.
12343 assert((IsV1024i1 || IsV2048i1) && "Unsupported type.");
12344 (void)IsV2048i1;
12345 assert((Subtarget.hasMMA() && Subtarget.isISAFuture()) &&
12346 "Dense Math support required.");
12347 assert(Subtarget.pairedVectorMemops() && "Vector pair support required.");
12348
12349 EVT ReturnTypes[] = {MVT::v256i1, MVT::v256i1};
12350 if (IsV1024i1) {
12351 SDValue Lo(DAG.getMachineNode(
12352 Opcode: TargetOpcode::EXTRACT_SUBREG, dl, VT: MVT::v512i1,
12353 Op1: Op.getOperand(i: 1),
12354 Op2: DAG.getTargetConstant(Val: PPC::sub_wacc_lo, DL: dl, VT: MVT::i32)),
12355 0);
12356 SDValue Hi(DAG.getMachineNode(
12357 Opcode: TargetOpcode::EXTRACT_SUBREG, dl, VT: MVT::v512i1,
12358 Op1: Op.getOperand(i: 1),
12359 Op2: DAG.getTargetConstant(Val: PPC::sub_wacc_hi, DL: dl, VT: MVT::i32)),
12360 0);
12361 MachineSDNode *ExtNode =
12362 DAG.getMachineNode(Opcode: PPC::DMXXEXTFDMR512, dl, ResultTys: ReturnTypes, Ops: Lo);
12363 Values.push_back(Elt: SDValue(ExtNode, 0));
12364 Values.push_back(Elt: SDValue(ExtNode, 1));
12365 ExtNode = DAG.getMachineNode(Opcode: PPC::DMXXEXTFDMR512_HI, dl, ResultTys: ReturnTypes, Ops: Hi);
12366 Values.push_back(Elt: SDValue(ExtNode, 0));
12367 Values.push_back(Elt: SDValue(ExtNode, 1));
12368 } else {
12369 // This corresponds to v2048i1 which represents a dmr pair.
12370 SDValue Dmr0(
12371 DAG.getMachineNode(Opcode: TargetOpcode::EXTRACT_SUBREG, dl, VT: MVT::v1024i1,
12372 Op1: Op.getOperand(i: 1),
12373 Op2: DAG.getTargetConstant(Val: PPC::sub_dmr0, DL: dl, VT: MVT::i32)),
12374 0);
12375
12376 SDValue Dmr1(
12377 DAG.getMachineNode(Opcode: TargetOpcode::EXTRACT_SUBREG, dl, VT: MVT::v1024i1,
12378 Op1: Op.getOperand(i: 1),
12379 Op2: DAG.getTargetConstant(Val: PPC::sub_dmr1, DL: dl, VT: MVT::i32)),
12380 0);
12381
12382 SDValue Dmr0Lo(DAG.getMachineNode(
12383 Opcode: TargetOpcode::EXTRACT_SUBREG, dl, VT: MVT::v512i1, Op1: Dmr0,
12384 Op2: DAG.getTargetConstant(Val: PPC::sub_wacc_lo, DL: dl, VT: MVT::i32)),
12385 0);
12386
12387 SDValue Dmr0Hi(DAG.getMachineNode(
12388 Opcode: TargetOpcode::EXTRACT_SUBREG, dl, VT: MVT::v512i1, Op1: Dmr0,
12389 Op2: DAG.getTargetConstant(Val: PPC::sub_wacc_hi, DL: dl, VT: MVT::i32)),
12390 0);
12391
12392 SDValue Dmr1Lo(DAG.getMachineNode(
12393 Opcode: TargetOpcode::EXTRACT_SUBREG, dl, VT: MVT::v512i1, Op1: Dmr1,
12394 Op2: DAG.getTargetConstant(Val: PPC::sub_wacc_lo, DL: dl, VT: MVT::i32)),
12395 0);
12396
12397 SDValue Dmr1Hi(DAG.getMachineNode(
12398 Opcode: TargetOpcode::EXTRACT_SUBREG, dl, VT: MVT::v512i1, Op1: Dmr1,
12399 Op2: DAG.getTargetConstant(Val: PPC::sub_wacc_hi, DL: dl, VT: MVT::i32)),
12400 0);
12401
12402 MachineSDNode *ExtNode =
12403 DAG.getMachineNode(Opcode: PPC::DMXXEXTFDMR512, dl, ResultTys: ReturnTypes, Ops: Dmr0Lo);
12404 Values.push_back(Elt: SDValue(ExtNode, 0));
12405 Values.push_back(Elt: SDValue(ExtNode, 1));
12406 ExtNode =
12407 DAG.getMachineNode(Opcode: PPC::DMXXEXTFDMR512_HI, dl, ResultTys: ReturnTypes, Ops: Dmr0Hi);
12408 Values.push_back(Elt: SDValue(ExtNode, 0));
12409 Values.push_back(Elt: SDValue(ExtNode, 1));
12410 ExtNode = DAG.getMachineNode(Opcode: PPC::DMXXEXTFDMR512, dl, ResultTys: ReturnTypes, Ops: Dmr1Lo);
12411 Values.push_back(Elt: SDValue(ExtNode, 0));
12412 Values.push_back(Elt: SDValue(ExtNode, 1));
12413 ExtNode =
12414 DAG.getMachineNode(Opcode: PPC::DMXXEXTFDMR512_HI, dl, ResultTys: ReturnTypes, Ops: Dmr1Hi);
12415 Values.push_back(Elt: SDValue(ExtNode, 0));
12416 Values.push_back(Elt: SDValue(ExtNode, 1));
12417 }
12418
12419 if (Subtarget.isLittleEndian())
12420 std::reverse(first: Values.begin(), last: Values.end());
12421
12422 SDVTList Tys = DAG.getVTList(VT: MVT::Other);
12423 SmallVector<SDValue, 4> Ops{
12424 StoreChain, DAG.getConstant(Val: Intrinsic::ppc_vsx_stxvp, DL: dl, VT: MVT::i32),
12425 Values[0], BasePtr};
12426 MachineMemOperand *MMO = SN->getMemOperand();
12427 unsigned NumVecs = VT.getSizeInBits() / 256;
12428 for (unsigned Idx = 0; Idx < NumVecs; ++Idx) {
12429 MachineMemOperand *NewMMO =
12430 DAG.getMachineFunction().getMachineMemOperand(MMO, Offset: Idx * 32, Size: 32);
12431 if (Idx > 0) {
12432 BasePtr = DAG.getNode(Opcode: ISD::ADD, DL: dl, VT: BasePtr.getValueType(), N1: BasePtr,
12433 N2: DAG.getConstant(Val: 32, DL: dl, VT: BasePtr.getValueType()));
12434 Ops[3] = BasePtr;
12435 }
12436 Ops[2] = Values[Idx];
12437 SDValue St = DAG.getMemIntrinsicNode(Opcode: ISD::INTRINSIC_VOID, dl, VTList: Tys, Ops,
12438 MemVT: MVT::v256i1, MMO: NewMMO);
12439 Stores.push_back(Elt: St);
12440 }
12441
12442 SDValue TF = DAG.getTokenFactor(DL: dl, Vals&: Stores);
12443 return TF;
12444}
12445
12446SDValue PPCTargetLowering::LowerVectorStore(SDValue Op,
12447 SelectionDAG &DAG) const {
12448 SDLoc dl(Op);
12449 StoreSDNode *SN = cast<StoreSDNode>(Val: Op.getNode());
12450 SDValue StoreChain = SN->getChain();
12451 SDValue BasePtr = SN->getBasePtr();
12452 SDValue Value = SN->getValue();
12453 SDValue Value2 = SN->getValue();
12454 EVT StoreVT = Value.getValueType();
12455
12456 if (StoreVT == MVT::v1024i1 || StoreVT == MVT::v2048i1)
12457 return LowerDMFVectorStore(Op, DAG);
12458
12459 if (StoreVT != MVT::v256i1 && StoreVT != MVT::v512i1)
12460 return Op;
12461
12462 // Type v256i1 is used for pairs and v512i1 is used for accumulators.
12463 assert((StoreVT != MVT::v512i1 || Subtarget.hasMMA()) &&
12464 "Type unsupported without MMA");
12465 assert((StoreVT != MVT::v256i1 || Subtarget.pairedVectorMemops()) &&
12466 "Type unsupported without paired vector support");
12467
12468 // For v256i1 on ISA Future, let the store go through to instruction selection
12469 // where it will be matched to stxvp/pstxvp by the instruction patterns.
12470 if (StoreVT == MVT::v256i1 && Subtarget.isISAFuture() &&
12471 !DisableAutoPairedVecSt)
12472 return Op;
12473
12474 // For other cases, create 2 or 4 v16i8 stores to store the pair or
12475 // accumulator underlying registers individually.
12476 Align Alignment = SN->getAlign();
12477 SmallVector<SDValue, 4> Stores;
12478 unsigned NumVecs = 2;
12479 if (StoreVT == MVT::v512i1) {
12480 if (Subtarget.isISAFuture()) {
12481 EVT ReturnTypes[] = {MVT::v256i1, MVT::v256i1};
12482 MachineSDNode *ExtNode = DAG.getMachineNode(
12483 Opcode: PPC::DMXXEXTFDMR512, dl, ResultTys: ReturnTypes, Ops: Op.getOperand(i: 1));
12484
12485 Value = SDValue(ExtNode, 0);
12486 Value2 = SDValue(ExtNode, 1);
12487 } else
12488 Value = DAG.getNode(Opcode: PPCISD::XXMFACC, DL: dl, VT: MVT::v512i1, Operand: Value);
12489 NumVecs = 4;
12490 }
12491 for (unsigned Idx = 0; Idx < NumVecs; ++Idx) {
12492 unsigned VecNum = Subtarget.isLittleEndian() ? NumVecs - 1 - Idx : Idx;
12493 SDValue Elt;
12494 if (Subtarget.isISAFuture()) {
12495 VecNum = Subtarget.isLittleEndian() ? 1 - (Idx % 2) : (Idx % 2);
12496 Elt = DAG.getNode(Opcode: PPCISD::EXTRACT_VSX_REG, DL: dl, VT: MVT::v16i8,
12497 N1: Idx > 1 ? Value2 : Value,
12498 N2: DAG.getConstant(Val: VecNum, DL: dl, VT: getPointerTy(DL: DAG.getDataLayout())));
12499 } else
12500 Elt = DAG.getNode(Opcode: PPCISD::EXTRACT_VSX_REG, DL: dl, VT: MVT::v16i8, N1: Value,
12501 N2: DAG.getConstant(Val: VecNum, DL: dl, VT: getPointerTy(DL: DAG.getDataLayout())));
12502
12503 SDValue Store =
12504 DAG.getStore(Chain: StoreChain, dl, Val: Elt, Ptr: BasePtr,
12505 PtrInfo: SN->getPointerInfo().getWithOffset(O: Idx * 16),
12506 Alignment: commonAlignment(A: Alignment, Offset: Idx * 16),
12507 MMOFlags: SN->getMemOperand()->getFlags(), AAInfo: SN->getAAInfo());
12508 BasePtr = DAG.getNode(Opcode: ISD::ADD, DL: dl, VT: BasePtr.getValueType(), N1: BasePtr,
12509 N2: DAG.getConstant(Val: 16, DL: dl, VT: BasePtr.getValueType()));
12510 Stores.push_back(Elt: Store);
12511 }
12512 SDValue TF = DAG.getTokenFactor(DL: dl, Vals&: Stores);
12513 return TF;
12514}
12515
12516SDValue PPCTargetLowering::LowerMUL(SDValue Op, SelectionDAG &DAG) const {
12517 SDLoc dl(Op);
12518 if (Op.getValueType() == MVT::v4i32) {
12519 SDValue LHS = Op.getOperand(i: 0), RHS = Op.getOperand(i: 1);
12520
12521 SDValue Zero = getCanonicalConstSplat(Val: 0, SplatSize: 1, VT: MVT::v4i32, DAG, dl);
12522 // +16 as shift amt.
12523 SDValue Neg16 = getCanonicalConstSplat(Val: -16, SplatSize: 4, VT: MVT::v4i32, DAG, dl);
12524 SDValue RHSSwap = // = vrlw RHS, 16
12525 BuildIntrinsicOp(IID: Intrinsic::ppc_altivec_vrlw, LHS: RHS, RHS: Neg16, DAG, dl);
12526
12527 // Shrinkify inputs to v8i16.
12528 LHS = DAG.getNode(Opcode: ISD::BITCAST, DL: dl, VT: MVT::v8i16, Operand: LHS);
12529 RHS = DAG.getNode(Opcode: ISD::BITCAST, DL: dl, VT: MVT::v8i16, Operand: RHS);
12530 RHSSwap = DAG.getNode(Opcode: ISD::BITCAST, DL: dl, VT: MVT::v8i16, Operand: RHSSwap);
12531
12532 // Low parts multiplied together, generating 32-bit results (we ignore the
12533 // top parts).
12534 SDValue LoProd = BuildIntrinsicOp(IID: Intrinsic::ppc_altivec_vmulouh,
12535 LHS, RHS, DAG, dl, DestVT: MVT::v4i32);
12536
12537 SDValue HiProd = BuildIntrinsicOp(IID: Intrinsic::ppc_altivec_vmsumuhm,
12538 Op0: LHS, Op1: RHSSwap, Op2: Zero, DAG, dl, DestVT: MVT::v4i32);
12539 // Shift the high parts up 16 bits.
12540 HiProd = BuildIntrinsicOp(IID: Intrinsic::ppc_altivec_vslw, LHS: HiProd,
12541 RHS: Neg16, DAG, dl);
12542 return DAG.getNode(Opcode: ISD::ADD, DL: dl, VT: MVT::v4i32, N1: LoProd, N2: HiProd);
12543 } else if (Op.getValueType() == MVT::v16i8) {
12544 SDValue LHS = Op.getOperand(i: 0), RHS = Op.getOperand(i: 1);
12545 bool isLittleEndian = Subtarget.isLittleEndian();
12546
12547 // Multiply the even 8-bit parts, producing 16-bit sums.
12548 SDValue EvenParts = BuildIntrinsicOp(IID: Intrinsic::ppc_altivec_vmuleub,
12549 LHS, RHS, DAG, dl, DestVT: MVT::v8i16);
12550 EvenParts = DAG.getNode(Opcode: ISD::BITCAST, DL: dl, VT: MVT::v16i8, Operand: EvenParts);
12551
12552 // Multiply the odd 8-bit parts, producing 16-bit sums.
12553 SDValue OddParts = BuildIntrinsicOp(IID: Intrinsic::ppc_altivec_vmuloub,
12554 LHS, RHS, DAG, dl, DestVT: MVT::v8i16);
12555 OddParts = DAG.getNode(Opcode: ISD::BITCAST, DL: dl, VT: MVT::v16i8, Operand: OddParts);
12556
12557 // Merge the results together. Because vmuleub and vmuloub are
12558 // instructions with a big-endian bias, we must reverse the
12559 // element numbering and reverse the meaning of "odd" and "even"
12560 // when generating little endian code.
12561 int Ops[16];
12562 for (unsigned i = 0; i != 8; ++i) {
12563 if (isLittleEndian) {
12564 Ops[i*2 ] = 2*i;
12565 Ops[i*2+1] = 2*i+16;
12566 } else {
12567 Ops[i*2 ] = 2*i+1;
12568 Ops[i*2+1] = 2*i+1+16;
12569 }
12570 }
12571 if (isLittleEndian)
12572 return DAG.getVectorShuffle(VT: MVT::v16i8, dl, N1: OddParts, N2: EvenParts, Mask: Ops);
12573 else
12574 return DAG.getVectorShuffle(VT: MVT::v16i8, dl, N1: EvenParts, N2: OddParts, Mask: Ops);
12575 } else {
12576 llvm_unreachable("Unknown mul to lower!");
12577 }
12578}
12579
12580SDValue PPCTargetLowering::LowerFP_ROUND(SDValue Op, SelectionDAG &DAG) const {
12581 bool IsStrict = Op->isStrictFPOpcode();
12582 if (Op.getOperand(i: IsStrict ? 1 : 0).getValueType() == MVT::f128 &&
12583 !Subtarget.hasP9Vector())
12584 return SDValue();
12585
12586 return Op;
12587}
12588
12589// Custom lowering for fpext vf32 to v2f64
12590SDValue PPCTargetLowering::LowerFP_EXTEND(SDValue Op, SelectionDAG &DAG) const {
12591
12592 assert(Op.getOpcode() == ISD::FP_EXTEND &&
12593 "Should only be called for ISD::FP_EXTEND");
12594
12595 // FIXME: handle extends from half precision float vectors on P9.
12596 // We only want to custom lower an extend from v2f32 to v2f64.
12597 if (Op.getValueType() != MVT::v2f64 ||
12598 Op.getOperand(i: 0).getValueType() != MVT::v2f32)
12599 return SDValue();
12600
12601 SDLoc dl(Op);
12602 SDValue Op0 = Op.getOperand(i: 0);
12603
12604 switch (Op0.getOpcode()) {
12605 default:
12606 return SDValue();
12607 case ISD::EXTRACT_SUBVECTOR: {
12608 assert(Op0.getNumOperands() == 2 &&
12609 isa<ConstantSDNode>(Op0->getOperand(1)) &&
12610 "Node should have 2 operands with second one being a constant!");
12611
12612 if (Op0.getOperand(i: 0).getValueType() != MVT::v4f32)
12613 return SDValue();
12614
12615 // Custom lower is only done for high or low doubleword.
12616 int Idx = Op0.getConstantOperandVal(i: 1);
12617 if (Idx % 2 != 0)
12618 return SDValue();
12619
12620 // Since input is v4f32, at this point Idx is either 0 or 2.
12621 // Shift to get the doubleword position we want.
12622 int DWord = Idx >> 1;
12623
12624 // High and low word positions are different on little endian.
12625 if (Subtarget.isLittleEndian())
12626 DWord ^= 0x1;
12627
12628 return DAG.getNode(Opcode: PPCISD::FP_EXTEND_HALF, DL: dl, VT: MVT::v2f64,
12629 N1: Op0.getOperand(i: 0), N2: DAG.getConstant(Val: DWord, DL: dl, VT: MVT::i32));
12630 }
12631 case ISD::FADD:
12632 case ISD::FMUL:
12633 case ISD::FSUB: {
12634 SDValue NewLoad[2];
12635 for (unsigned i = 0, ie = Op0.getNumOperands(); i != ie; ++i) {
12636 // Ensure both input are loads.
12637 SDValue LdOp = Op0.getOperand(i);
12638 if (LdOp.getOpcode() != ISD::LOAD)
12639 return SDValue();
12640 // Generate new load node.
12641 LoadSDNode *LD = cast<LoadSDNode>(Val&: LdOp);
12642 SDValue LoadOps[] = {LD->getChain(), LD->getBasePtr()};
12643 NewLoad[i] = DAG.getMemIntrinsicNode(
12644 Opcode: PPCISD::LD_VSX_LH, dl, VTList: DAG.getVTList(VT1: MVT::v4f32, VT2: MVT::Other), Ops: LoadOps,
12645 MemVT: LD->getMemoryVT(), MMO: LD->getMemOperand());
12646 }
12647 SDValue NewOp =
12648 DAG.getNode(Opcode: Op0.getOpcode(), DL: SDLoc(Op0), VT: MVT::v4f32, N1: NewLoad[0],
12649 N2: NewLoad[1], Flags: Op0.getNode()->getFlags());
12650 return DAG.getNode(Opcode: PPCISD::FP_EXTEND_HALF, DL: dl, VT: MVT::v2f64, N1: NewOp,
12651 N2: DAG.getConstant(Val: 0, DL: dl, VT: MVT::i32));
12652 }
12653 case ISD::LOAD: {
12654 LoadSDNode *LD = cast<LoadSDNode>(Val&: Op0);
12655 SDValue LoadOps[] = {LD->getChain(), LD->getBasePtr()};
12656 SDValue NewLd = DAG.getMemIntrinsicNode(
12657 Opcode: PPCISD::LD_VSX_LH, dl, VTList: DAG.getVTList(VT1: MVT::v4f32, VT2: MVT::Other), Ops: LoadOps,
12658 MemVT: LD->getMemoryVT(), MMO: LD->getMemOperand());
12659 return DAG.getNode(Opcode: PPCISD::FP_EXTEND_HALF, DL: dl, VT: MVT::v2f64, N1: NewLd,
12660 N2: DAG.getConstant(Val: 0, DL: dl, VT: MVT::i32));
12661 }
12662 }
12663 llvm_unreachable("ERROR:Should return for all cases within swtich.");
12664}
12665
12666static SDValue ConvertCarryValueToCarryFlag(EVT SumType, SDValue Value,
12667 SelectionDAG &DAG,
12668 const PPCSubtarget &STI) {
12669 SDLoc DL(Value);
12670 if (STI.useCRBits())
12671 Value = DAG.getNode(Opcode: ISD::SELECT, DL, VT: SumType, N1: Value,
12672 N2: DAG.getConstant(Val: 1, DL, VT: SumType),
12673 N3: DAG.getConstant(Val: 0, DL, VT: SumType));
12674 else
12675 Value = DAG.getZExtOrTrunc(Op: Value, DL, VT: SumType);
12676 SDValue Sum = DAG.getNode(Opcode: PPCISD::ADDC, DL, VTList: DAG.getVTList(VT1: SumType, VT2: MVT::i32),
12677 N1: Value, N2: DAG.getAllOnesConstant(DL, VT: SumType));
12678 return Sum.getValue(R: 1);
12679}
12680
12681static SDValue ConvertCarryFlagToCarryValue(EVT SumType, SDValue Flag,
12682 EVT CarryType, SelectionDAG &DAG,
12683 const PPCSubtarget &STI) {
12684 SDLoc DL(Flag);
12685 SDValue Zero = DAG.getConstant(Val: 0, DL, VT: SumType);
12686 SDValue Carry = DAG.getNode(
12687 Opcode: PPCISD::ADDE, DL, VTList: DAG.getVTList(VT1: SumType, VT2: MVT::i32), N1: Zero, N2: Zero, N3: Flag);
12688 if (STI.useCRBits())
12689 return DAG.getSetCC(DL, VT: CarryType, LHS: Carry, RHS: Zero, Cond: ISD::SETNE);
12690 return DAG.getZExtOrTrunc(Op: Carry, DL, VT: CarryType);
12691}
12692
12693SDValue PPCTargetLowering::LowerADDSUBO(SDValue Op, SelectionDAG &DAG) const {
12694
12695 SDLoc DL(Op);
12696 SDNode *N = Op.getNode();
12697 EVT VT = N->getValueType(ResNo: 0);
12698 EVT CarryType = N->getValueType(ResNo: 1);
12699 unsigned Opc = N->getOpcode();
12700 bool IsAdd = Opc == ISD::UADDO;
12701 Opc = IsAdd ? PPCISD::ADDC : PPCISD::SUBC;
12702 SDValue Sum = DAG.getNode(Opcode: Opc, DL, VTList: DAG.getVTList(VT1: VT, VT2: MVT::i32),
12703 N1: N->getOperand(Num: 0), N2: N->getOperand(Num: 1));
12704 SDValue Carry = ConvertCarryFlagToCarryValue(SumType: VT, Flag: Sum.getValue(R: 1), CarryType,
12705 DAG, STI: Subtarget);
12706 if (!IsAdd)
12707 Carry = DAG.getNode(Opcode: ISD::XOR, DL, VT: CarryType, N1: Carry,
12708 N2: DAG.getConstant(Val: 1UL, DL, VT: CarryType));
12709 return DAG.getNode(Opcode: ISD::MERGE_VALUES, DL, VTList: N->getVTList(), N1: Sum, N2: Carry);
12710}
12711
12712SDValue PPCTargetLowering::LowerADDSUBO_CARRY(SDValue Op,
12713 SelectionDAG &DAG) const {
12714 SDLoc DL(Op);
12715 SDNode *N = Op.getNode();
12716 unsigned Opc = N->getOpcode();
12717 EVT VT = N->getValueType(ResNo: 0);
12718 EVT CarryType = N->getValueType(ResNo: 1);
12719 SDValue CarryOp = N->getOperand(Num: 2);
12720 bool IsAdd = Opc == ISD::UADDO_CARRY;
12721 Opc = IsAdd ? PPCISD::ADDE : PPCISD::SUBE;
12722 if (!IsAdd)
12723 CarryOp = DAG.getNode(Opcode: ISD::XOR, DL, VT: CarryOp.getValueType(), N1: CarryOp,
12724 N2: DAG.getConstant(Val: 1UL, DL, VT: CarryOp.getValueType()));
12725 CarryOp = ConvertCarryValueToCarryFlag(SumType: VT, Value: CarryOp, DAG, STI: Subtarget);
12726 SDValue Sum = DAG.getNode(Opcode: Opc, DL, VTList: DAG.getVTList(VT1: VT, VT2: MVT::i32),
12727 N1: Op.getOperand(i: 0), N2: Op.getOperand(i: 1), N3: CarryOp);
12728 CarryOp = ConvertCarryFlagToCarryValue(SumType: VT, Flag: Sum.getValue(R: 1), CarryType, DAG,
12729 STI: Subtarget);
12730 if (!IsAdd)
12731 CarryOp = DAG.getNode(Opcode: ISD::XOR, DL, VT: CarryOp.getValueType(), N1: CarryOp,
12732 N2: DAG.getConstant(Val: 1UL, DL, VT: CarryOp.getValueType()));
12733 return DAG.getNode(Opcode: ISD::MERGE_VALUES, DL, VTList: N->getVTList(), N1: Sum, N2: CarryOp);
12734}
12735
12736SDValue PPCTargetLowering::LowerSSUBO(SDValue Op, SelectionDAG &DAG) const {
12737
12738 SDLoc dl(Op);
12739 SDValue LHS = Op.getOperand(i: 0);
12740 SDValue RHS = Op.getOperand(i: 1);
12741 EVT VT = Op.getNode()->getValueType(ResNo: 0);
12742
12743 SDValue Sub = DAG.getNode(Opcode: ISD::SUB, DL: dl, VT, N1: LHS, N2: RHS);
12744
12745 SDValue Xor1 = DAG.getNode(Opcode: ISD::XOR, DL: dl, VT, N1: RHS, N2: LHS);
12746 SDValue Xor2 = DAG.getNode(Opcode: ISD::XOR, DL: dl, VT, N1: Sub, N2: LHS);
12747
12748 SDValue And = DAG.getNode(Opcode: ISD::AND, DL: dl, VT, N1: Xor1, N2: Xor2);
12749
12750 SDValue Overflow =
12751 DAG.getNode(Opcode: ISD::SRL, DL: dl, VT, N1: And,
12752 N2: DAG.getConstant(Val: VT.getSizeInBits() - 1, DL: dl, VT: MVT::i32));
12753
12754 SDValue OverflowTrunc =
12755 DAG.getNode(Opcode: ISD::TRUNCATE, DL: dl, VT: Op.getNode()->getValueType(ResNo: 1), Operand: Overflow);
12756
12757 return DAG.getMergeValues(Ops: {Sub, OverflowTrunc}, dl);
12758}
12759
12760/// Implements signed add with overflow detection using the rule:
12761/// (x eqv y) & (sum xor x), where the overflow bit is extracted from the sign
12762SDValue PPCTargetLowering::LowerSADDO(SDValue Op, SelectionDAG &DAG) const {
12763
12764 SDLoc dl(Op);
12765 SDValue LHS = Op.getOperand(i: 0);
12766 SDValue RHS = Op.getOperand(i: 1);
12767 EVT VT = Op.getNode()->getValueType(ResNo: 0);
12768
12769 SDValue Sum = DAG.getNode(Opcode: ISD::ADD, DL: dl, VT, N1: LHS, N2: RHS);
12770
12771 // Compute ~(x xor y)
12772 SDValue XorXY = DAG.getNode(Opcode: ISD::XOR, DL: dl, VT, N1: LHS, N2: RHS);
12773 SDValue EqvXY = DAG.getNOT(DL: dl, Val: XorXY, VT);
12774 // Compute (s xor x)
12775 SDValue SumXorX = DAG.getNode(Opcode: ISD::XOR, DL: dl, VT, N1: Sum, N2: LHS);
12776
12777 // overflow = (x eqv y) & (s xor x)
12778 SDValue OverflowInSign = DAG.getNode(Opcode: ISD::AND, DL: dl, VT, N1: EqvXY, N2: SumXorX);
12779
12780 // Shift sign bit down to LSB
12781 SDValue Overflow =
12782 DAG.getNode(Opcode: ISD::SRL, DL: dl, VT, N1: OverflowInSign,
12783 N2: DAG.getConstant(Val: VT.getSizeInBits() - 1, DL: dl, VT: MVT::i32));
12784 // Truncate to the overflow type (i1)
12785 SDValue OverflowTrunc =
12786 DAG.getNode(Opcode: ISD::TRUNCATE, DL: dl, VT: Op.getNode()->getValueType(ResNo: 1), Operand: Overflow);
12787
12788 return DAG.getMergeValues(Ops: {Sum, OverflowTrunc}, dl);
12789}
12790
12791// Lower unsigned 3-way compare producing -1/0/1.
12792SDValue PPCTargetLowering::LowerUCMP(SDValue Op, SelectionDAG &DAG) const {
12793 SDLoc DL(Op);
12794 SDValue A = DAG.getFreeze(V: Op.getOperand(i: 0));
12795 SDValue B = DAG.getFreeze(V: Op.getOperand(i: 1));
12796 EVT OpVT = A.getValueType();
12797 EVT ResVT = Op.getValueType();
12798
12799 // On PPC64, i32 carries are affected by the upper 32 bits of the registers.
12800 // We must zero-extend to i64 to ensure the carry reflects the 32-bit unsigned
12801 // comparison.
12802 if (Subtarget.isPPC64() && OpVT != MVT::i64) {
12803 A = DAG.getNode(Opcode: ISD::ZERO_EXTEND, DL, VT: MVT::i64, Operand: A);
12804 B = DAG.getNode(Opcode: ISD::ZERO_EXTEND, DL, VT: MVT::i64, Operand: B);
12805 OpVT = MVT::i64;
12806 }
12807
12808 // First compute diff = A - B.
12809 SDValue Diff = DAG.getNode(Opcode: ISD::SUB, DL, VT: OpVT, N1: A, N2: B);
12810
12811 // Generate B - A using SUBC to capture carry.
12812 SDVTList VTs = DAG.getVTList(VT1: OpVT, VT2: MVT::i32);
12813 SDValue SubC = DAG.getNode(Opcode: PPCISD::SUBC, DL, VTList: VTs, N1: B, N2: A);
12814 SDValue CA0 = SubC.getValue(R: 1);
12815
12816 // t2 = A - B + CA0 using SUBE.
12817 SDValue SubE1 = DAG.getNode(Opcode: PPCISD::SUBE, DL, VTList: VTs, N1: A, N2: B, N3: CA0);
12818 SDValue CA1 = SubE1.getValue(R: 1);
12819
12820 // res = diff - t2 + CA1 using SUBE (produces desired -1/0/1).
12821 SDValue ResPair = DAG.getNode(Opcode: PPCISD::SUBE, DL, VTList: VTs, N1: Diff, N2: SubE1, N3: CA1);
12822
12823 // Extract the first result and truncate to result type if needed.
12824 return DAG.getSExtOrTrunc(Op: ResPair.getValue(R: 0), DL, VT: ResVT);
12825}
12826
12827/// LowerOperation - Provide custom lowering hooks for some operations.
12828///
12829SDValue PPCTargetLowering::LowerOperation(SDValue Op, SelectionDAG &DAG) const {
12830 switch (Op.getOpcode()) {
12831 default:
12832 llvm_unreachable("Wasn't expecting to be able to lower this!");
12833 case ISD::FPOW: return lowerPow(Op, DAG);
12834 case ISD::FSIN: return lowerSin(Op, DAG);
12835 case ISD::FCOS: return lowerCos(Op, DAG);
12836 case ISD::FLOG: return lowerLog(Op, DAG);
12837 case ISD::FLOG10: return lowerLog10(Op, DAG);
12838 case ISD::FEXP: return lowerExp(Op, DAG);
12839 case ISD::ConstantPool: return LowerConstantPool(Op, DAG);
12840 case ISD::BlockAddress: return LowerBlockAddress(Op, DAG);
12841 case ISD::GlobalAddress: return LowerGlobalAddress(Op, DAG);
12842 case ISD::GlobalTLSAddress: return LowerGlobalTLSAddress(Op, DAG);
12843 case ISD::JumpTable: return LowerJumpTable(Op, DAG);
12844 case ISD::STRICT_FSETCC:
12845 case ISD::STRICT_FSETCCS:
12846 case ISD::SETCC: return LowerSETCC(Op, DAG);
12847 case ISD::BR_CC: return LowerBR_CC(Op, DAG);
12848 case ISD::INIT_TRAMPOLINE: return LowerINIT_TRAMPOLINE(Op, DAG);
12849 case ISD::ADJUST_TRAMPOLINE: return LowerADJUST_TRAMPOLINE(Op, DAG);
12850 case ISD::SSUBO:
12851 return LowerSSUBO(Op, DAG);
12852 case ISD::SADDO:
12853 return LowerSADDO(Op, DAG);
12854
12855 case ISD::INLINEASM:
12856 case ISD::INLINEASM_BR: return LowerINLINEASM(Op, DAG);
12857 // Variable argument lowering.
12858 case ISD::VASTART: return LowerVASTART(Op, DAG);
12859 case ISD::VAARG: return LowerVAARG(Op, DAG);
12860 case ISD::VACOPY: return LowerVACOPY(Op, DAG);
12861
12862 case ISD::STACKRESTORE: return LowerSTACKRESTORE(Op, DAG);
12863 case ISD::DYNAMIC_STACKALLOC: return LowerDYNAMIC_STACKALLOC(Op, DAG);
12864 case ISD::GET_DYNAMIC_AREA_OFFSET:
12865 return LowerGET_DYNAMIC_AREA_OFFSET(Op, DAG);
12866
12867 // Exception handling lowering.
12868 case ISD::EH_DWARF_CFA: return LowerEH_DWARF_CFA(Op, DAG);
12869 case ISD::EH_SJLJ_SETJMP: return lowerEH_SJLJ_SETJMP(Op, DAG);
12870 case ISD::EH_SJLJ_LONGJMP: return lowerEH_SJLJ_LONGJMP(Op, DAG);
12871
12872 case ISD::LOAD: return LowerLOAD(Op, DAG);
12873 case ISD::STORE: return LowerSTORE(Op, DAG);
12874 case ISD::TRUNCATE: return LowerTRUNCATE(Op, DAG);
12875 case ISD::SELECT_CC: return LowerSELECT_CC(Op, DAG);
12876 case ISD::STRICT_FP_TO_UINT:
12877 case ISD::STRICT_FP_TO_SINT:
12878 case ISD::FP_TO_UINT:
12879 case ISD::FP_TO_SINT: return LowerFP_TO_INT(Op, DAG, dl: SDLoc(Op));
12880 case ISD::STRICT_UINT_TO_FP:
12881 case ISD::STRICT_SINT_TO_FP:
12882 case ISD::UINT_TO_FP:
12883 case ISD::SINT_TO_FP: return LowerINT_TO_FP(Op, DAG);
12884 case ISD::GET_ROUNDING: return LowerGET_ROUNDING(Op, DAG);
12885 case ISD::SET_ROUNDING:
12886 return LowerSET_ROUNDING(Op, DAG);
12887
12888 // Lower 64-bit shifts.
12889 case ISD::SHL_PARTS: return LowerSHL_PARTS(Op, DAG);
12890 case ISD::SRL_PARTS: return LowerSRL_PARTS(Op, DAG);
12891 case ISD::SRA_PARTS: return LowerSRA_PARTS(Op, DAG);
12892
12893 case ISD::FSHL: return LowerFunnelShift(Op, DAG);
12894 case ISD::FSHR: return LowerFunnelShift(Op, DAG);
12895
12896 // Vector-related lowering.
12897 case ISD::BUILD_VECTOR: return LowerBUILD_VECTOR(Op, DAG);
12898 case ISD::VECTOR_SHUFFLE: return LowerVECTOR_SHUFFLE(Op, DAG);
12899 case ISD::INTRINSIC_WO_CHAIN: return LowerINTRINSIC_WO_CHAIN(Op, DAG);
12900 case ISD::SCALAR_TO_VECTOR: return LowerSCALAR_TO_VECTOR(Op, DAG);
12901 case ISD::INSERT_VECTOR_ELT: return LowerINSERT_VECTOR_ELT(Op, DAG);
12902 case ISD::MUL: return LowerMUL(Op, DAG);
12903 case ISD::FP_EXTEND: return LowerFP_EXTEND(Op, DAG);
12904 case ISD::STRICT_FP_ROUND:
12905 case ISD::FP_ROUND:
12906 return LowerFP_ROUND(Op, DAG);
12907 case ISD::ROTL: return LowerROTL(Op, DAG);
12908
12909 // For counter-based loop handling.
12910 case ISD::INTRINSIC_W_CHAIN:
12911 return SDValue();
12912
12913 case ISD::BITCAST: return LowerBITCAST(Op, DAG);
12914
12915 // Frame & Return address.
12916 case ISD::RETURNADDR: return LowerRETURNADDR(Op, DAG);
12917 case ISD::FRAMEADDR: return LowerFRAMEADDR(Op, DAG);
12918
12919 case ISD::INTRINSIC_VOID:
12920 return LowerINTRINSIC_VOID(Op, DAG);
12921 case ISD::BSWAP:
12922 return LowerBSWAP(Op, DAG);
12923 case ISD::ATOMIC_CMP_SWAP:
12924 return LowerATOMIC_CMP_SWAP(Op, DAG);
12925 case ISD::ATOMIC_STORE:
12926 return LowerATOMIC_LOAD_STORE(Op, DAG);
12927 case ISD::IS_FPCLASS:
12928 return LowerIS_FPCLASS(Op, DAG);
12929 case ISD::UADDO:
12930 case ISD::USUBO:
12931 return LowerADDSUBO(Op, DAG);
12932 case ISD::UADDO_CARRY:
12933 case ISD::USUBO_CARRY:
12934 return LowerADDSUBO_CARRY(Op, DAG);
12935 case ISD::UCMP:
12936 return LowerUCMP(Op, DAG);
12937 case ISD::STRICT_LRINT:
12938 case ISD::STRICT_LLRINT:
12939 case ISD::STRICT_LROUND:
12940 case ISD::STRICT_LLROUND:
12941 case ISD::STRICT_FNEARBYINT:
12942 if (Op->getFlags().hasNoFPExcept())
12943 return Op;
12944 return SDValue();
12945 case ISD::VP_LOAD:
12946 return LowerVP_LOAD(Op, DAG);
12947 case ISD::VP_STORE:
12948 return LowerVP_STORE(Op, DAG);
12949 case ISD::PARTIAL_REDUCE_UMLA:
12950 return LowerPartialReduce(Op, DAG);
12951 }
12952}
12953
12954void PPCTargetLowering::ReplaceNodeResults(SDNode *N,
12955 SmallVectorImpl<SDValue>&Results,
12956 SelectionDAG &DAG) const {
12957 SDLoc dl(N);
12958 switch (N->getOpcode()) {
12959 default:
12960 llvm_unreachable("Do not know how to custom type legalize this operation!");
12961 case ISD::ATOMIC_LOAD: {
12962 SDValue Res = LowerATOMIC_LOAD_STORE(Op: SDValue(N, 0), DAG);
12963 Results.push_back(Elt: Res);
12964 Results.push_back(Elt: Res.getValue(R: 1));
12965 break;
12966 }
12967 case ISD::READCYCLECOUNTER: {
12968 SDVTList VTs = DAG.getVTList(VT1: MVT::i32, VT2: MVT::i32, VT3: MVT::Other);
12969 SDValue RTB = DAG.getNode(Opcode: PPCISD::READ_TIME_BASE, DL: dl, VTList: VTs, N: N->getOperand(Num: 0));
12970
12971 Results.push_back(
12972 Elt: DAG.getNode(Opcode: ISD::BUILD_PAIR, DL: dl, VT: MVT::i64, N1: RTB, N2: RTB.getValue(R: 1)));
12973 Results.push_back(Elt: RTB.getValue(R: 2));
12974 break;
12975 }
12976 case ISD::INTRINSIC_W_CHAIN: {
12977 if (N->getConstantOperandVal(Num: 1) != Intrinsic::loop_decrement)
12978 break;
12979
12980 assert(N->getValueType(0) == MVT::i1 &&
12981 "Unexpected result type for CTR decrement intrinsic");
12982 EVT SVT = getSetCCResultType(DL: DAG.getDataLayout(), C&: *DAG.getContext(),
12983 VT: N->getValueType(ResNo: 0));
12984 SDVTList VTs = DAG.getVTList(VT1: SVT, VT2: MVT::Other);
12985 SDValue NewInt = DAG.getNode(Opcode: N->getOpcode(), DL: dl, VTList: VTs, N1: N->getOperand(Num: 0),
12986 N2: N->getOperand(Num: 1));
12987
12988 Results.push_back(Elt: DAG.getNode(Opcode: ISD::TRUNCATE, DL: dl, VT: MVT::i1, Operand: NewInt));
12989 Results.push_back(Elt: NewInt.getValue(R: 1));
12990 break;
12991 }
12992 case ISD::INTRINSIC_WO_CHAIN: {
12993 switch (N->getConstantOperandVal(Num: 0)) {
12994 case Intrinsic::ppc_pack_longdouble:
12995 Results.push_back(Elt: DAG.getNode(Opcode: ISD::BUILD_PAIR, DL: dl, VT: MVT::ppcf128,
12996 N1: N->getOperand(Num: 2), N2: N->getOperand(Num: 1)));
12997 break;
12998 case Intrinsic::ppc_maxfe:
12999 case Intrinsic::ppc_minfe:
13000 case Intrinsic::ppc_fnmsub:
13001 case Intrinsic::ppc_convert_f128_to_ppcf128:
13002 Results.push_back(Elt: LowerINTRINSIC_WO_CHAIN(Op: SDValue(N, 0), DAG));
13003 break;
13004 }
13005 break;
13006 }
13007 case ISD::VAARG: {
13008 if (!Subtarget.isSVR4ABI() || Subtarget.isPPC64())
13009 return;
13010
13011 EVT VT = N->getValueType(ResNo: 0);
13012
13013 if (VT == MVT::i64) {
13014 SDValue NewNode = LowerVAARG(Op: SDValue(N, 1), DAG);
13015
13016 Results.push_back(Elt: NewNode);
13017 Results.push_back(Elt: NewNode.getValue(R: 1));
13018 }
13019 return;
13020 }
13021 case ISD::STRICT_FP_TO_SINT:
13022 case ISD::STRICT_FP_TO_UINT:
13023 case ISD::FP_TO_SINT:
13024 case ISD::FP_TO_UINT: {
13025 // LowerFP_TO_INT() can only handle f32 and f64.
13026 if (N->getOperand(Num: N->isStrictFPOpcode() ? 1 : 0).getValueType() ==
13027 MVT::ppcf128)
13028 return;
13029 SDValue LoweredValue = LowerFP_TO_INT(Op: SDValue(N, 0), DAG, dl);
13030 Results.push_back(Elt: LoweredValue);
13031 if (N->isStrictFPOpcode())
13032 Results.push_back(Elt: LoweredValue.getValue(R: 1));
13033 return;
13034 }
13035 case ISD::TRUNCATE: {
13036 if (!N->getValueType(ResNo: 0).isVector())
13037 return;
13038 SDValue Lowered = LowerTRUNCATEVector(Op: SDValue(N, 0), DAG);
13039 if (Lowered)
13040 Results.push_back(Elt: Lowered);
13041 return;
13042 }
13043 case ISD::SCALAR_TO_VECTOR: {
13044 SDValue Lowered = LowerSCALAR_TO_VECTOR(Op: SDValue(N, 0), DAG);
13045 if (Lowered)
13046 Results.push_back(Elt: Lowered);
13047 return;
13048 }
13049 case ISD::FSHL:
13050 case ISD::FSHR:
13051 // Don't handle funnel shifts here.
13052 return;
13053 case ISD::BITCAST:
13054 // Don't handle bitcast here.
13055 return;
13056 case ISD::FP_EXTEND:
13057 SDValue Lowered = LowerFP_EXTEND(Op: SDValue(N, 0), DAG);
13058 if (Lowered)
13059 Results.push_back(Elt: Lowered);
13060 return;
13061 }
13062}
13063
13064//===----------------------------------------------------------------------===//
13065// Other Lowering Code
13066//===----------------------------------------------------------------------===//
13067
13068static CallInst *callIntrinsic(IRBuilderBase &Builder, Intrinsic::ID Id) {
13069 return Builder.CreateIntrinsicWithoutFolding(ID: Id, Args: {});
13070}
13071
13072Value *PPCTargetLowering::emitLoadLinked(IRBuilderBase &Builder, Type *ValueTy,
13073 Value *Addr,
13074 AtomicOrdering Ord) const {
13075 unsigned SZ = ValueTy->getPrimitiveSizeInBits();
13076
13077 assert((SZ == 8 || SZ == 16 || SZ == 32 || SZ == 64) &&
13078 "Only 8/16/32/64-bit atomic loads supported");
13079 Intrinsic::ID IntID;
13080 switch (SZ) {
13081 default:
13082 llvm_unreachable("Unexpected PrimitiveSize");
13083 case 8:
13084 IntID = Intrinsic::ppc_lbarx;
13085 assert(Subtarget.hasPartwordAtomics() && "No support partword atomics.");
13086 break;
13087 case 16:
13088 IntID = Intrinsic::ppc_lharx;
13089 assert(Subtarget.hasPartwordAtomics() && "No support partword atomics.");
13090 break;
13091 case 32:
13092 IntID = Intrinsic::ppc_lwarx;
13093 break;
13094 case 64:
13095 IntID = Intrinsic::ppc_ldarx;
13096 break;
13097 }
13098 Value *Call =
13099 Builder.CreateIntrinsic(ID: IntID, Args: Addr, /*FMFSource=*/nullptr, Name: "larx");
13100
13101 return Builder.CreateTruncOrBitCast(V: Call, DestTy: ValueTy);
13102}
13103
13104// Perform a store-conditional operation to Addr. Return the status of the
13105// store. This should be 0 if the store succeeded, non-zero otherwise.
13106Value *PPCTargetLowering::emitStoreConditional(IRBuilderBase &Builder,
13107 Value *Val, Value *Addr,
13108 AtomicOrdering Ord) const {
13109 Type *Ty = Val->getType();
13110 unsigned SZ = Ty->getPrimitiveSizeInBits();
13111
13112 assert((SZ == 8 || SZ == 16 || SZ == 32 || SZ == 64) &&
13113 "Only 8/16/32/64-bit atomic loads supported");
13114 Intrinsic::ID IntID;
13115 switch (SZ) {
13116 default:
13117 llvm_unreachable("Unexpected PrimitiveSize");
13118 case 8:
13119 IntID = Intrinsic::ppc_stbcx;
13120 assert(Subtarget.hasPartwordAtomics() && "No support partword atomics.");
13121 break;
13122 case 16:
13123 IntID = Intrinsic::ppc_sthcx;
13124 assert(Subtarget.hasPartwordAtomics() && "No support partword atomics.");
13125 break;
13126 case 32:
13127 IntID = Intrinsic::ppc_stwcx;
13128 break;
13129 case 64:
13130 IntID = Intrinsic::ppc_stdcx;
13131 break;
13132 }
13133
13134 if (SZ == 8 || SZ == 16)
13135 Val = Builder.CreateZExt(V: Val, DestTy: Builder.getInt32Ty());
13136
13137 Value *Call = Builder.CreateIntrinsic(ID: IntID, Args: {Addr, Val},
13138 /*FMFSource=*/nullptr, Name: "stcx");
13139 return Builder.CreateXor(LHS: Call, RHS: Builder.getInt32(C: 1));
13140}
13141
13142// The mappings for emitLeading/TrailingFence is taken from
13143// http://www.cl.cam.ac.uk/~pes20/cpp/cpp0xmappings.html
13144Instruction *PPCTargetLowering::emitLeadingFence(IRBuilderBase &Builder,
13145 Instruction *Inst,
13146 AtomicOrdering Ord) const {
13147 if (Ord == AtomicOrdering::SequentiallyConsistent)
13148 return callIntrinsic(Builder, Id: Intrinsic::ppc_sync);
13149 if (isReleaseOrStronger(AO: Ord))
13150 return callIntrinsic(Builder, Id: Intrinsic::ppc_lwsync);
13151 return nullptr;
13152}
13153
13154Instruction *PPCTargetLowering::emitTrailingFence(IRBuilderBase &Builder,
13155 Instruction *Inst,
13156 AtomicOrdering Ord) const {
13157 if (Inst->hasAtomicLoad() && isAcquireOrStronger(AO: Ord)) {
13158 // See http://www.cl.cam.ac.uk/~pes20/cpp/cpp0xmappings.html and
13159 // http://www.rdrop.com/users/paulmck/scalability/paper/N2745r.2011.03.04a.html
13160 // and http://www.cl.cam.ac.uk/~pes20/cppppc/ for justification.
13161 if (isa<LoadInst>(Val: Inst))
13162 return Builder.CreateIntrinsicWithoutFolding(ID: Intrinsic::ppc_cfence,
13163 OverloadTypes: {Inst->getType()}, Args: {Inst});
13164 // FIXME: Can use isync for rmw operation.
13165 return callIntrinsic(Builder, Id: Intrinsic::ppc_lwsync);
13166 }
13167 return nullptr;
13168}
13169
13170MachineBasicBlock *PPCTargetLowering::EmitAtomicBinary(MachineInstr &MI,
13171 MachineBasicBlock *BB,
13172 unsigned BinOpcode,
13173 unsigned CmpOpcode,
13174 unsigned CmpPred) const {
13175 // BinOpcode != 0: Handles atomic load with binary operator, e.g. NAND.
13176 // CmpOpcode != 0: Handles atomic load with MIN/MAX etc.
13177 // BinOpcode == 0 && CmpOpcode == 0: Handles ATOMIC_SWAP.
13178 const PPCInstrInfo *TII = Subtarget.getInstrInfo();
13179 unsigned AtomicSize = MI.getOperand(i: 3).getImm();
13180
13181 auto LoadMnemonic = PPC::LDARX;
13182 auto StoreMnemonic = PPC::STDCX;
13183 switch (AtomicSize) {
13184 default:
13185 llvm_unreachable("Unexpected size of atomic entity");
13186 case 1:
13187 LoadMnemonic = PPC::LBARX;
13188 StoreMnemonic = PPC::STBCX;
13189 assert(Subtarget.hasPartwordAtomics() && "Call this only with size >=4");
13190 break;
13191 case 2:
13192 LoadMnemonic = PPC::LHARX;
13193 StoreMnemonic = PPC::STHCX;
13194 assert(Subtarget.hasPartwordAtomics() && "Call this only with size >=4");
13195 break;
13196 case 4:
13197 LoadMnemonic = PPC::LWARX;
13198 StoreMnemonic = PPC::STWCX;
13199 break;
13200 case 8:
13201 LoadMnemonic = PPC::LDARX;
13202 StoreMnemonic = PPC::STDCX;
13203 break;
13204 }
13205
13206 const BasicBlock *LLVM_BB = BB->getBasicBlock();
13207 MachineFunction *F = BB->getParent();
13208 MachineFunction::iterator It = ++BB->getIterator();
13209
13210 if (CmpOpcode == PPC::CMPW && (AtomicSize == 1 || AtomicSize == 2))
13211 signExtendOperandIfUnknown(MI, BB, OpIdx: 4, /*IsByte=*/AtomicSize == 1, TII);
13212
13213 Register dest = MI.getOperand(i: 0).getReg();
13214 Register ptrA = MI.getOperand(i: 1).getReg();
13215 Register ptrB = MI.getOperand(i: 2).getReg();
13216 Register incr = MI.getOperand(i: 4).getReg();
13217 DebugLoc dl = MI.getDebugLoc();
13218
13219 MachineBasicBlock *loopMBB = F->CreateMachineBasicBlock(BB: LLVM_BB);
13220 MachineBasicBlock *loop2MBB =
13221 CmpOpcode ? F->CreateMachineBasicBlock(BB: LLVM_BB) : nullptr;
13222 MachineBasicBlock *exitMBB = F->CreateMachineBasicBlock(BB: LLVM_BB);
13223 F->insert(MBBI: It, MBB: loopMBB);
13224 if (CmpOpcode)
13225 F->insert(MBBI: It, MBB: loop2MBB);
13226 F->insert(MBBI: It, MBB: exitMBB);
13227 exitMBB->splice(Where: exitMBB->begin(), Other: BB,
13228 From: std::next(x: MachineBasicBlock::iterator(MI)), To: BB->end());
13229 exitMBB->transferSuccessorsAndUpdatePHIs(FromMBB: BB);
13230
13231 MachineRegisterInfo &RegInfo = F->getRegInfo();
13232 Register TmpReg = (!BinOpcode) ? incr :
13233 RegInfo.createVirtualRegister( RegClass: AtomicSize == 8 ? &PPC::G8RCRegClass
13234 : &PPC::GPRCRegClass);
13235
13236 // thisMBB:
13237 // ...
13238 // fallthrough --> loopMBB
13239 BB->addSuccessor(Succ: loopMBB);
13240
13241 // loopMBB:
13242 // l[wd]arx dest, ptr
13243 // add r0, dest, incr
13244 // st[wd]cx. r0, ptr
13245 // bne- loopMBB
13246 // fallthrough --> exitMBB
13247
13248 // For max/min...
13249 // loopMBB:
13250 // l[wd]arx dest, ptr
13251 // cmpl?[wd] dest, incr
13252 // bgt exitMBB
13253 // loop2MBB:
13254 // st[wd]cx. dest, ptr
13255 // bne- loopMBB
13256 // fallthrough --> exitMBB
13257
13258 BB = loopMBB;
13259 BuildMI(BB, MIMD: dl, MCID: TII->get(Opcode: LoadMnemonic), DestReg: dest)
13260 .addReg(RegNo: ptrA).addReg(RegNo: ptrB);
13261 if (BinOpcode)
13262 BuildMI(BB, MIMD: dl, MCID: TII->get(Opcode: BinOpcode), DestReg: TmpReg).addReg(RegNo: incr).addReg(RegNo: dest);
13263 if (CmpOpcode) {
13264 Register CrReg = RegInfo.createVirtualRegister(RegClass: &PPC::CRRCRegClass);
13265 // Signed comparisons of byte or halfword values must be sign-extended.
13266 if (CmpOpcode == PPC::CMPW && AtomicSize < 4) {
13267 Register ExtReg = RegInfo.createVirtualRegister(RegClass: &PPC::GPRCRegClass);
13268 BuildMI(BB, MIMD: dl, MCID: TII->get(Opcode: AtomicSize == 1 ? PPC::EXTSB : PPC::EXTSH),
13269 DestReg: ExtReg).addReg(RegNo: dest);
13270 BuildMI(BB, MIMD: dl, MCID: TII->get(Opcode: CmpOpcode), DestReg: CrReg).addReg(RegNo: ExtReg).addReg(RegNo: incr);
13271 } else
13272 BuildMI(BB, MIMD: dl, MCID: TII->get(Opcode: CmpOpcode), DestReg: CrReg).addReg(RegNo: dest).addReg(RegNo: incr);
13273
13274 BuildMI(BB, MIMD: dl, MCID: TII->get(Opcode: PPC::BCC))
13275 .addImm(Val: CmpPred)
13276 .addReg(RegNo: CrReg)
13277 .addMBB(MBB: exitMBB);
13278 BB->addSuccessor(Succ: loop2MBB);
13279 BB->addSuccessor(Succ: exitMBB);
13280 BB = loop2MBB;
13281 }
13282 BuildMI(BB, MIMD: dl, MCID: TII->get(Opcode: StoreMnemonic))
13283 .addReg(RegNo: TmpReg).addReg(RegNo: ptrA).addReg(RegNo: ptrB);
13284 BuildMI(BB, MIMD: dl, MCID: TII->get(Opcode: PPC::BCC))
13285 .addImm(Val: PPC::PRED_NE_MINUS)
13286 .addReg(RegNo: PPC::CR0)
13287 .addMBB(MBB: loopMBB);
13288 BB->addSuccessor(Succ: loopMBB);
13289 BB->addSuccessor(Succ: exitMBB);
13290
13291 // exitMBB:
13292 // ...
13293 BB = exitMBB;
13294 return BB;
13295}
13296
13297static bool isSignExtended(MachineInstr &MI, const PPCInstrInfo *TII) {
13298 switch(MI.getOpcode()) {
13299 default:
13300 return false;
13301 case PPC::COPY:
13302 return TII->isSignExtended(Reg: MI.getOperand(i: 1).getReg(),
13303 MRI: &MI.getMF()->getRegInfo());
13304 case PPC::LHA:
13305 case PPC::LHA8:
13306 case PPC::LHAU:
13307 case PPC::LHAU8:
13308 case PPC::LHAUX:
13309 case PPC::LHAUX8:
13310 case PPC::LHAX:
13311 case PPC::LHAX8:
13312 case PPC::LWA:
13313 case PPC::LWAUX:
13314 case PPC::LWAX:
13315 case PPC::LWAX_32:
13316 case PPC::LWA_32:
13317 case PPC::PLHA:
13318 case PPC::PLHA8:
13319 case PPC::PLHA8pc:
13320 case PPC::PLHApc:
13321 case PPC::PLWA:
13322 case PPC::PLWA8:
13323 case PPC::PLWA8pc:
13324 case PPC::PLWApc:
13325 case PPC::EXTSB:
13326 case PPC::EXTSB8:
13327 case PPC::EXTSB8_32_64:
13328 case PPC::EXTSB8_rec:
13329 case PPC::EXTSB_rec:
13330 case PPC::EXTSH:
13331 case PPC::EXTSH8:
13332 case PPC::EXTSH8_32_64:
13333 case PPC::EXTSH8_rec:
13334 case PPC::EXTSH_rec:
13335 case PPC::EXTSW:
13336 case PPC::EXTSWSLI:
13337 case PPC::EXTSWSLI_32_64:
13338 case PPC::EXTSWSLI_32_64_rec:
13339 case PPC::EXTSWSLI_rec:
13340 case PPC::EXTSW_32:
13341 case PPC::EXTSW_32_64:
13342 case PPC::EXTSW_32_64_rec:
13343 case PPC::EXTSW_rec:
13344 case PPC::SRAW:
13345 case PPC::SRAWI:
13346 case PPC::SRAWI_rec:
13347 case PPC::SRAW_rec:
13348 return true;
13349 }
13350 return false;
13351}
13352
13353// Sign extend operand OpIdx if the value is not known to be sign extended.
13354// Assumes the operand is a register. The flag IsByte controls which intruction
13355// is used for the sign extension.
13356static void signExtendOperandIfUnknown(MachineInstr &MI, MachineBasicBlock *BB,
13357 unsigned OpIdx, bool IsByte,
13358 const PPCInstrInfo *TII) {
13359 MachineFunction *F = MI.getMF();
13360 MachineRegisterInfo &RegInfo = F->getRegInfo();
13361 Register Reg = MI.getOperand(i: OpIdx).getReg();
13362 bool IsSignExtended =
13363 Reg.isVirtual() && isSignExtended(MI&: *RegInfo.getVRegDef(Reg), TII);
13364
13365 if (!IsSignExtended) {
13366 Register ValueReg = RegInfo.createVirtualRegister(RegClass: &PPC::GPRCRegClass);
13367 BuildMI(BB&: *BB, I&: MI, MIMD: MI.getDebugLoc(),
13368 MCID: TII->get(Opcode: IsByte ? PPC::EXTSB : PPC::EXTSH), DestReg: ValueReg)
13369 .addReg(RegNo: Reg);
13370 MI.getOperand(i: OpIdx).setReg(ValueReg);
13371 }
13372}
13373
13374MachineBasicBlock *PPCTargetLowering::EmitPartwordAtomicBinary(
13375 MachineInstr &MI, MachineBasicBlock *BB, unsigned BinOpcode,
13376 unsigned CmpOpcode, unsigned CmpPred) const {
13377 // BinOpcode != 0: Handles atomic load with binary operator, e.g. NAND.
13378 // CmpOpcode != 0: Handles atomic load with MIN/MAX etc.
13379 // BinOpcode == 0 && CmpOpcode == 0: Handles ATOMIC_SWAP.
13380 assert(!Subtarget.hasPartwordAtomics() &&
13381 "Assumes that part-word atomics are not available");
13382 const PPCInstrInfo *TII = Subtarget.getInstrInfo();
13383
13384 // If this is a signed comparison and the value being compared is not known
13385 // to be sign extended, sign extend it here.
13386 DebugLoc dl = MI.getDebugLoc();
13387 MachineFunction *F = BB->getParent();
13388 MachineRegisterInfo &RegInfo = F->getRegInfo();
13389 const bool is8bit = MI.getOperand(i: 3).getImm() == 1;
13390 if (CmpOpcode == PPC::CMPW)
13391 signExtendOperandIfUnknown(MI, BB, OpIdx: 4, IsByte: is8bit, TII);
13392 Register incr = MI.getOperand(i: 4).getReg();
13393
13394 // In 64 bit mode we have to use 64 bits for addresses, even though the
13395 // lwarx/stwcx are 32 bits. With the 32-bit atomics we can use address
13396 // registers without caring whether they're 32 or 64, but here we're
13397 // doing actual arithmetic on the addresses.
13398 bool is64bit = Subtarget.isPPC64();
13399 bool isLittleEndian = Subtarget.isLittleEndian();
13400 unsigned ZeroReg = is64bit ? PPC::ZERO8 : PPC::ZERO;
13401
13402 const BasicBlock *LLVM_BB = BB->getBasicBlock();
13403 MachineFunction::iterator It = ++BB->getIterator();
13404
13405 Register dest = MI.getOperand(i: 0).getReg();
13406 Register ptrA = MI.getOperand(i: 1).getReg();
13407 Register ptrB = MI.getOperand(i: 2).getReg();
13408
13409 MachineBasicBlock *loopMBB = F->CreateMachineBasicBlock(BB: LLVM_BB);
13410 MachineBasicBlock *loop2MBB =
13411 CmpOpcode ? F->CreateMachineBasicBlock(BB: LLVM_BB) : nullptr;
13412 MachineBasicBlock *exitMBB = F->CreateMachineBasicBlock(BB: LLVM_BB);
13413 F->insert(MBBI: It, MBB: loopMBB);
13414 if (CmpOpcode)
13415 F->insert(MBBI: It, MBB: loop2MBB);
13416 F->insert(MBBI: It, MBB: exitMBB);
13417 exitMBB->splice(Where: exitMBB->begin(), Other: BB,
13418 From: std::next(x: MachineBasicBlock::iterator(MI)), To: BB->end());
13419 exitMBB->transferSuccessorsAndUpdatePHIs(FromMBB: BB);
13420
13421 const TargetRegisterClass *RC =
13422 is64bit ? &PPC::G8RCRegClass : &PPC::GPRCRegClass;
13423 const TargetRegisterClass *GPRC = &PPC::GPRCRegClass;
13424
13425 Register PtrReg = RegInfo.createVirtualRegister(RegClass: RC);
13426 Register Shift1Reg = RegInfo.createVirtualRegister(RegClass: GPRC);
13427 Register ShiftReg =
13428 isLittleEndian ? Shift1Reg : RegInfo.createVirtualRegister(RegClass: GPRC);
13429 Register Incr2Reg = RegInfo.createVirtualRegister(RegClass: GPRC);
13430 Register MaskReg = RegInfo.createVirtualRegister(RegClass: GPRC);
13431 Register Mask2Reg = RegInfo.createVirtualRegister(RegClass: GPRC);
13432 Register Mask3Reg = RegInfo.createVirtualRegister(RegClass: GPRC);
13433 Register Tmp2Reg = RegInfo.createVirtualRegister(RegClass: GPRC);
13434 Register Tmp3Reg = RegInfo.createVirtualRegister(RegClass: GPRC);
13435 Register Tmp4Reg = RegInfo.createVirtualRegister(RegClass: GPRC);
13436 Register TmpDestReg = RegInfo.createVirtualRegister(RegClass: GPRC);
13437 Register SrwDestReg = RegInfo.createVirtualRegister(RegClass: GPRC);
13438 Register Ptr1Reg;
13439 Register TmpReg =
13440 (!BinOpcode) ? Incr2Reg : RegInfo.createVirtualRegister(RegClass: GPRC);
13441
13442 // thisMBB:
13443 // ...
13444 // fallthrough --> loopMBB
13445 BB->addSuccessor(Succ: loopMBB);
13446
13447 // The 4-byte load must be aligned, while a char or short may be
13448 // anywhere in the word. Hence all this nasty bookkeeping code.
13449 // add ptr1, ptrA, ptrB [copy if ptrA==0]
13450 // rlwinm shift1, ptr1, 3, 27, 28 [3, 27, 27]
13451 // xori shift, shift1, 24 [16]
13452 // rlwinm ptr, ptr1, 0, 0, 29
13453 // slw incr2, incr, shift
13454 // li mask2, 255 [li mask3, 0; ori mask2, mask3, 65535]
13455 // slw mask, mask2, shift
13456 // loopMBB:
13457 // lwarx tmpDest, ptr
13458 // add tmp, tmpDest, incr2
13459 // andc tmp2, tmpDest, mask
13460 // and tmp3, tmp, mask
13461 // or tmp4, tmp3, tmp2
13462 // stwcx. tmp4, ptr
13463 // bne- loopMBB
13464 // fallthrough --> exitMBB
13465 // srw SrwDest, tmpDest, shift
13466 // rlwinm SrwDest, SrwDest, 0, 24 [16], 31
13467 if (ptrA != ZeroReg) {
13468 Ptr1Reg = RegInfo.createVirtualRegister(RegClass: RC);
13469 BuildMI(BB, MIMD: dl, MCID: TII->get(Opcode: is64bit ? PPC::ADD8 : PPC::ADD4), DestReg: Ptr1Reg)
13470 .addReg(RegNo: ptrA)
13471 .addReg(RegNo: ptrB);
13472 } else {
13473 Ptr1Reg = ptrB;
13474 }
13475 // We need use 32-bit subregister to avoid mismatch register class in 64-bit
13476 // mode.
13477 BuildMI(BB, MIMD: dl, MCID: TII->get(Opcode: PPC::RLWINM), DestReg: Shift1Reg)
13478 .addReg(RegNo: Ptr1Reg, Flags: {}, SubReg: is64bit ? PPC::sub_32 : 0)
13479 .addImm(Val: 3)
13480 .addImm(Val: 27)
13481 .addImm(Val: is8bit ? 28 : 27);
13482 if (!isLittleEndian)
13483 BuildMI(BB, MIMD: dl, MCID: TII->get(Opcode: PPC::XORI), DestReg: ShiftReg)
13484 .addReg(RegNo: Shift1Reg)
13485 .addImm(Val: is8bit ? 24 : 16);
13486 if (is64bit)
13487 BuildMI(BB, MIMD: dl, MCID: TII->get(Opcode: PPC::RLDICR), DestReg: PtrReg)
13488 .addReg(RegNo: Ptr1Reg)
13489 .addImm(Val: 0)
13490 .addImm(Val: 61);
13491 else
13492 BuildMI(BB, MIMD: dl, MCID: TII->get(Opcode: PPC::RLWINM), DestReg: PtrReg)
13493 .addReg(RegNo: Ptr1Reg)
13494 .addImm(Val: 0)
13495 .addImm(Val: 0)
13496 .addImm(Val: 29);
13497 BuildMI(BB, MIMD: dl, MCID: TII->get(Opcode: PPC::SLW), DestReg: Incr2Reg).addReg(RegNo: incr).addReg(RegNo: ShiftReg);
13498 if (is8bit)
13499 BuildMI(BB, MIMD: dl, MCID: TII->get(Opcode: PPC::LI), DestReg: Mask2Reg).addImm(Val: 255);
13500 else {
13501 BuildMI(BB, MIMD: dl, MCID: TII->get(Opcode: PPC::LI), DestReg: Mask3Reg).addImm(Val: 0);
13502 BuildMI(BB, MIMD: dl, MCID: TII->get(Opcode: PPC::ORI), DestReg: Mask2Reg)
13503 .addReg(RegNo: Mask3Reg)
13504 .addImm(Val: 65535);
13505 }
13506 BuildMI(BB, MIMD: dl, MCID: TII->get(Opcode: PPC::SLW), DestReg: MaskReg)
13507 .addReg(RegNo: Mask2Reg)
13508 .addReg(RegNo: ShiftReg);
13509
13510 BB = loopMBB;
13511 BuildMI(BB, MIMD: dl, MCID: TII->get(Opcode: PPC::LWARX), DestReg: TmpDestReg)
13512 .addReg(RegNo: ZeroReg)
13513 .addReg(RegNo: PtrReg);
13514 if (BinOpcode)
13515 BuildMI(BB, MIMD: dl, MCID: TII->get(Opcode: BinOpcode), DestReg: TmpReg)
13516 .addReg(RegNo: Incr2Reg)
13517 .addReg(RegNo: TmpDestReg);
13518 BuildMI(BB, MIMD: dl, MCID: TII->get(Opcode: PPC::ANDC), DestReg: Tmp2Reg)
13519 .addReg(RegNo: TmpDestReg)
13520 .addReg(RegNo: MaskReg);
13521 BuildMI(BB, MIMD: dl, MCID: TII->get(Opcode: PPC::AND), DestReg: Tmp3Reg).addReg(RegNo: TmpReg).addReg(RegNo: MaskReg);
13522 if (CmpOpcode) {
13523 // For unsigned comparisons, we can directly compare the shifted values.
13524 // For signed comparisons we shift and sign extend.
13525 Register SReg = RegInfo.createVirtualRegister(RegClass: GPRC);
13526 Register CrReg = RegInfo.createVirtualRegister(RegClass: &PPC::CRRCRegClass);
13527 BuildMI(BB, MIMD: dl, MCID: TII->get(Opcode: PPC::AND), DestReg: SReg)
13528 .addReg(RegNo: TmpDestReg)
13529 .addReg(RegNo: MaskReg);
13530 unsigned ValueReg = SReg;
13531 unsigned CmpReg = Incr2Reg;
13532 if (CmpOpcode == PPC::CMPW) {
13533 ValueReg = RegInfo.createVirtualRegister(RegClass: GPRC);
13534 BuildMI(BB, MIMD: dl, MCID: TII->get(Opcode: PPC::SRW), DestReg: ValueReg)
13535 .addReg(RegNo: SReg)
13536 .addReg(RegNo: ShiftReg);
13537 Register ValueSReg = RegInfo.createVirtualRegister(RegClass: GPRC);
13538 BuildMI(BB, MIMD: dl, MCID: TII->get(Opcode: is8bit ? PPC::EXTSB : PPC::EXTSH), DestReg: ValueSReg)
13539 .addReg(RegNo: ValueReg);
13540 ValueReg = ValueSReg;
13541 CmpReg = incr;
13542 }
13543 BuildMI(BB, MIMD: dl, MCID: TII->get(Opcode: CmpOpcode), DestReg: CrReg).addReg(RegNo: ValueReg).addReg(RegNo: CmpReg);
13544 BuildMI(BB, MIMD: dl, MCID: TII->get(Opcode: PPC::BCC))
13545 .addImm(Val: CmpPred)
13546 .addReg(RegNo: CrReg)
13547 .addMBB(MBB: exitMBB);
13548 BB->addSuccessor(Succ: loop2MBB);
13549 BB->addSuccessor(Succ: exitMBB);
13550 BB = loop2MBB;
13551 }
13552 BuildMI(BB, MIMD: dl, MCID: TII->get(Opcode: PPC::OR), DestReg: Tmp4Reg).addReg(RegNo: Tmp3Reg).addReg(RegNo: Tmp2Reg);
13553 BuildMI(BB, MIMD: dl, MCID: TII->get(Opcode: PPC::STWCX))
13554 .addReg(RegNo: Tmp4Reg)
13555 .addReg(RegNo: ZeroReg)
13556 .addReg(RegNo: PtrReg);
13557 BuildMI(BB, MIMD: dl, MCID: TII->get(Opcode: PPC::BCC))
13558 .addImm(Val: PPC::PRED_NE_MINUS)
13559 .addReg(RegNo: PPC::CR0)
13560 .addMBB(MBB: loopMBB);
13561 BB->addSuccessor(Succ: loopMBB);
13562 BB->addSuccessor(Succ: exitMBB);
13563
13564 // exitMBB:
13565 // ...
13566 BB = exitMBB;
13567 // Since the shift amount is not a constant, we need to clear
13568 // the upper bits with a separate RLWINM.
13569 BuildMI(BB&: *BB, I: BB->begin(), MIMD: dl, MCID: TII->get(Opcode: PPC::RLWINM), DestReg: dest)
13570 .addReg(RegNo: SrwDestReg)
13571 .addImm(Val: 0)
13572 .addImm(Val: is8bit ? 24 : 16)
13573 .addImm(Val: 31);
13574 BuildMI(BB&: *BB, I: BB->begin(), MIMD: dl, MCID: TII->get(Opcode: PPC::SRW), DestReg: SrwDestReg)
13575 .addReg(RegNo: TmpDestReg)
13576 .addReg(RegNo: ShiftReg);
13577 return BB;
13578}
13579
13580llvm::MachineBasicBlock *
13581PPCTargetLowering::emitEHSjLjSetJmp(MachineInstr &MI,
13582 MachineBasicBlock *MBB) const {
13583 DebugLoc DL = MI.getDebugLoc();
13584 const TargetInstrInfo *TII = Subtarget.getInstrInfo();
13585 const PPCRegisterInfo *TRI = Subtarget.getRegisterInfo();
13586
13587 MachineFunction *MF = MBB->getParent();
13588 MachineRegisterInfo &MRI = MF->getRegInfo();
13589
13590 const BasicBlock *BB = MBB->getBasicBlock();
13591 MachineFunction::iterator I = ++MBB->getIterator();
13592
13593 Register DstReg = MI.getOperand(i: 0).getReg();
13594 const TargetRegisterClass *RC = MRI.getRegClass(Reg: DstReg);
13595 assert(TRI->isTypeLegalForClass(*RC, MVT::i32) && "Invalid destination!");
13596 Register mainDstReg = MRI.createVirtualRegister(RegClass: RC);
13597 Register restoreDstReg = MRI.createVirtualRegister(RegClass: RC);
13598
13599 MVT PVT = getPointerTy(DL: MF->getDataLayout());
13600 assert((PVT == MVT::i64 || PVT == MVT::i32) &&
13601 "Invalid Pointer Size!");
13602 // For v = setjmp(buf), we generate
13603 //
13604 // thisMBB:
13605 // SjLjSetup mainMBB
13606 // bl mainMBB
13607 // v_restore = 1
13608 // b sinkMBB
13609 //
13610 // mainMBB:
13611 // buf[LabelOffset] = LR
13612 // v_main = 0
13613 //
13614 // sinkMBB:
13615 // v = phi(main, restore)
13616 //
13617
13618 MachineBasicBlock *thisMBB = MBB;
13619 MachineBasicBlock *mainMBB = MF->CreateMachineBasicBlock(BB);
13620 MachineBasicBlock *sinkMBB = MF->CreateMachineBasicBlock(BB);
13621 MF->insert(MBBI: I, MBB: mainMBB);
13622 MF->insert(MBBI: I, MBB: sinkMBB);
13623
13624 MachineInstrBuilder MIB;
13625
13626 // Transfer the remainder of BB and its successor edges to sinkMBB.
13627 sinkMBB->splice(Where: sinkMBB->begin(), Other: MBB,
13628 From: std::next(x: MachineBasicBlock::iterator(MI)), To: MBB->end());
13629 sinkMBB->transferSuccessorsAndUpdatePHIs(FromMBB: MBB);
13630
13631 // Note that the structure of the jmp_buf used here is not compatible
13632 // with that used by libc, and is not designed to be. Specifically, it
13633 // stores only those 'reserved' registers that LLVM does not otherwise
13634 // understand how to spill. Also, by convention, by the time this
13635 // intrinsic is called, Clang has already stored the frame address in the
13636 // first slot of the buffer and stack address in the third. Following the
13637 // X86 target code, we'll store the jump address in the second slot. We also
13638 // need to save the TOC pointer (R2) to handle jumps between shared
13639 // libraries, and that will be stored in the fourth slot. The thread
13640 // identifier (R13) is not affected.
13641
13642 // thisMBB:
13643 const int64_t LabelOffset = 1 * PVT.getStoreSize();
13644 const int64_t TOCOffset = 3 * PVT.getStoreSize();
13645 const int64_t BPOffset = 4 * PVT.getStoreSize();
13646
13647 // Prepare IP either in reg.
13648 const TargetRegisterClass *PtrRC = getRegClassFor(VT: PVT);
13649 Register LabelReg = MRI.createVirtualRegister(RegClass: PtrRC);
13650 Register BufReg = MI.getOperand(i: 1).getReg();
13651
13652 if (Subtarget.is64BitELFABI()) {
13653 setUsesTOCBasePtr(*MBB->getParent());
13654 MIB = BuildMI(BB&: *thisMBB, I&: MI, MIMD: DL, MCID: TII->get(Opcode: PPC::STD))
13655 .addReg(RegNo: PPC::X2)
13656 .addImm(Val: TOCOffset)
13657 .addReg(RegNo: BufReg)
13658 .cloneMemRefs(OtherMI: MI);
13659 }
13660
13661 // Naked functions never have a base pointer, and so we use r1. For all
13662 // other functions, this decision must be delayed until during PEI.
13663 unsigned BaseReg;
13664 if (MF->getFunction().hasFnAttribute(Kind: Attribute::Naked))
13665 BaseReg = Subtarget.isPPC64() ? PPC::X1 : PPC::R1;
13666 else
13667 BaseReg = Subtarget.isPPC64() ? PPC::BP8 : PPC::BP;
13668
13669 MIB = BuildMI(BB&: *thisMBB, I&: MI, MIMD: DL,
13670 MCID: TII->get(Opcode: Subtarget.isPPC64() ? PPC::STD : PPC::STW))
13671 .addReg(RegNo: BaseReg)
13672 .addImm(Val: BPOffset)
13673 .addReg(RegNo: BufReg)
13674 .cloneMemRefs(OtherMI: MI);
13675
13676 // Setup
13677 MIB = BuildMI(BB&: *thisMBB, I&: MI, MIMD: DL, MCID: TII->get(Opcode: PPC::BCLalways)).addMBB(MBB: mainMBB);
13678 MIB.addRegMask(Mask: TRI->getNoPreservedMask());
13679
13680 BuildMI(BB&: *thisMBB, I&: MI, MIMD: DL, MCID: TII->get(Opcode: PPC::LI), DestReg: restoreDstReg).addImm(Val: 1);
13681
13682 MIB = BuildMI(BB&: *thisMBB, I&: MI, MIMD: DL, MCID: TII->get(Opcode: PPC::EH_SjLj_Setup))
13683 .addMBB(MBB: mainMBB);
13684 MIB = BuildMI(BB&: *thisMBB, I&: MI, MIMD: DL, MCID: TII->get(Opcode: PPC::B)).addMBB(MBB: sinkMBB);
13685
13686 thisMBB->addSuccessor(Succ: mainMBB, Prob: BranchProbability::getZero());
13687 thisMBB->addSuccessor(Succ: sinkMBB, Prob: BranchProbability::getOne());
13688
13689 // mainMBB:
13690 // mainDstReg = 0
13691 MIB =
13692 BuildMI(BB: mainMBB, MIMD: DL,
13693 MCID: TII->get(Opcode: Subtarget.isPPC64() ? PPC::MFLR8 : PPC::MFLR), DestReg: LabelReg);
13694
13695 // Store IP
13696 if (Subtarget.isPPC64()) {
13697 MIB = BuildMI(BB: mainMBB, MIMD: DL, MCID: TII->get(Opcode: PPC::STD))
13698 .addReg(RegNo: LabelReg)
13699 .addImm(Val: LabelOffset)
13700 .addReg(RegNo: BufReg);
13701 } else {
13702 MIB = BuildMI(BB: mainMBB, MIMD: DL, MCID: TII->get(Opcode: PPC::STW))
13703 .addReg(RegNo: LabelReg)
13704 .addImm(Val: LabelOffset)
13705 .addReg(RegNo: BufReg);
13706 }
13707 MIB.cloneMemRefs(OtherMI: MI);
13708
13709 BuildMI(BB: mainMBB, MIMD: DL, MCID: TII->get(Opcode: PPC::LI), DestReg: mainDstReg).addImm(Val: 0);
13710 mainMBB->addSuccessor(Succ: sinkMBB);
13711
13712 // sinkMBB:
13713 BuildMI(BB&: *sinkMBB, I: sinkMBB->begin(), MIMD: DL,
13714 MCID: TII->get(Opcode: PPC::PHI), DestReg: DstReg)
13715 .addReg(RegNo: mainDstReg).addMBB(MBB: mainMBB)
13716 .addReg(RegNo: restoreDstReg).addMBB(MBB: thisMBB);
13717
13718 MI.eraseFromParent();
13719 return sinkMBB;
13720}
13721
13722MachineBasicBlock *
13723PPCTargetLowering::emitEHSjLjLongJmp(MachineInstr &MI,
13724 MachineBasicBlock *MBB) const {
13725 DebugLoc DL = MI.getDebugLoc();
13726 const TargetInstrInfo *TII = Subtarget.getInstrInfo();
13727
13728 MachineFunction *MF = MBB->getParent();
13729 MachineRegisterInfo &MRI = MF->getRegInfo();
13730
13731 MVT PVT = getPointerTy(DL: MF->getDataLayout());
13732 assert((PVT == MVT::i64 || PVT == MVT::i32) &&
13733 "Invalid Pointer Size!");
13734
13735 const TargetRegisterClass *RC =
13736 (PVT == MVT::i64) ? &PPC::G8RCRegClass : &PPC::GPRCRegClass;
13737 Register Tmp = MRI.createVirtualRegister(RegClass: RC);
13738 // Since FP is only updated here but NOT referenced, it's treated as GPR.
13739 unsigned FP = (PVT == MVT::i64) ? PPC::X31 : PPC::R31;
13740 unsigned SP = (PVT == MVT::i64) ? PPC::X1 : PPC::R1;
13741 unsigned BP =
13742 (PVT == MVT::i64)
13743 ? PPC::X30
13744 : (Subtarget.isSVR4ABI() && isPositionIndependent() ? PPC::R29
13745 : PPC::R30);
13746
13747 MachineInstrBuilder MIB;
13748
13749 const int64_t LabelOffset = 1 * PVT.getStoreSize();
13750 const int64_t SPOffset = 2 * PVT.getStoreSize();
13751 const int64_t TOCOffset = 3 * PVT.getStoreSize();
13752 const int64_t BPOffset = 4 * PVT.getStoreSize();
13753
13754 Register BufReg = MI.getOperand(i: 0).getReg();
13755
13756 // Reload FP (the jumped-to function may not have had a
13757 // frame pointer, and if so, then its r31 will be restored
13758 // as necessary).
13759 if (PVT == MVT::i64) {
13760 MIB = BuildMI(BB&: *MBB, I&: MI, MIMD: DL, MCID: TII->get(Opcode: PPC::LD), DestReg: FP)
13761 .addImm(Val: 0)
13762 .addReg(RegNo: BufReg);
13763 } else {
13764 MIB = BuildMI(BB&: *MBB, I&: MI, MIMD: DL, MCID: TII->get(Opcode: PPC::LWZ), DestReg: FP)
13765 .addImm(Val: 0)
13766 .addReg(RegNo: BufReg);
13767 }
13768 MIB.cloneMemRefs(OtherMI: MI);
13769
13770 // Reload IP
13771 if (PVT == MVT::i64) {
13772 MIB = BuildMI(BB&: *MBB, I&: MI, MIMD: DL, MCID: TII->get(Opcode: PPC::LD), DestReg: Tmp)
13773 .addImm(Val: LabelOffset)
13774 .addReg(RegNo: BufReg);
13775 } else {
13776 MIB = BuildMI(BB&: *MBB, I&: MI, MIMD: DL, MCID: TII->get(Opcode: PPC::LWZ), DestReg: Tmp)
13777 .addImm(Val: LabelOffset)
13778 .addReg(RegNo: BufReg);
13779 }
13780 MIB.cloneMemRefs(OtherMI: MI);
13781
13782 // Reload SP
13783 if (PVT == MVT::i64) {
13784 MIB = BuildMI(BB&: *MBB, I&: MI, MIMD: DL, MCID: TII->get(Opcode: PPC::LD), DestReg: SP)
13785 .addImm(Val: SPOffset)
13786 .addReg(RegNo: BufReg);
13787 } else {
13788 MIB = BuildMI(BB&: *MBB, I&: MI, MIMD: DL, MCID: TII->get(Opcode: PPC::LWZ), DestReg: SP)
13789 .addImm(Val: SPOffset)
13790 .addReg(RegNo: BufReg);
13791 }
13792 MIB.cloneMemRefs(OtherMI: MI);
13793
13794 // Reload BP
13795 if (PVT == MVT::i64) {
13796 MIB = BuildMI(BB&: *MBB, I&: MI, MIMD: DL, MCID: TII->get(Opcode: PPC::LD), DestReg: BP)
13797 .addImm(Val: BPOffset)
13798 .addReg(RegNo: BufReg);
13799 } else {
13800 MIB = BuildMI(BB&: *MBB, I&: MI, MIMD: DL, MCID: TII->get(Opcode: PPC::LWZ), DestReg: BP)
13801 .addImm(Val: BPOffset)
13802 .addReg(RegNo: BufReg);
13803 }
13804 MIB.cloneMemRefs(OtherMI: MI);
13805
13806 // Reload TOC
13807 if (PVT == MVT::i64 && Subtarget.isSVR4ABI()) {
13808 setUsesTOCBasePtr(*MBB->getParent());
13809 MIB = BuildMI(BB&: *MBB, I&: MI, MIMD: DL, MCID: TII->get(Opcode: PPC::LD), DestReg: PPC::X2)
13810 .addImm(Val: TOCOffset)
13811 .addReg(RegNo: BufReg)
13812 .cloneMemRefs(OtherMI: MI);
13813 }
13814
13815 // Jump
13816 BuildMI(BB&: *MBB, I&: MI, MIMD: DL,
13817 MCID: TII->get(Opcode: PVT == MVT::i64 ? PPC::MTCTR8 : PPC::MTCTR)).addReg(RegNo: Tmp);
13818 BuildMI(BB&: *MBB, I&: MI, MIMD: DL, MCID: TII->get(Opcode: PVT == MVT::i64 ? PPC::BCTR8 : PPC::BCTR));
13819
13820 MI.eraseFromParent();
13821 return MBB;
13822}
13823
13824bool PPCTargetLowering::hasInlineStackProbe(const MachineFunction &MF) const {
13825 // If the function specifically requests inline stack probes, emit them.
13826 if (MF.getFunction().hasFnAttribute(Kind: "probe-stack"))
13827 return MF.getFunction().getFnAttribute(Kind: "probe-stack").getValueAsString() ==
13828 "inline-asm";
13829 return false;
13830}
13831
13832unsigned PPCTargetLowering::getStackProbeSize(const MachineFunction &MF) const {
13833 const TargetFrameLowering *TFI = Subtarget.getFrameLowering();
13834 unsigned StackAlign = TFI->getStackAlignment();
13835 assert(StackAlign >= 1 && isPowerOf2_32(StackAlign) &&
13836 "Unexpected stack alignment");
13837 // The default stack probe size is 4096 if the function has no
13838 // stack-probe-size attribute.
13839 const Function &Fn = MF.getFunction();
13840 unsigned StackProbeSize =
13841 Fn.getFnAttributeAsParsedInteger(Kind: "stack-probe-size", Default: 4096);
13842 // Round down to the stack alignment.
13843 StackProbeSize &= ~(StackAlign - 1);
13844 return StackProbeSize ? StackProbeSize : StackAlign;
13845}
13846
13847// Lower dynamic stack allocation with probing. `emitProbedAlloca` is splitted
13848// into three phases. In the first phase, it uses pseudo instruction
13849// PREPARE_PROBED_ALLOCA to get the future result of actual FramePointer and
13850// FinalStackPtr. In the second phase, it generates a loop for probing blocks.
13851// At last, it uses pseudo instruction DYNAREAOFFSET to get the future result of
13852// MaxCallFrameSize so that it can calculate correct data area pointer.
13853MachineBasicBlock *
13854PPCTargetLowering::emitProbedAlloca(MachineInstr &MI,
13855 MachineBasicBlock *MBB) const {
13856 const bool isPPC64 = Subtarget.isPPC64();
13857 MachineFunction *MF = MBB->getParent();
13858 const TargetInstrInfo *TII = Subtarget.getInstrInfo();
13859 DebugLoc DL = MI.getDebugLoc();
13860 const unsigned ProbeSize = getStackProbeSize(MF: *MF);
13861 const BasicBlock *ProbedBB = MBB->getBasicBlock();
13862 MachineRegisterInfo &MRI = MF->getRegInfo();
13863 // The CFG of probing stack looks as
13864 // +-----+
13865 // | MBB |
13866 // +--+--+
13867 // |
13868 // +----v----+
13869 // +--->+ TestMBB +---+
13870 // | +----+----+ |
13871 // | | |
13872 // | +-----v----+ |
13873 // +---+ BlockMBB | |
13874 // +----------+ |
13875 // |
13876 // +---------+ |
13877 // | TailMBB +<--+
13878 // +---------+
13879 // In MBB, calculate previous frame pointer and final stack pointer.
13880 // In TestMBB, test if sp is equal to final stack pointer, if so, jump to
13881 // TailMBB. In BlockMBB, update the sp atomically and jump back to TestMBB.
13882 // TailMBB is spliced via \p MI.
13883 MachineBasicBlock *TestMBB = MF->CreateMachineBasicBlock(BB: ProbedBB);
13884 MachineBasicBlock *TailMBB = MF->CreateMachineBasicBlock(BB: ProbedBB);
13885 MachineBasicBlock *BlockMBB = MF->CreateMachineBasicBlock(BB: ProbedBB);
13886
13887 MachineFunction::iterator MBBIter = ++MBB->getIterator();
13888 MF->insert(MBBI: MBBIter, MBB: TestMBB);
13889 MF->insert(MBBI: MBBIter, MBB: BlockMBB);
13890 MF->insert(MBBI: MBBIter, MBB: TailMBB);
13891
13892 const TargetRegisterClass *G8RC = &PPC::G8RCRegClass;
13893 const TargetRegisterClass *GPRC = &PPC::GPRCRegClass;
13894
13895 Register DstReg = MI.getOperand(i: 0).getReg();
13896 Register NegSizeReg = MI.getOperand(i: 1).getReg();
13897 Register SPReg = isPPC64 ? PPC::X1 : PPC::R1;
13898 Register FinalStackPtr = MRI.createVirtualRegister(RegClass: isPPC64 ? G8RC : GPRC);
13899 Register FramePointer = MRI.createVirtualRegister(RegClass: isPPC64 ? G8RC : GPRC);
13900 Register ActualNegSizeReg = MRI.createVirtualRegister(RegClass: isPPC64 ? G8RC : GPRC);
13901
13902 // Since value of NegSizeReg might be realigned in prologepilog, insert a
13903 // PREPARE_PROBED_ALLOCA pseudo instruction to get actual FramePointer and
13904 // NegSize.
13905 unsigned ProbeOpc;
13906 if (!MRI.hasOneNonDBGUse(RegNo: NegSizeReg))
13907 ProbeOpc =
13908 isPPC64 ? PPC::PREPARE_PROBED_ALLOCA_64 : PPC::PREPARE_PROBED_ALLOCA_32;
13909 else
13910 // By introducing PREPARE_PROBED_ALLOCA_NEGSIZE_OPT, ActualNegSizeReg
13911 // and NegSizeReg will be allocated in the same phyreg to avoid
13912 // redundant copy when NegSizeReg has only one use which is current MI and
13913 // will be replaced by PREPARE_PROBED_ALLOCA then.
13914 ProbeOpc = isPPC64 ? PPC::PREPARE_PROBED_ALLOCA_NEGSIZE_SAME_REG_64
13915 : PPC::PREPARE_PROBED_ALLOCA_NEGSIZE_SAME_REG_32;
13916 BuildMI(BB&: *MBB, I&: {MI}, MIMD: DL, MCID: TII->get(Opcode: ProbeOpc), DestReg: FramePointer)
13917 .addDef(RegNo: ActualNegSizeReg)
13918 .addReg(RegNo: NegSizeReg)
13919 .add(MO: MI.getOperand(i: 2))
13920 .add(MO: MI.getOperand(i: 3));
13921
13922 // Calculate final stack pointer, which equals to SP + ActualNegSize.
13923 BuildMI(BB&: *MBB, I&: {MI}, MIMD: DL, MCID: TII->get(Opcode: isPPC64 ? PPC::ADD8 : PPC::ADD4),
13924 DestReg: FinalStackPtr)
13925 .addReg(RegNo: SPReg)
13926 .addReg(RegNo: ActualNegSizeReg);
13927
13928 // Materialize a scratch register for update.
13929 int64_t NegProbeSize = -(int64_t)ProbeSize;
13930 assert(isInt<32>(NegProbeSize) && "Unhandled probe size!");
13931 Register ScratchReg = MRI.createVirtualRegister(RegClass: isPPC64 ? G8RC : GPRC);
13932 if (!isInt<16>(x: NegProbeSize)) {
13933 Register TempReg = MRI.createVirtualRegister(RegClass: isPPC64 ? G8RC : GPRC);
13934 BuildMI(BB&: *MBB, I&: {MI}, MIMD: DL, MCID: TII->get(Opcode: isPPC64 ? PPC::LIS8 : PPC::LIS), DestReg: TempReg)
13935 .addImm(Val: NegProbeSize >> 16);
13936 BuildMI(BB&: *MBB, I&: {MI}, MIMD: DL, MCID: TII->get(Opcode: isPPC64 ? PPC::ORI8 : PPC::ORI),
13937 DestReg: ScratchReg)
13938 .addReg(RegNo: TempReg)
13939 .addImm(Val: NegProbeSize & 0xFFFF);
13940 } else
13941 BuildMI(BB&: *MBB, I&: {MI}, MIMD: DL, MCID: TII->get(Opcode: isPPC64 ? PPC::LI8 : PPC::LI), DestReg: ScratchReg)
13942 .addImm(Val: NegProbeSize);
13943
13944 {
13945 // Probing leading residual part.
13946 Register Div = MRI.createVirtualRegister(RegClass: isPPC64 ? G8RC : GPRC);
13947 BuildMI(BB&: *MBB, I&: {MI}, MIMD: DL, MCID: TII->get(Opcode: isPPC64 ? PPC::DIVD : PPC::DIVW), DestReg: Div)
13948 .addReg(RegNo: ActualNegSizeReg)
13949 .addReg(RegNo: ScratchReg);
13950 Register Mul = MRI.createVirtualRegister(RegClass: isPPC64 ? G8RC : GPRC);
13951 BuildMI(BB&: *MBB, I&: {MI}, MIMD: DL, MCID: TII->get(Opcode: isPPC64 ? PPC::MULLD : PPC::MULLW), DestReg: Mul)
13952 .addReg(RegNo: Div)
13953 .addReg(RegNo: ScratchReg);
13954 Register NegMod = MRI.createVirtualRegister(RegClass: isPPC64 ? G8RC : GPRC);
13955 BuildMI(BB&: *MBB, I&: {MI}, MIMD: DL, MCID: TII->get(Opcode: isPPC64 ? PPC::SUBF8 : PPC::SUBF), DestReg: NegMod)
13956 .addReg(RegNo: Mul)
13957 .addReg(RegNo: ActualNegSizeReg);
13958 BuildMI(BB&: *MBB, I&: {MI}, MIMD: DL, MCID: TII->get(Opcode: isPPC64 ? PPC::STDUX : PPC::STWUX), DestReg: SPReg)
13959 .addReg(RegNo: FramePointer)
13960 .addReg(RegNo: SPReg)
13961 .addReg(RegNo: NegMod);
13962 }
13963
13964 {
13965 // Remaining part should be multiple of ProbeSize.
13966 Register CmpResult = MRI.createVirtualRegister(RegClass: &PPC::CRRCRegClass);
13967 BuildMI(BB: TestMBB, MIMD: DL, MCID: TII->get(Opcode: isPPC64 ? PPC::CMPD : PPC::CMPW), DestReg: CmpResult)
13968 .addReg(RegNo: SPReg)
13969 .addReg(RegNo: FinalStackPtr);
13970 BuildMI(BB: TestMBB, MIMD: DL, MCID: TII->get(Opcode: PPC::BCC))
13971 .addImm(Val: PPC::PRED_EQ)
13972 .addReg(RegNo: CmpResult)
13973 .addMBB(MBB: TailMBB);
13974 TestMBB->addSuccessor(Succ: BlockMBB);
13975 TestMBB->addSuccessor(Succ: TailMBB);
13976 }
13977
13978 {
13979 // Touch the block.
13980 // |P...|P...|P...
13981 BuildMI(BB: BlockMBB, MIMD: DL, MCID: TII->get(Opcode: isPPC64 ? PPC::STDUX : PPC::STWUX), DestReg: SPReg)
13982 .addReg(RegNo: FramePointer)
13983 .addReg(RegNo: SPReg)
13984 .addReg(RegNo: ScratchReg);
13985 BuildMI(BB: BlockMBB, MIMD: DL, MCID: TII->get(Opcode: PPC::B)).addMBB(MBB: TestMBB);
13986 BlockMBB->addSuccessor(Succ: TestMBB);
13987 }
13988
13989 // Calculation of MaxCallFrameSize is deferred to prologepilog, use
13990 // DYNAREAOFFSET pseudo instruction to get the future result.
13991 Register MaxCallFrameSizeReg =
13992 MRI.createVirtualRegister(RegClass: isPPC64 ? G8RC : GPRC);
13993 BuildMI(BB: TailMBB, MIMD: DL,
13994 MCID: TII->get(Opcode: isPPC64 ? PPC::DYNAREAOFFSET8 : PPC::DYNAREAOFFSET),
13995 DestReg: MaxCallFrameSizeReg)
13996 .add(MO: MI.getOperand(i: 2))
13997 .add(MO: MI.getOperand(i: 3));
13998 BuildMI(BB: TailMBB, MIMD: DL, MCID: TII->get(Opcode: isPPC64 ? PPC::ADD8 : PPC::ADD4), DestReg: DstReg)
13999 .addReg(RegNo: SPReg)
14000 .addReg(RegNo: MaxCallFrameSizeReg);
14001
14002 // Splice instructions after MI to TailMBB.
14003 TailMBB->splice(Where: TailMBB->end(), Other: MBB,
14004 From: std::next(x: MachineBasicBlock::iterator(MI)), To: MBB->end());
14005 TailMBB->transferSuccessorsAndUpdatePHIs(FromMBB: MBB);
14006 MBB->addSuccessor(Succ: TestMBB);
14007
14008 // Delete the pseudo instruction.
14009 MI.eraseFromParent();
14010
14011 ++NumDynamicAllocaProbed;
14012 return TailMBB;
14013}
14014
14015/// Check if the opcode is a SELECT or SELECT_CC variant.
14016/// @param Opcode The opcode to check
14017/// @param CheckOnlyCC If true, only return true for SELECT_CC variants;
14018/// if false, return true for both SELECT and SELECT_CC
14019static bool IsSelect(unsigned Opcode, bool CheckOnlyCC = false) {
14020 switch (Opcode) {
14021 // SELECT_CC variants - always return true
14022 case PPC::SELECT_CC_I4:
14023 case PPC::SELECT_CC_I8:
14024 case PPC::SELECT_CC_F4:
14025 case PPC::SELECT_CC_F8:
14026 case PPC::SELECT_CC_F16:
14027 case PPC::SELECT_CC_VRRC:
14028 case PPC::SELECT_CC_VSFRC:
14029 case PPC::SELECT_CC_VSSRC:
14030 case PPC::SELECT_CC_VSRC:
14031 case PPC::SELECT_CC_SPE4:
14032 case PPC::SELECT_CC_SPE:
14033 return true;
14034 // SELECT variants - only return true if CheckOnlyCC is false
14035 case PPC::SELECT_I4:
14036 case PPC::SELECT_I8:
14037 case PPC::SELECT_F4:
14038 case PPC::SELECT_F8:
14039 case PPC::SELECT_F16:
14040 case PPC::SELECT_SPE:
14041 case PPC::SELECT_SPE4:
14042 case PPC::SELECT_VRRC:
14043 case PPC::SELECT_VSFRC:
14044 case PPC::SELECT_VSSRC:
14045 case PPC::SELECT_VSRC:
14046 return !CheckOnlyCC; // true if checking all SELECTs, false if only CC
14047 default:
14048 return false;
14049 }
14050}
14051static bool IsSelectCC(unsigned Opcode) { return IsSelect(Opcode, CheckOnlyCC: true); }
14052
14053/// Emit SELECT instruction, using ISEL if available, otherwise use
14054/// branch-based control flow.
14055///
14056/// For targets with ISEL support (SELECT_CC_I4/I8, SELECT_I4/I8), this
14057/// generates a single ISEL instruction. Otherwise, it creates a
14058/// branch-based control flow pattern with PHI nodes.
14059static MachineBasicBlock *emitSelect(MachineInstr &MI, MachineBasicBlock *BB,
14060 const TargetInstrInfo *TII,
14061 const PPCSubtarget &Subtarget) {
14062 assert(IsSelect(MI.getOpcode()) && "Instruction must be a SELECT variant");
14063
14064 // Check if we can use ISEL for this SELECT
14065 if (Subtarget.hasISEL() &&
14066 (MI.getOpcode() == PPC::SELECT_CC_I4 ||
14067 MI.getOpcode() == PPC::SELECT_CC_I8 ||
14068 MI.getOpcode() == PPC::SELECT_I4 || MI.getOpcode() == PPC::SELECT_I8)) {
14069 SmallVector<MachineOperand, 2> Cond;
14070 if (MI.getOpcode() == PPC::SELECT_CC_I4 ||
14071 MI.getOpcode() == PPC::SELECT_CC_I8)
14072 Cond.push_back(Elt: MI.getOperand(i: 4));
14073 else
14074 Cond.push_back(Elt: MachineOperand::CreateImm(Val: PPC::PRED_BIT_SET));
14075 Cond.push_back(Elt: MI.getOperand(i: 1));
14076
14077 DebugLoc dl = MI.getDebugLoc();
14078 TII->insertSelect(MBB&: *BB, I: MI, DL: dl, DstReg: MI.getOperand(i: 0).getReg(), Cond,
14079 TrueReg: MI.getOperand(i: 2).getReg(), FalseReg: MI.getOperand(i: 3).getReg());
14080 MI.eraseFromParent();
14081 return BB;
14082 }
14083
14084 // Fall back to branch-based SELECT implementation
14085 MachineFunction *F = BB->getParent();
14086 const BasicBlock *LLVM_BB = BB->getBasicBlock();
14087 MachineFunction::iterator It = ++BB->getIterator();
14088 DebugLoc dl = MI.getDebugLoc();
14089
14090 MachineBasicBlock *thisMBB = BB;
14091 MachineBasicBlock *copy0MBB = F->CreateMachineBasicBlock(BB: LLVM_BB);
14092 MachineBasicBlock *sinkMBB = F->CreateMachineBasicBlock(BB: LLVM_BB);
14093 F->insert(MBBI: It, MBB: copy0MBB);
14094 F->insert(MBBI: It, MBB: sinkMBB);
14095
14096 if (isPhysRegUsedAfter(Reg: PPC::CARRY, MBI: MI.getIterator())) {
14097 copy0MBB->addLiveIn(PhysReg: PPC::CARRY);
14098 sinkMBB->addLiveIn(PhysReg: PPC::CARRY);
14099 }
14100
14101 // Set the call frame size on entry to the new basic blocks.
14102 unsigned CallFrameSize = TII->getCallFrameSizeAt(MI);
14103 copy0MBB->setCallFrameSize(CallFrameSize);
14104 sinkMBB->setCallFrameSize(CallFrameSize);
14105
14106 // Transfer the remainder of BB and its successor edges to sinkMBB.
14107 sinkMBB->splice(Where: sinkMBB->begin(), Other: BB,
14108 From: std::next(x: MachineBasicBlock::iterator(MI)), To: BB->end());
14109 sinkMBB->transferSuccessorsAndUpdatePHIs(FromMBB: BB);
14110
14111 // Add successors
14112 BB->addSuccessor(Succ: copy0MBB);
14113 BB->addSuccessor(Succ: sinkMBB);
14114
14115 // Build branch instruction
14116 if (IsSelectCC(Opcode: MI.getOpcode()))
14117 BuildMI(BB, MIMD: dl, MCID: TII->get(Opcode: PPC::BCC))
14118 .addImm(Val: MI.getOperand(i: 4).getImm())
14119 .addReg(RegNo: MI.getOperand(i: 1).getReg())
14120 .addMBB(MBB: sinkMBB);
14121 else
14122 BuildMI(BB, MIMD: dl, MCID: TII->get(Opcode: PPC::BC))
14123 .addReg(RegNo: MI.getOperand(i: 1).getReg())
14124 .addMBB(MBB: sinkMBB);
14125
14126 // copy0MBB: fallthrough to sinkMBB
14127 BB = copy0MBB;
14128 BB->addSuccessor(Succ: sinkMBB);
14129
14130 // sinkMBB: PHI instruction
14131 BB = sinkMBB;
14132 BuildMI(BB&: *BB, I: BB->begin(), MIMD: dl, MCID: TII->get(Opcode: PPC::PHI), DestReg: MI.getOperand(i: 0).getReg())
14133 .addReg(RegNo: MI.getOperand(i: 3).getReg())
14134 .addMBB(MBB: copy0MBB)
14135 .addReg(RegNo: MI.getOperand(i: 2).getReg())
14136 .addMBB(MBB: thisMBB);
14137 MI.eraseFromParent();
14138 return BB;
14139}
14140
14141/// Helper function to create basic blocks for atomic compare-and-swap.
14142/// Creates three basic blocks (loop1MBB, loop2MBB, exitMBB) and sets up
14143/// the control flow structure common to both hardware and software
14144/// implementations of atomic compare-and-swap operations.
14145static void createAtomicLoopBlocks(MachineFunction *F, MachineBasicBlock *BB,
14146 MachineBasicBlock *&loop1MBB,
14147 MachineBasicBlock *&loop2MBB,
14148 MachineBasicBlock *&exitMBB,
14149 MachineInstr &MI,
14150 MachineFunction::iterator It) {
14151 const BasicBlock *LLVM_BB = BB->getBasicBlock();
14152 loop1MBB = F->CreateMachineBasicBlock(BB: LLVM_BB);
14153 loop2MBB = F->CreateMachineBasicBlock(BB: LLVM_BB);
14154 exitMBB = F->CreateMachineBasicBlock(BB: LLVM_BB);
14155 F->insert(MBBI: It, MBB: loop1MBB);
14156 F->insert(MBBI: It, MBB: loop2MBB);
14157 F->insert(MBBI: It, MBB: exitMBB);
14158 exitMBB->splice(Where: exitMBB->begin(), Other: BB,
14159 From: std::next(x: MachineBasicBlock::iterator(MI)), To: BB->end());
14160 exitMBB->transferSuccessorsAndUpdatePHIs(FromMBB: BB);
14161 BB->addSuccessor(Succ: loop1MBB);
14162}
14163
14164/// Emit hardware-supported atomic compare-and-swap for I32/I64 and I8/I16
14165/// with partword atomic support.
14166///
14167/// This uses native PowerPC atomic instructions (LBARX/LHARX/LWARX/LDARX for
14168/// load-and-reserve, STBCX/STHCX/STWCX/STDCX for store-conditional) to
14169/// implement atomic compare-and-swap at byte, halfword, word, or doubleword
14170/// granularity.
14171///
14172/// Control flow:
14173/// thisMBB -> loop1MBB -> loop2MBB -> exitMBB
14174/// | |
14175/// +------------+
14176///
14177/// loop1MBB:
14178/// - Load-and-reserve from memory
14179/// - Compare loaded value with expected old value
14180/// - Branch to exitMBB if not equal (CAS failed)
14181/// loop2MBB:
14182/// - Store-conditional new value to memory
14183/// - Branch back to loop1MBB if store failed (retry)
14184/// - Fall through to exitMBB on success
14185static MachineBasicBlock *
14186emitAtomicCmpSwapHardware(MachineInstr &MI, MachineBasicBlock *BB,
14187 const TargetInstrInfo *TII,
14188 const PPCSubtarget &Subtarget) {
14189 MachineFunction *F = BB->getParent();
14190 MachineFunction::iterator It = ++BB->getIterator();
14191
14192 bool is64bit = MI.getOpcode() == PPC::ATOMIC_CMP_SWAP_I64;
14193
14194 unsigned LoadMnemonic = PPC::LDARX;
14195 unsigned StoreMnemonic = PPC::STDCX;
14196 switch (MI.getOpcode()) {
14197 default:
14198 llvm_unreachable("Compare and swap of unknown size");
14199 case PPC::ATOMIC_CMP_SWAP_I8:
14200 LoadMnemonic = PPC::LBARX;
14201 StoreMnemonic = PPC::STBCX;
14202 assert(Subtarget.hasPartwordAtomics() && "No support partword atomics.");
14203 break;
14204 case PPC::ATOMIC_CMP_SWAP_I16:
14205 LoadMnemonic = PPC::LHARX;
14206 StoreMnemonic = PPC::STHCX;
14207 assert(Subtarget.hasPartwordAtomics() && "No support partword atomics.");
14208 break;
14209 case PPC::ATOMIC_CMP_SWAP_I32:
14210 LoadMnemonic = PPC::LWARX;
14211 StoreMnemonic = PPC::STWCX;
14212 break;
14213 case PPC::ATOMIC_CMP_SWAP_I64:
14214 LoadMnemonic = PPC::LDARX;
14215 StoreMnemonic = PPC::STDCX;
14216 break;
14217 }
14218
14219 MachineRegisterInfo &RegInfo = F->getRegInfo();
14220 Register dest = MI.getOperand(i: 0).getReg();
14221 Register ptrA = MI.getOperand(i: 1).getReg();
14222 Register ptrB = MI.getOperand(i: 2).getReg();
14223 Register oldval = MI.getOperand(i: 3).getReg();
14224 Register newval = MI.getOperand(i: 4).getReg();
14225 DebugLoc dl = MI.getDebugLoc();
14226
14227 MachineBasicBlock *loop1MBB, *loop2MBB, *exitMBB;
14228 createAtomicLoopBlocks(F, BB, loop1MBB, loop2MBB, exitMBB, MI, It);
14229
14230 Register CrReg = RegInfo.createVirtualRegister(RegClass: &PPC::CRRCRegClass);
14231
14232 // loop1MBB:
14233 // l[bhwd]arx dest, ptr
14234 // cmp[wd] dest, oldval
14235 // bne- exitBB
14236 BB = loop1MBB;
14237 BuildMI(BB, MIMD: dl, MCID: TII->get(Opcode: LoadMnemonic), DestReg: dest).addReg(RegNo: ptrA).addReg(RegNo: ptrB);
14238 BuildMI(BB, MIMD: dl, MCID: TII->get(Opcode: is64bit ? PPC::CMPD : PPC::CMPW), DestReg: CrReg)
14239 .addReg(RegNo: dest)
14240 .addReg(RegNo: oldval);
14241 BuildMI(BB, MIMD: dl, MCID: TII->get(Opcode: PPC::BCC))
14242 .addImm(Val: PPC::PRED_NE_MINUS)
14243 .addReg(RegNo: CrReg)
14244 .addMBB(MBB: exitMBB);
14245 BB->addSuccessor(Succ: loop2MBB);
14246 BB->addSuccessor(Succ: exitMBB);
14247
14248 // loop2MBB:
14249 // st[bhwd]cx. newval, ptr
14250 // bne- loopMBB
14251 // b exitBB
14252 BB = loop2MBB;
14253 BuildMI(BB, MIMD: dl, MCID: TII->get(Opcode: StoreMnemonic))
14254 .addReg(RegNo: newval)
14255 .addReg(RegNo: ptrA)
14256 .addReg(RegNo: ptrB);
14257 BuildMI(BB, MIMD: dl, MCID: TII->get(Opcode: PPC::BCC))
14258 .addImm(Val: PPC::PRED_NE_MINUS)
14259 .addReg(RegNo: PPC::CR0)
14260 .addMBB(MBB: loop1MBB);
14261 BuildMI(BB, MIMD: dl, MCID: TII->get(Opcode: PPC::B)).addMBB(MBB: exitMBB);
14262 BB->addSuccessor(Succ: loop1MBB);
14263 BB->addSuccessor(Succ: exitMBB);
14264
14265 return exitMBB;
14266}
14267
14268/// Emit software-emulated atomic compare-and-swap for I8/I16 without
14269/// hardware partword atomic support.
14270///
14271/// This emulates byte/halfword atomic operations using word (32-bit) atomic
14272/// instructions. Since PowerPC atomic instructions work at word granularity,
14273/// we must:
14274/// 1. Align the pointer to a word boundary
14275/// 2. Calculate the bit shift for the target byte/halfword within the word
14276/// 3. Create masks to isolate the target byte/halfword
14277/// 4. Shift old/new values into the correct bit position
14278/// 5. Use LWARX/STWCX on the full word
14279/// 6. Mask and merge to preserve other bytes in the word
14280/// 7. Extract and shift the result back
14281///
14282/// Control flow:
14283/// thisMBB -> loop1MBB -> loop2MBB -> exitMBB
14284/// | |
14285/// +------------+
14286///
14287/// loop1MBB:
14288/// - LWARX: Load-and-reserve full word
14289/// - Mask to extract target byte/halfword
14290/// - Compare with expected old value
14291/// - Branch to exitMBB if not equal (CAS failed)
14292/// loop2MBB:
14293/// - Merge new value with other bytes in the word
14294/// - STWCX: Store-conditional full word
14295/// - Branch back to loop1MBB if store failed (retry)
14296/// - Fall through to exitMBB on success
14297/// exitMBB:
14298/// - Extract and return the loaded value
14299static MachineBasicBlock *
14300emitAtomicCmpSwapSoftware(MachineInstr &MI, MachineBasicBlock *BB,
14301 const TargetInstrInfo *TII,
14302 const PPCSubtarget &Subtarget) {
14303 MachineFunction *F = BB->getParent();
14304 MachineFunction::iterator It = ++BB->getIterator();
14305
14306 bool is64bit = Subtarget.isPPC64();
14307 bool isLittleEndian = Subtarget.isLittleEndian();
14308 bool is8bit = MI.getOpcode() == PPC::ATOMIC_CMP_SWAP_I8;
14309
14310 Register dest = MI.getOperand(i: 0).getReg();
14311 Register ptrA = MI.getOperand(i: 1).getReg();
14312 Register ptrB = MI.getOperand(i: 2).getReg();
14313 Register oldval = MI.getOperand(i: 3).getReg();
14314 Register newval = MI.getOperand(i: 4).getReg();
14315 DebugLoc dl = MI.getDebugLoc();
14316
14317 MachineBasicBlock *loop1MBB, *loop2MBB, *exitMBB;
14318 createAtomicLoopBlocks(F, BB, loop1MBB, loop2MBB, exitMBB, MI, It);
14319
14320 MachineRegisterInfo &RegInfo = F->getRegInfo();
14321 const TargetRegisterClass *RC =
14322 is64bit ? &PPC::G8RCRegClass : &PPC::GPRCRegClass;
14323 const TargetRegisterClass *GPRC = &PPC::GPRCRegClass;
14324
14325 // Lambda to create virtual registers
14326 auto createVReg = [&](const TargetRegisterClass *RC) {
14327 return RegInfo.createVirtualRegister(RegClass: RC);
14328 };
14329
14330 Register PtrReg = createVReg(RC);
14331 Register Shift1Reg = createVReg(GPRC);
14332 Register ShiftReg = isLittleEndian ? Shift1Reg : createVReg(GPRC);
14333 Register NewVal2Reg = createVReg(GPRC);
14334 Register NewVal3Reg = createVReg(GPRC);
14335 Register OldVal2Reg = createVReg(GPRC);
14336 Register OldVal3Reg = createVReg(GPRC);
14337 Register MaskReg = createVReg(GPRC);
14338 Register Mask2Reg = createVReg(GPRC);
14339 Register Mask3Reg = createVReg(GPRC);
14340 Register Tmp2Reg = createVReg(GPRC);
14341 Register Tmp4Reg = createVReg(GPRC);
14342 Register TmpDestReg = createVReg(GPRC);
14343 Register TmpReg = createVReg(GPRC);
14344 Register ZeroReg = is64bit ? PPC::ZERO8 : PPC::ZERO;
14345 Register CrReg = createVReg(&PPC::CRRCRegClass);
14346
14347 // Compute aligned pointer and shift amount
14348 Register Ptr1Reg;
14349 if (ptrA != ZeroReg) {
14350 Ptr1Reg = createVReg(RC);
14351 BuildMI(BB, MIMD: dl, MCID: TII->get(Opcode: is64bit ? PPC::ADD8 : PPC::ADD4), DestReg: Ptr1Reg)
14352 .addReg(RegNo: ptrA)
14353 .addReg(RegNo: ptrB);
14354 } else {
14355 Ptr1Reg = ptrB;
14356 }
14357
14358 BuildMI(BB, MIMD: dl, MCID: TII->get(Opcode: PPC::RLWINM), DestReg: Shift1Reg)
14359 .addReg(RegNo: Ptr1Reg, Flags: {}, SubReg: is64bit ? PPC::sub_32 : 0)
14360 .addImm(Val: 3)
14361 .addImm(Val: 27)
14362 .addImm(Val: is8bit ? 28 : 27);
14363 if (!isLittleEndian)
14364 BuildMI(BB, MIMD: dl, MCID: TII->get(Opcode: PPC::XORI), DestReg: ShiftReg)
14365 .addReg(RegNo: Shift1Reg)
14366 .addImm(Val: is8bit ? 24 : 16);
14367 if (is64bit)
14368 BuildMI(BB, MIMD: dl, MCID: TII->get(Opcode: PPC::RLDICR), DestReg: PtrReg)
14369 .addReg(RegNo: Ptr1Reg)
14370 .addImm(Val: 0)
14371 .addImm(Val: 61);
14372 else
14373 BuildMI(BB, MIMD: dl, MCID: TII->get(Opcode: PPC::RLWINM), DestReg: PtrReg)
14374 .addReg(RegNo: Ptr1Reg)
14375 .addImm(Val: 0)
14376 .addImm(Val: 0)
14377 .addImm(Val: 29);
14378
14379 // Prepare masked values
14380 BuildMI(BB, MIMD: dl, MCID: TII->get(Opcode: PPC::SLW), DestReg: NewVal2Reg)
14381 .addReg(RegNo: newval)
14382 .addReg(RegNo: ShiftReg);
14383 BuildMI(BB, MIMD: dl, MCID: TII->get(Opcode: PPC::SLW), DestReg: OldVal2Reg)
14384 .addReg(RegNo: oldval)
14385 .addReg(RegNo: ShiftReg);
14386 if (is8bit)
14387 BuildMI(BB, MIMD: dl, MCID: TII->get(Opcode: PPC::LI), DestReg: Mask2Reg).addImm(Val: 255);
14388 else {
14389 BuildMI(BB, MIMD: dl, MCID: TII->get(Opcode: PPC::LI), DestReg: Mask3Reg).addImm(Val: 0);
14390 BuildMI(BB, MIMD: dl, MCID: TII->get(Opcode: PPC::ORI), DestReg: Mask2Reg)
14391 .addReg(RegNo: Mask3Reg)
14392 .addImm(Val: 65535);
14393 }
14394 BuildMI(BB, MIMD: dl, MCID: TII->get(Opcode: PPC::SLW), DestReg: MaskReg)
14395 .addReg(RegNo: Mask2Reg)
14396 .addReg(RegNo: ShiftReg);
14397 BuildMI(BB, MIMD: dl, MCID: TII->get(Opcode: PPC::AND), DestReg: NewVal3Reg)
14398 .addReg(RegNo: NewVal2Reg)
14399 .addReg(RegNo: MaskReg);
14400 BuildMI(BB, MIMD: dl, MCID: TII->get(Opcode: PPC::AND), DestReg: OldVal3Reg)
14401 .addReg(RegNo: OldVal2Reg)
14402 .addReg(RegNo: MaskReg);
14403
14404 // loop1MBB:
14405 // lwarx tmpDest, ptr
14406 // and tmp, tmpDest, mask
14407 // cmpw tmp, oldval3
14408 // bne- exitBB
14409 BB = loop1MBB;
14410 BuildMI(BB, MIMD: dl, MCID: TII->get(Opcode: PPC::LWARX), DestReg: TmpDestReg)
14411 .addReg(RegNo: ZeroReg)
14412 .addReg(RegNo: PtrReg);
14413 BuildMI(BB, MIMD: dl, MCID: TII->get(Opcode: PPC::AND), DestReg: TmpReg)
14414 .addReg(RegNo: TmpDestReg)
14415 .addReg(RegNo: MaskReg);
14416 BuildMI(BB, MIMD: dl, MCID: TII->get(Opcode: PPC::CMPW), DestReg: CrReg).addReg(RegNo: TmpReg).addReg(RegNo: OldVal3Reg);
14417 BuildMI(BB, MIMD: dl, MCID: TII->get(Opcode: PPC::BCC))
14418 .addImm(Val: PPC::PRED_NE)
14419 .addReg(RegNo: CrReg)
14420 .addMBB(MBB: exitMBB);
14421 BB->addSuccessor(Succ: loop2MBB);
14422 BB->addSuccessor(Succ: exitMBB);
14423
14424 // loop2MBB:
14425 // andc tmp2, tmpDest, mask
14426 // or tmp4, tmp2, newval3
14427 // stwcx. tmp4, ptr
14428 // bne- loop1MBB
14429 // b exitBB
14430 BB = loop2MBB;
14431 BuildMI(BB, MIMD: dl, MCID: TII->get(Opcode: PPC::ANDC), DestReg: Tmp2Reg)
14432 .addReg(RegNo: TmpDestReg)
14433 .addReg(RegNo: MaskReg);
14434 BuildMI(BB, MIMD: dl, MCID: TII->get(Opcode: PPC::OR), DestReg: Tmp4Reg)
14435 .addReg(RegNo: Tmp2Reg)
14436 .addReg(RegNo: NewVal3Reg);
14437 BuildMI(BB, MIMD: dl, MCID: TII->get(Opcode: PPC::STWCX))
14438 .addReg(RegNo: Tmp4Reg)
14439 .addReg(RegNo: ZeroReg)
14440 .addReg(RegNo: PtrReg);
14441 BuildMI(BB, MIMD: dl, MCID: TII->get(Opcode: PPC::BCC))
14442 .addImm(Val: PPC::PRED_NE)
14443 .addReg(RegNo: PPC::CR0)
14444 .addMBB(MBB: loop1MBB);
14445 BuildMI(BB, MIMD: dl, MCID: TII->get(Opcode: PPC::B)).addMBB(MBB: exitMBB);
14446 BB->addSuccessor(Succ: loop1MBB);
14447 BB->addSuccessor(Succ: exitMBB);
14448
14449 // exitMBB:
14450 // srw dest, tmpDest, shift
14451 BB = exitMBB;
14452 BuildMI(BB&: *BB, I: BB->begin(), MIMD: dl, MCID: TII->get(Opcode: PPC::SRW), DestReg: dest)
14453 .addReg(RegNo: TmpReg)
14454 .addReg(RegNo: ShiftReg);
14455
14456 return BB;
14457}
14458
14459MachineBasicBlock *
14460PPCTargetLowering::EmitInstrWithCustomInserter(MachineInstr &MI,
14461 MachineBasicBlock *BB) const {
14462 const TargetInstrInfo *TII = Subtarget.getInstrInfo();
14463
14464 // To "insert" these instructions we actually have to insert their
14465 // control-flow patterns.
14466 const BasicBlock *LLVM_BB = BB->getBasicBlock();
14467 MachineFunction::iterator It = ++BB->getIterator();
14468
14469 MachineFunction *F = BB->getParent();
14470 MachineRegisterInfo &MRI = F->getRegInfo();
14471
14472 // Handle SELECT with ISEL support first (before generic SELECT handling)
14473 if (IsSelect(Opcode: MI.getOpcode()))
14474 return emitSelect(MI, BB, TII, Subtarget);
14475
14476 switch (MI.getOpcode()) {
14477 case TargetOpcode::STACKMAP:
14478 return emitPatchPoint(MI, MBB: BB);
14479 case TargetOpcode::PATCHPOINT:
14480 // Call lowering should have added an r2 operand to indicate a dependence
14481 // on the TOC base pointer value. It can't however, because there is no
14482 // way to mark the dependence as implicit there, and so the stackmap code
14483 // will confuse it with a regular operand. Instead, add the dependence
14484 // here.
14485 if (Subtarget.is64BitELFABI() && !Subtarget.isUsingPCRelativeCalls())
14486 MI.addOperand(Op: MachineOperand::CreateReg(Reg: PPC::X2, isDef: false, isImp: true));
14487 return emitPatchPoint(MI, MBB: BB);
14488
14489 case PPC::EH_SjLj_SetJmp32:
14490 case PPC::EH_SjLj_SetJmp64:
14491 return emitEHSjLjSetJmp(MI, MBB: BB);
14492
14493 case PPC::EH_SjLj_LongJmp32:
14494 case PPC::EH_SjLj_LongJmp64:
14495 return emitEHSjLjLongJmp(MI, MBB: BB);
14496
14497 case PPC::ReadTB: {
14498 // To read the 64-bit time-base register on a 32-bit target, we read the
14499 // two halves. Should the counter have wrapped while it was being read, we
14500 // need to try again.
14501 // ...
14502 // readLoop:
14503 // mfspr Rx,TBU # load from TBU
14504 // mfspr Ry,TB # load from TB
14505 // mfspr Rz,TBU # load from TBU
14506 // cmpw crX,Rx,Rz # check if 'old'='new'
14507 // bne readLoop # branch if they're not equal
14508 // ...
14509
14510 MachineBasicBlock *readMBB = F->CreateMachineBasicBlock(BB: LLVM_BB);
14511 MachineBasicBlock *sinkMBB = F->CreateMachineBasicBlock(BB: LLVM_BB);
14512 DebugLoc dl = MI.getDebugLoc();
14513 F->insert(MBBI: It, MBB: readMBB);
14514 F->insert(MBBI: It, MBB: sinkMBB);
14515
14516 // Transfer the remainder of BB and its successor edges to sinkMBB.
14517 sinkMBB->splice(Where: sinkMBB->begin(), Other: BB,
14518 From: std::next(x: MachineBasicBlock::iterator(MI)), To: BB->end());
14519 sinkMBB->transferSuccessorsAndUpdatePHIs(FromMBB: BB);
14520
14521 BB->addSuccessor(Succ: readMBB);
14522 BB = readMBB;
14523
14524 MachineRegisterInfo &RegInfo = F->getRegInfo();
14525 Register ReadAgainReg = RegInfo.createVirtualRegister(RegClass: &PPC::GPRCRegClass);
14526 Register LoReg = MI.getOperand(i: 0).getReg();
14527 Register HiReg = MI.getOperand(i: 1).getReg();
14528
14529 BuildMI(BB, MIMD: dl, MCID: TII->get(Opcode: PPC::MFSPR), DestReg: HiReg).addImm(Val: 269);
14530 BuildMI(BB, MIMD: dl, MCID: TII->get(Opcode: PPC::MFSPR), DestReg: LoReg).addImm(Val: 268);
14531 BuildMI(BB, MIMD: dl, MCID: TII->get(Opcode: PPC::MFSPR), DestReg: ReadAgainReg).addImm(Val: 269);
14532
14533 Register CmpReg = RegInfo.createVirtualRegister(RegClass: &PPC::CRRCRegClass);
14534
14535 BuildMI(BB, MIMD: dl, MCID: TII->get(Opcode: PPC::CMPW), DestReg: CmpReg)
14536 .addReg(RegNo: HiReg)
14537 .addReg(RegNo: ReadAgainReg);
14538 BuildMI(BB, MIMD: dl, MCID: TII->get(Opcode: PPC::BCC))
14539 .addImm(Val: PPC::PRED_NE)
14540 .addReg(RegNo: CmpReg)
14541 .addMBB(MBB: readMBB);
14542
14543 BB->addSuccessor(Succ: readMBB);
14544 BB->addSuccessor(Succ: sinkMBB);
14545 break;
14546 }
14547 case PPC::ATOMIC_LOAD_ADD_NOWP:
14548 BB = EmitPartwordAtomicBinary(MI, BB, BinOpcode: PPC::ADD4);
14549 break;
14550 case PPC::ATOMIC_LOAD_ADD:
14551 BB = EmitAtomicBinary(MI, BB, BinOpcode: PPC::ADD4);
14552 break;
14553 case PPC::ATOMIC_LOAD_ADD_I64:
14554 BB = EmitAtomicBinary(MI, BB, BinOpcode: PPC::ADD8);
14555 break;
14556 case PPC::ATOMIC_LOAD_AND_NOWP:
14557 BB = EmitPartwordAtomicBinary(MI, BB, BinOpcode: PPC::AND);
14558 break;
14559 case PPC::ATOMIC_LOAD_AND:
14560 BB = EmitAtomicBinary(MI, BB, BinOpcode: PPC::AND);
14561 break;
14562 case PPC::ATOMIC_LOAD_AND_I64:
14563 BB = EmitAtomicBinary(MI, BB, BinOpcode: PPC::AND8);
14564 break;
14565 case PPC::ATOMIC_LOAD_OR_NOWP:
14566 BB = EmitPartwordAtomicBinary(MI, BB, BinOpcode: PPC::OR);
14567 break;
14568 case PPC::ATOMIC_LOAD_OR:
14569 BB = EmitAtomicBinary(MI, BB, BinOpcode: PPC::OR);
14570 break;
14571 case PPC::ATOMIC_LOAD_OR_I64:
14572 BB = EmitAtomicBinary(MI, BB, BinOpcode: PPC::OR8);
14573 break;
14574 case PPC::ATOMIC_LOAD_XOR_NOWP:
14575 BB = EmitPartwordAtomicBinary(MI, BB, BinOpcode: PPC::XOR);
14576 break;
14577 case PPC::ATOMIC_LOAD_XOR:
14578 BB = EmitAtomicBinary(MI, BB, BinOpcode: PPC::XOR);
14579 break;
14580 case PPC::ATOMIC_LOAD_XOR_I64:
14581 BB = EmitAtomicBinary(MI, BB, BinOpcode: PPC::XOR8);
14582 break;
14583 case PPC::ATOMIC_LOAD_NAND_NOWP:
14584 BB = EmitPartwordAtomicBinary(MI, BB, BinOpcode: PPC::NAND);
14585 break;
14586 case PPC::ATOMIC_LOAD_NAND:
14587 BB = EmitAtomicBinary(MI, BB, BinOpcode: PPC::NAND);
14588 break;
14589 case PPC::ATOMIC_LOAD_NAND_I64:
14590 BB = EmitAtomicBinary(MI, BB, BinOpcode: PPC::NAND8);
14591 break;
14592 case PPC::ATOMIC_LOAD_SUB_NOWP:
14593 BB = EmitPartwordAtomicBinary(MI, BB, BinOpcode: PPC::SUBF);
14594 break;
14595 case PPC::ATOMIC_LOAD_SUB:
14596 BB = EmitAtomicBinary(MI, BB, BinOpcode: PPC::SUBF);
14597 break;
14598 case PPC::ATOMIC_LOAD_SUB_I64:
14599 BB = EmitAtomicBinary(MI, BB, BinOpcode: PPC::SUBF8);
14600 break;
14601 case PPC::ATOMIC_LOAD_MIN_NOWP:
14602 BB = EmitPartwordAtomicBinary(MI, BB, BinOpcode: 0, CmpOpcode: PPC::CMPW, CmpPred: PPC::PRED_LT);
14603 break;
14604 case PPC::ATOMIC_LOAD_MIN:
14605 BB = EmitAtomicBinary(MI, BB, BinOpcode: 0, CmpOpcode: PPC::CMPW, CmpPred: PPC::PRED_LT);
14606 break;
14607 case PPC::ATOMIC_LOAD_MIN_I64:
14608 BB = EmitAtomicBinary(MI, BB, BinOpcode: 0, CmpOpcode: PPC::CMPD, CmpPred: PPC::PRED_LT);
14609 break;
14610 case PPC::ATOMIC_LOAD_MAX_NOWP:
14611 BB = EmitPartwordAtomicBinary(MI, BB, BinOpcode: 0, CmpOpcode: PPC::CMPW, CmpPred: PPC::PRED_GT);
14612 break;
14613 case PPC::ATOMIC_LOAD_MAX:
14614 BB = EmitAtomicBinary(MI, BB, BinOpcode: 0, CmpOpcode: PPC::CMPW, CmpPred: PPC::PRED_GT);
14615 break;
14616 case PPC::ATOMIC_LOAD_MAX_I64:
14617 BB = EmitAtomicBinary(MI, BB, BinOpcode: 0, CmpOpcode: PPC::CMPD, CmpPred: PPC::PRED_GT);
14618 break;
14619 case PPC::ATOMIC_LOAD_UMIN_NOWP:
14620 BB = EmitPartwordAtomicBinary(MI, BB, BinOpcode: 0, CmpOpcode: PPC::CMPLW, CmpPred: PPC::PRED_LT);
14621 break;
14622 case PPC::ATOMIC_LOAD_UMIN:
14623 BB = EmitAtomicBinary(MI, BB, BinOpcode: 0, CmpOpcode: PPC::CMPLW, CmpPred: PPC::PRED_LT);
14624 break;
14625 case PPC::ATOMIC_LOAD_UMIN_I64:
14626 BB = EmitAtomicBinary(MI, BB, BinOpcode: 0, CmpOpcode: PPC::CMPLD, CmpPred: PPC::PRED_LT);
14627 break;
14628 case PPC::ATOMIC_LOAD_UMAX_NOWP:
14629 BB = EmitPartwordAtomicBinary(MI, BB, BinOpcode: 0, CmpOpcode: PPC::CMPLW, CmpPred: PPC::PRED_GT);
14630 break;
14631 case PPC::ATOMIC_LOAD_UMAX:
14632 BB = EmitAtomicBinary(MI, BB, BinOpcode: 0, CmpOpcode: PPC::CMPLW, CmpPred: PPC::PRED_GT);
14633 break;
14634 case PPC::ATOMIC_LOAD_UMAX_I64:
14635 BB = EmitAtomicBinary(MI, BB, BinOpcode: 0, CmpOpcode: PPC::CMPLD, CmpPred: PPC::PRED_GT);
14636 break;
14637 case PPC::ATOMIC_SWAP_NOWP:
14638 BB = EmitPartwordAtomicBinary(MI, BB, BinOpcode: 0);
14639 break;
14640 case PPC::ATOMIC_SWAP:
14641 case PPC::ATOMIC_SWAP_I64:
14642 BB = EmitAtomicBinary(MI, BB, BinOpcode: 0);
14643 break;
14644 case PPC::ATOMIC_CMP_SWAP_I32:
14645 case PPC::ATOMIC_CMP_SWAP_I64:
14646 case PPC::ATOMIC_CMP_SWAP_I8:
14647 case PPC::ATOMIC_CMP_SWAP_I16: {
14648 // Use hardware-supported atomic operations if available
14649 bool useHardware = MI.getOpcode() == PPC::ATOMIC_CMP_SWAP_I32 ||
14650 MI.getOpcode() == PPC::ATOMIC_CMP_SWAP_I64 ||
14651 (Subtarget.hasPartwordAtomics() &&
14652 (MI.getOpcode() == PPC::ATOMIC_CMP_SWAP_I8 ||
14653 MI.getOpcode() == PPC::ATOMIC_CMP_SWAP_I16));
14654
14655 if (useHardware)
14656 BB = emitAtomicCmpSwapHardware(MI, BB, TII, Subtarget);
14657 else
14658 BB = emitAtomicCmpSwapSoftware(MI, BB, TII, Subtarget);
14659 break;
14660 }
14661 case PPC::FADDrtz: {
14662 // This pseudo performs an FADD with rounding mode temporarily forced
14663 // to round-to-zero. We emit this via custom inserter since the FPSCR
14664 // is not modeled at the SelectionDAG level.
14665 Register Dest = MI.getOperand(i: 0).getReg();
14666 Register Src1 = MI.getOperand(i: 1).getReg();
14667 Register Src2 = MI.getOperand(i: 2).getReg();
14668 DebugLoc dl = MI.getDebugLoc();
14669
14670 MachineRegisterInfo &RegInfo = F->getRegInfo();
14671 Register MFFSReg = RegInfo.createVirtualRegister(RegClass: &PPC::F8RCRegClass);
14672
14673 // Save FPSCR value.
14674 BuildMI(BB&: *BB, I&: MI, MIMD: dl, MCID: TII->get(Opcode: PPC::MFFS), DestReg: MFFSReg);
14675
14676 // Set rounding mode to round-to-zero.
14677 BuildMI(BB&: *BB, I&: MI, MIMD: dl, MCID: TII->get(Opcode: PPC::MTFSB1))
14678 .addImm(Val: 31)
14679 .addReg(RegNo: PPC::RM, Flags: RegState::ImplicitDefine);
14680
14681 BuildMI(BB&: *BB, I&: MI, MIMD: dl, MCID: TII->get(Opcode: PPC::MTFSB0))
14682 .addImm(Val: 30)
14683 .addReg(RegNo: PPC::RM, Flags: RegState::ImplicitDefine);
14684
14685 // Perform addition.
14686 auto MIB = BuildMI(BB&: *BB, I&: MI, MIMD: dl, MCID: TII->get(Opcode: PPC::FADD), DestReg: Dest)
14687 .addReg(RegNo: Src1)
14688 .addReg(RegNo: Src2);
14689 if (MI.getFlag(Flag: MachineInstr::NoFPExcept))
14690 MIB.setMIFlag(MachineInstr::NoFPExcept);
14691
14692 // Restore FPSCR value.
14693 BuildMI(BB&: *BB, I&: MI, MIMD: dl, MCID: TII->get(Opcode: PPC::MTFSFb)).addImm(Val: 1).addReg(RegNo: MFFSReg);
14694 break;
14695 }
14696 case PPC::ANDI_rec_1_EQ_BIT:
14697 case PPC::ANDI_rec_1_GT_BIT:
14698 case PPC::ANDI_rec_1_EQ_BIT8:
14699 case PPC::ANDI_rec_1_GT_BIT8: {
14700 unsigned Opcode = (MI.getOpcode() == PPC::ANDI_rec_1_EQ_BIT8 ||
14701 MI.getOpcode() == PPC::ANDI_rec_1_GT_BIT8)
14702 ? PPC::ANDI8_rec
14703 : PPC::ANDI_rec;
14704 bool IsEQ = (MI.getOpcode() == PPC::ANDI_rec_1_EQ_BIT ||
14705 MI.getOpcode() == PPC::ANDI_rec_1_EQ_BIT8);
14706
14707 MachineRegisterInfo &RegInfo = F->getRegInfo();
14708 Register Dest = RegInfo.createVirtualRegister(
14709 RegClass: Opcode == PPC::ANDI_rec ? &PPC::GPRCRegClass : &PPC::G8RCRegClass);
14710
14711 DebugLoc Dl = MI.getDebugLoc();
14712 BuildMI(BB&: *BB, I&: MI, MIMD: Dl, MCID: TII->get(Opcode), DestReg: Dest)
14713 .addReg(RegNo: MI.getOperand(i: 1).getReg())
14714 .addImm(Val: 1);
14715 BuildMI(BB&: *BB, I&: MI, MIMD: Dl, MCID: TII->get(Opcode: TargetOpcode::COPY),
14716 DestReg: MI.getOperand(i: 0).getReg())
14717 .addReg(RegNo: IsEQ ? PPC::CR0EQ : PPC::CR0GT);
14718 break;
14719 }
14720 case PPC::TCHECK_RET: {
14721 DebugLoc Dl = MI.getDebugLoc();
14722 MachineRegisterInfo &RegInfo = F->getRegInfo();
14723 Register CRReg = RegInfo.createVirtualRegister(RegClass: &PPC::CRRCRegClass);
14724 BuildMI(BB&: *BB, I&: MI, MIMD: Dl, MCID: TII->get(Opcode: PPC::TCHECK), DestReg: CRReg);
14725 BuildMI(BB&: *BB, I&: MI, MIMD: Dl, MCID: TII->get(Opcode: TargetOpcode::COPY),
14726 DestReg: MI.getOperand(i: 0).getReg())
14727 .addReg(RegNo: CRReg);
14728 break;
14729 }
14730 case PPC::TBEGIN_RET: {
14731 DebugLoc Dl = MI.getDebugLoc();
14732 unsigned Imm = MI.getOperand(i: 1).getImm();
14733 BuildMI(BB&: *BB, I&: MI, MIMD: Dl, MCID: TII->get(Opcode: PPC::TBEGIN)).addImm(Val: Imm);
14734 BuildMI(BB&: *BB, I&: MI, MIMD: Dl, MCID: TII->get(Opcode: TargetOpcode::COPY),
14735 DestReg: MI.getOperand(i: 0).getReg())
14736 .addReg(RegNo: PPC::CR0EQ);
14737 break;
14738 }
14739 case PPC::SETRNDi: {
14740 DebugLoc dl = MI.getDebugLoc();
14741 Register OldFPSCRReg = MI.getOperand(i: 0).getReg();
14742
14743 // Save FPSCR value.
14744 if (MRI.use_empty(RegNo: OldFPSCRReg))
14745 BuildMI(BB&: *BB, I&: MI, MIMD: dl, MCID: TII->get(Opcode: TargetOpcode::IMPLICIT_DEF), DestReg: OldFPSCRReg);
14746 else
14747 BuildMI(BB&: *BB, I&: MI, MIMD: dl, MCID: TII->get(Opcode: PPC::MFFS), DestReg: OldFPSCRReg);
14748
// The floating point rounding mode is in the bits 62:63 of FPSCR, and has
14750 // the following settings:
14751 // 00 Round to nearest
14752 // 01 Round to 0
14753 // 10 Round to +inf
14754 // 11 Round to -inf
14755
14756 // When the operand is immediate, using the two least significant bits of
14757 // the immediate to set the bits 62:63 of FPSCR.
14758 unsigned Mode = MI.getOperand(i: 1).getImm();
14759 BuildMI(BB&: *BB, I&: MI, MIMD: dl, MCID: TII->get(Opcode: (Mode & 1) ? PPC::MTFSB1 : PPC::MTFSB0))
14760 .addImm(Val: 31)
14761 .addReg(RegNo: PPC::RM, Flags: RegState::ImplicitDefine);
14762
14763 BuildMI(BB&: *BB, I&: MI, MIMD: dl, MCID: TII->get(Opcode: (Mode & 2) ? PPC::MTFSB1 : PPC::MTFSB0))
14764 .addImm(Val: 30)
14765 .addReg(RegNo: PPC::RM, Flags: RegState::ImplicitDefine);
14766 break;
14767 }
14768 case PPC::SETRND: {
14769 DebugLoc dl = MI.getDebugLoc();
14770
14771 // Copy register from F8RCRegClass::SrcReg to G8RCRegClass::DestReg
14772 // or copy register from G8RCRegClass::SrcReg to F8RCRegClass::DestReg.
14773 // If the target doesn't have DirectMove, we should use stack to do the
14774 // conversion, because the target doesn't have the instructions like mtvsrd
14775 // or mfvsrd to do this conversion directly.
14776 auto copyRegFromG8RCOrF8RC = [&] (unsigned DestReg, unsigned SrcReg) {
14777 if (Subtarget.hasDirectMove()) {
14778 BuildMI(BB&: *BB, I&: MI, MIMD: dl, MCID: TII->get(Opcode: TargetOpcode::COPY), DestReg)
14779 .addReg(RegNo: SrcReg);
14780 } else {
14781 // Use stack to do the register copy.
14782 unsigned StoreOp = PPC::STD, LoadOp = PPC::LFD;
14783 MachineRegisterInfo &RegInfo = F->getRegInfo();
14784 const TargetRegisterClass *RC = RegInfo.getRegClass(Reg: SrcReg);
14785 if (RC == &PPC::F8RCRegClass) {
14786 // Copy register from F8RCRegClass to G8RCRegclass.
14787 assert((RegInfo.getRegClass(DestReg) == &PPC::G8RCRegClass) &&
14788 "Unsupported RegClass.");
14789
14790 StoreOp = PPC::STFD;
14791 LoadOp = PPC::LD;
14792 } else {
14793 // Copy register from G8RCRegClass to F8RCRegclass.
14794 assert((RegInfo.getRegClass(SrcReg) == &PPC::G8RCRegClass) &&
14795 (RegInfo.getRegClass(DestReg) == &PPC::F8RCRegClass) &&
14796 "Unsupported RegClass.");
14797 }
14798
14799 MachineFrameInfo &MFI = F->getFrameInfo();
14800 int FrameIdx = MFI.CreateStackObject(Size: 8, Alignment: Align(8), isSpillSlot: false);
14801
14802 MachineMemOperand *MMOStore = F->getMachineMemOperand(
14803 PtrInfo: MachinePointerInfo::getFixedStack(MF&: *F, FI: FrameIdx, Offset: 0),
14804 F: MachineMemOperand::MOStore, Size: MFI.getObjectSize(ObjectIdx: FrameIdx),
14805 BaseAlignment: MFI.getObjectAlign(ObjectIdx: FrameIdx));
14806
14807 // Store the SrcReg into the stack.
14808 BuildMI(BB&: *BB, I&: MI, MIMD: dl, MCID: TII->get(Opcode: StoreOp))
14809 .addReg(RegNo: SrcReg)
14810 .addImm(Val: 0)
14811 .addFrameIndex(Idx: FrameIdx)
14812 .addMemOperand(MMO: MMOStore);
14813
14814 MachineMemOperand *MMOLoad = F->getMachineMemOperand(
14815 PtrInfo: MachinePointerInfo::getFixedStack(MF&: *F, FI: FrameIdx, Offset: 0),
14816 F: MachineMemOperand::MOLoad, Size: MFI.getObjectSize(ObjectIdx: FrameIdx),
14817 BaseAlignment: MFI.getObjectAlign(ObjectIdx: FrameIdx));
14818
14819 // Load from the stack where SrcReg is stored, and save to DestReg,
14820 // so we have done the RegClass conversion from RegClass::SrcReg to
14821 // RegClass::DestReg.
14822 BuildMI(BB&: *BB, I&: MI, MIMD: dl, MCID: TII->get(Opcode: LoadOp), DestReg)
14823 .addImm(Val: 0)
14824 .addFrameIndex(Idx: FrameIdx)
14825 .addMemOperand(MMO: MMOLoad);
14826 }
14827 };
14828
14829 Register OldFPSCRReg = MI.getOperand(i: 0).getReg();
14830
14831 // Save FPSCR value.
14832 BuildMI(BB&: *BB, I&: MI, MIMD: dl, MCID: TII->get(Opcode: PPC::MFFS), DestReg: OldFPSCRReg);
14833
14834 // When the operand is gprc register, use two least significant bits of the
14835 // register and mtfsf instruction to set the bits 62:63 of FPSCR.
14836 //
14837 // copy OldFPSCRTmpReg, OldFPSCRReg
14838 // (INSERT_SUBREG ExtSrcReg, (IMPLICIT_DEF ImDefReg), SrcOp, 1)
14839 // rldimi NewFPSCRTmpReg, ExtSrcReg, OldFPSCRReg, 0, 62
14840 // copy NewFPSCRReg, NewFPSCRTmpReg
14841 // mtfsf 255, NewFPSCRReg
14842 MachineOperand SrcOp = MI.getOperand(i: 1);
14843 MachineRegisterInfo &RegInfo = F->getRegInfo();
14844 Register OldFPSCRTmpReg = RegInfo.createVirtualRegister(RegClass: &PPC::G8RCRegClass);
14845
14846 copyRegFromG8RCOrF8RC(OldFPSCRTmpReg, OldFPSCRReg);
14847
14848 Register ImDefReg = RegInfo.createVirtualRegister(RegClass: &PPC::G8RCRegClass);
14849 Register ExtSrcReg = RegInfo.createVirtualRegister(RegClass: &PPC::G8RCRegClass);
14850
14851 // The first operand of INSERT_SUBREG should be a register which has
14852 // subregisters, we only care about its RegClass, so we should use an
14853 // IMPLICIT_DEF register.
14854 BuildMI(BB&: *BB, I&: MI, MIMD: dl, MCID: TII->get(Opcode: TargetOpcode::IMPLICIT_DEF), DestReg: ImDefReg);
14855 BuildMI(BB&: *BB, I&: MI, MIMD: dl, MCID: TII->get(Opcode: PPC::INSERT_SUBREG), DestReg: ExtSrcReg)
14856 .addReg(RegNo: ImDefReg)
14857 .add(MO: SrcOp)
14858 .addImm(Val: 1);
14859
14860 Register NewFPSCRTmpReg = RegInfo.createVirtualRegister(RegClass: &PPC::G8RCRegClass);
14861 BuildMI(BB&: *BB, I&: MI, MIMD: dl, MCID: TII->get(Opcode: PPC::RLDIMI), DestReg: NewFPSCRTmpReg)
14862 .addReg(RegNo: OldFPSCRTmpReg)
14863 .addReg(RegNo: ExtSrcReg)
14864 .addImm(Val: 0)
14865 .addImm(Val: 62);
14866
14867 Register NewFPSCRReg = RegInfo.createVirtualRegister(RegClass: &PPC::F8RCRegClass);
14868 copyRegFromG8RCOrF8RC(NewFPSCRReg, NewFPSCRTmpReg);
14869
14870 // The mask 255 means that put the 32:63 bits of NewFPSCRReg to the 32:63
14871 // bits of FPSCR.
14872 BuildMI(BB&: *BB, I&: MI, MIMD: dl, MCID: TII->get(Opcode: PPC::MTFSF))
14873 .addImm(Val: 255)
14874 .addReg(RegNo: NewFPSCRReg)
14875 .addImm(Val: 0)
14876 .addImm(Val: 0);
14877 break;
14878 }
14879 case PPC::SETFLM: {
14880 DebugLoc Dl = MI.getDebugLoc();
14881
14882 // Result of setflm is previous FPSCR content, so we need to save it first.
14883 Register OldFPSCRReg = MI.getOperand(i: 0).getReg();
14884 if (MRI.use_empty(RegNo: OldFPSCRReg))
14885 BuildMI(BB&: *BB, I&: MI, MIMD: Dl, MCID: TII->get(Opcode: TargetOpcode::IMPLICIT_DEF), DestReg: OldFPSCRReg);
14886 else
14887 BuildMI(BB&: *BB, I&: MI, MIMD: Dl, MCID: TII->get(Opcode: PPC::MFFS), DestReg: OldFPSCRReg);
14888
14889 // Put bits in 32:63 to FPSCR.
14890 Register NewFPSCRReg = MI.getOperand(i: 1).getReg();
14891 BuildMI(BB&: *BB, I&: MI, MIMD: Dl, MCID: TII->get(Opcode: PPC::MTFSF))
14892 .addImm(Val: 255)
14893 .addReg(RegNo: NewFPSCRReg)
14894 .addImm(Val: 0)
14895 .addImm(Val: 0);
14896 break;
14897 }
14898 case PPC::PROBED_ALLOCA_32:
14899 case PPC::PROBED_ALLOCA_64:
14900 return emitProbedAlloca(MI, MBB: BB);
14901
14902 case PPC::SPLIT_QUADWORD: {
14903 DebugLoc DL = MI.getDebugLoc();
14904 Register Src = MI.getOperand(i: 2).getReg();
14905 Register Lo = MI.getOperand(i: 0).getReg();
14906 Register Hi = MI.getOperand(i: 1).getReg();
14907 BuildMI(BB&: *BB, I&: MI, MIMD: DL, MCID: TII->get(Opcode: TargetOpcode::COPY))
14908 .addDef(RegNo: Lo)
14909 .addUse(RegNo: Src, Flags: {}, SubReg: PPC::sub_gp8_x1);
14910 BuildMI(BB&: *BB, I&: MI, MIMD: DL, MCID: TII->get(Opcode: TargetOpcode::COPY))
14911 .addDef(RegNo: Hi)
14912 .addUse(RegNo: Src, Flags: {}, SubReg: PPC::sub_gp8_x0);
14913 break;
14914 }
14915 case PPC::LQX_PSEUDO:
14916 case PPC::STQX_PSEUDO: {
14917 DebugLoc DL = MI.getDebugLoc();
14918 // Ptr is used as the ptr_rc_no_r0 part
14919 // of LQ/STQ's memory operand and adding result of RA and RB,
14920 // so it has to be g8rc_and_g8rc_nox0.
14921 Register Ptr =
14922 F->getRegInfo().createVirtualRegister(RegClass: &PPC::G8RC_and_G8RC_NOX0RegClass);
14923 Register Val = MI.getOperand(i: 0).getReg();
14924 Register RA = MI.getOperand(i: 1).getReg();
14925 Register RB = MI.getOperand(i: 2).getReg();
14926 BuildMI(BB&: *BB, I&: MI, MIMD: DL, MCID: TII->get(Opcode: PPC::ADD8), DestReg: Ptr).addReg(RegNo: RA).addReg(RegNo: RB);
14927 BuildMI(BB&: *BB, I&: MI, MIMD: DL,
14928 MCID: MI.getOpcode() == PPC::LQX_PSEUDO ? TII->get(Opcode: PPC::LQ)
14929 : TII->get(Opcode: PPC::STQ))
14930 .addReg(RegNo: Val, Flags: getDefRegState(B: MI.getOpcode() == PPC::LQX_PSEUDO))
14931 .addImm(Val: 0)
14932 .addReg(RegNo: Ptr);
14933 break;
14934 }
14935 default:
14936 llvm_unreachable("Unexpected instr type to insert");
14937 }
14938
14939 MI.eraseFromParent(); // The pseudo instruction is gone now.
14940 return BB;
14941}
14942
14943//===----------------------------------------------------------------------===//
14944// Target Optimization Hooks
14945//===----------------------------------------------------------------------===//
14946
14947static int getEstimateRefinementSteps(EVT VT, const PPCSubtarget &Subtarget) {
14948 // For the estimates, convergence is quadratic, so we essentially double the
14949 // number of digits correct after every iteration. For both FRE and FRSQRTE,
14950 // the minimum architected relative accuracy is 2^-5. When hasRecipPrec(),
14951 // this is 2^-14. IEEE float has 23 digits and double has 52 digits.
14952 int RefinementSteps = Subtarget.hasRecipPrec() ? 1 : 3;
14953 if (VT.getScalarType() == MVT::f64)
14954 RefinementSteps++;
14955 return RefinementSteps;
14956}
14957
14958SDValue PPCTargetLowering::getSqrtInputTest(SDValue Op, SelectionDAG &DAG,
14959 const DenormalMode &Mode,
14960 SDNodeFlags Flags) const {
14961 // We only have VSX Vector Test for software Square Root.
14962 EVT VT = Op.getValueType();
14963 if (!isTypeLegal(VT: MVT::i1) ||
14964 (VT != MVT::f64 &&
14965 ((VT != MVT::v2f64 && VT != MVT::v4f32) || !Subtarget.hasVSX())))
14966 return TargetLowering::getSqrtInputTest(Operand: Op, DAG, Mode, Flags);
14967
14968 SDLoc DL(Op);
14969 // The output register of FTSQRT is CR field.
14970 SDValue FTSQRT = DAG.getNode(Opcode: PPCISD::FTSQRT, DL, VT: MVT::i32, Operand: Op, Flags);
14971 // ftsqrt BF,FRB
14972 // Let e_b be the unbiased exponent of the double-precision
14973 // floating-point operand in register FRB.
14974 // fe_flag is set to 1 if either of the following conditions occurs.
14975 // - The double-precision floating-point operand in register FRB is a zero,
14976 // a NaN, or an infinity, or a negative value.
14977 // - e_b is less than or equal to -970.
14978 // Otherwise fe_flag is set to 0.
14979 // Both VSX and non-VSX versions would set EQ bit in the CR if the number is
14980 // not eligible for iteration. (zero/negative/infinity/nan or unbiased
14981 // exponent is less than -970)
14982 SDValue SRIdxVal = DAG.getTargetConstant(Val: PPC::sub_eq, DL, VT: MVT::i32);
14983 return SDValue(DAG.getMachineNode(Opcode: TargetOpcode::EXTRACT_SUBREG, dl: DL, VT: MVT::i1,
14984 Op1: FTSQRT, Op2: SRIdxVal),
14985 0);
14986}
14987
14988SDValue
14989PPCTargetLowering::getSqrtResultForDenormInput(SDValue Op,
14990 SelectionDAG &DAG) const {
14991 // We only have VSX Vector Square Root.
14992 EVT VT = Op.getValueType();
14993 if (VT != MVT::f64 &&
14994 ((VT != MVT::v2f64 && VT != MVT::v4f32) || !Subtarget.hasVSX()))
14995 return TargetLowering::getSqrtResultForDenormInput(Operand: Op, DAG);
14996
14997 return DAG.getNode(Opcode: PPCISD::FSQRT, DL: SDLoc(Op), VT, Operand: Op);
14998}
14999
15000SDValue PPCTargetLowering::getSqrtEstimate(SDValue Operand, SelectionDAG &DAG,
15001 int Enabled, int &RefinementSteps,
15002 bool &UseOneConstNR,
15003 bool Reciprocal) const {
15004 EVT VT = Operand.getValueType();
15005 if ((VT == MVT::f32 && Subtarget.hasFRSQRTES()) ||
15006 (VT == MVT::f64 && Subtarget.hasFRSQRTE()) ||
15007 (VT == MVT::v4f32 && Subtarget.hasAltivec()) ||
15008 (VT == MVT::v2f64 && Subtarget.hasVSX())) {
15009 if (RefinementSteps == ReciprocalEstimate::Unspecified)
15010 RefinementSteps = getEstimateRefinementSteps(VT, Subtarget);
15011
15012 // The Newton-Raphson computation with a single constant does not provide
15013 // enough accuracy on some CPUs.
15014 UseOneConstNR = !Subtarget.needsTwoConstNR();
15015 return DAG.getNode(Opcode: PPCISD::FRSQRTE, DL: SDLoc(Operand), VT, Operand);
15016 }
15017 return SDValue();
15018}
15019
15020SDValue PPCTargetLowering::getRecipEstimate(SDValue Operand, SelectionDAG &DAG,
15021 int Enabled,
15022 int &RefinementSteps) const {
15023 EVT VT = Operand.getValueType();
15024 if ((VT == MVT::f32 && Subtarget.hasFRES()) ||
15025 (VT == MVT::f64 && Subtarget.hasFRE()) ||
15026 (VT == MVT::v4f32 && Subtarget.hasAltivec()) ||
15027 (VT == MVT::v2f64 && Subtarget.hasVSX())) {
15028 if (RefinementSteps == ReciprocalEstimate::Unspecified)
15029 RefinementSteps = getEstimateRefinementSteps(VT, Subtarget);
15030 return DAG.getNode(Opcode: PPCISD::FRE, DL: SDLoc(Operand), VT, Operand);
15031 }
15032 return SDValue();
15033}
15034
15035unsigned PPCTargetLowering::combineRepeatedFPDivisors() const {
15036 // Note: This functionality is used only when arcp is enabled, and
15037 // on cores with reciprocal estimates (which are used when arcp is
15038 // enabled for division), this functionality is redundant with the default
15039 // combiner logic (once the division -> reciprocal/multiply transformation
15040 // has taken place). As a result, this matters more for older cores than for
15041 // newer ones.
15042
15043 // Combine multiple FDIVs with the same divisor into multiple FMULs by the
15044 // reciprocal if there are two or more FDIVs (for embedded cores with only
15045 // one FP pipeline) for three or more FDIVs (for generic OOO cores).
15046 switch (Subtarget.getCPUDirective()) {
15047 default:
15048 return 3;
15049 case PPC::DIR_440:
15050 case PPC::DIR_A2:
15051 case PPC::DIR_E500:
15052 case PPC::DIR_E500mc:
15053 case PPC::DIR_E5500:
15054 return 2;
15055 }
15056}
15057
15058// isConsecutiveLSLoc needs to work even if all adds have not yet been
15059// collapsed, and so we need to look through chains of them.
15060static void getBaseWithConstantOffset(SDValue Loc, SDValue &Base,
15061 int64_t& Offset, SelectionDAG &DAG) {
15062 if (DAG.isBaseWithConstantOffset(Op: Loc)) {
15063 Base = Loc.getOperand(i: 0);
15064 Offset += cast<ConstantSDNode>(Val: Loc.getOperand(i: 1))->getSExtValue();
15065
15066 // The base might itself be a base plus an offset, and if so, accumulate
15067 // that as well.
15068 getBaseWithConstantOffset(Loc: Loc.getOperand(i: 0), Base, Offset, DAG);
15069 }
15070}
15071
15072static bool isConsecutiveLSLoc(SDValue Loc, EVT VT, LSBaseSDNode *Base,
15073 unsigned Bytes, int Dist,
15074 SelectionDAG &DAG) {
15075 if (VT.getSizeInBits() / 8 != Bytes)
15076 return false;
15077
15078 SDValue BaseLoc = Base->getBasePtr();
15079 if (Loc.getOpcode() == ISD::FrameIndex) {
15080 if (BaseLoc.getOpcode() != ISD::FrameIndex)
15081 return false;
15082 const MachineFrameInfo &MFI = DAG.getMachineFunction().getFrameInfo();
15083 int FI = cast<FrameIndexSDNode>(Val&: Loc)->getIndex();
15084 int BFI = cast<FrameIndexSDNode>(Val&: BaseLoc)->getIndex();
15085 int FS = MFI.getObjectSize(ObjectIdx: FI);
15086 int BFS = MFI.getObjectSize(ObjectIdx: BFI);
15087 if (FS != BFS || FS != (int)Bytes) return false;
15088 return MFI.getObjectOffset(ObjectIdx: FI) == (MFI.getObjectOffset(ObjectIdx: BFI) + Dist*Bytes);
15089 }
15090
15091 SDValue Base1 = Loc, Base2 = BaseLoc;
15092 int64_t Offset1 = 0, Offset2 = 0;
15093 getBaseWithConstantOffset(Loc, Base&: Base1, Offset&: Offset1, DAG);
15094 getBaseWithConstantOffset(Loc: BaseLoc, Base&: Base2, Offset&: Offset2, DAG);
15095 if (Base1 == Base2 && Offset1 == (Offset2 + Dist * Bytes))
15096 return true;
15097
15098 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
15099 const GlobalValue *GV1 = nullptr;
15100 const GlobalValue *GV2 = nullptr;
15101 Offset1 = 0;
15102 Offset2 = 0;
15103 bool isGA1 = TLI.isGAPlusOffset(N: Loc.getNode(), GA&: GV1, Offset&: Offset1);
15104 bool isGA2 = TLI.isGAPlusOffset(N: BaseLoc.getNode(), GA&: GV2, Offset&: Offset2);
15105 if (isGA1 && isGA2 && GV1 == GV2)
15106 return Offset1 == (Offset2 + Dist*Bytes);
15107 return false;
15108}
15109
15110// Like SelectionDAG::isConsecutiveLoad, but also works for stores, and does
15111// not enforce equality of the chain operands.
15112static bool isConsecutiveLS(SDNode *N, LSBaseSDNode *Base,
15113 unsigned Bytes, int Dist,
15114 SelectionDAG &DAG) {
15115 if (LSBaseSDNode *LS = dyn_cast<LSBaseSDNode>(Val: N)) {
15116 EVT VT = LS->getMemoryVT();
15117 SDValue Loc = LS->getBasePtr();
15118 return isConsecutiveLSLoc(Loc, VT, Base, Bytes, Dist, DAG);
15119 }
15120
15121 if (N->getOpcode() == ISD::INTRINSIC_W_CHAIN) {
15122 EVT VT;
15123 switch (N->getConstantOperandVal(Num: 1)) {
15124 default: return false;
15125 case Intrinsic::ppc_altivec_lvx:
15126 case Intrinsic::ppc_altivec_lvxl:
15127 case Intrinsic::ppc_vsx_lxvw4x:
15128 case Intrinsic::ppc_vsx_lxvw4x_be:
15129 VT = MVT::v4i32;
15130 break;
15131 case Intrinsic::ppc_vsx_lxvd2x:
15132 case Intrinsic::ppc_vsx_lxvd2x_be:
15133 VT = MVT::v2f64;
15134 break;
15135 case Intrinsic::ppc_altivec_lvebx:
15136 VT = MVT::i8;
15137 break;
15138 case Intrinsic::ppc_altivec_lvehx:
15139 VT = MVT::i16;
15140 break;
15141 case Intrinsic::ppc_altivec_lvewx:
15142 VT = MVT::i32;
15143 break;
15144 }
15145
15146 return isConsecutiveLSLoc(Loc: N->getOperand(Num: 2), VT, Base, Bytes, Dist, DAG);
15147 }
15148
15149 if (N->getOpcode() == ISD::INTRINSIC_VOID) {
15150 EVT VT;
15151 switch (N->getConstantOperandVal(Num: 1)) {
15152 default: return false;
15153 case Intrinsic::ppc_altivec_stvx:
15154 case Intrinsic::ppc_altivec_stvxl:
15155 case Intrinsic::ppc_vsx_stxvw4x:
15156 VT = MVT::v4i32;
15157 break;
15158 case Intrinsic::ppc_vsx_stxvd2x:
15159 VT = MVT::v2f64;
15160 break;
15161 case Intrinsic::ppc_vsx_stxvw4x_be:
15162 VT = MVT::v4i32;
15163 break;
15164 case Intrinsic::ppc_vsx_stxvd2x_be:
15165 VT = MVT::v2f64;
15166 break;
15167 case Intrinsic::ppc_altivec_stvebx:
15168 VT = MVT::i8;
15169 break;
15170 case Intrinsic::ppc_altivec_stvehx:
15171 VT = MVT::i16;
15172 break;
15173 case Intrinsic::ppc_altivec_stvewx:
15174 VT = MVT::i32;
15175 break;
15176 }
15177
15178 return isConsecutiveLSLoc(Loc: N->getOperand(Num: 3), VT, Base, Bytes, Dist, DAG);
15179 }
15180
15181 return false;
15182}
15183
// Return true if there is a nearby consecutive load to the one provided
// (regardless of alignment). We search up and down the chain, looking through
// token factors and other loads (but nothing else). As a result, a true result
// indicates that it is safe to create a new consecutive load adjacent to the
// load provided.
15189static bool findConsecutiveLoad(LoadSDNode *LD, SelectionDAG &DAG) {
15190 SDValue Chain = LD->getChain();
15191 EVT VT = LD->getMemoryVT();
15192
15193 SmallPtrSet<SDNode *, 16> LoadRoots;
15194 SmallVector<SDNode *, 8> Queue(1, Chain.getNode());
15195 SmallPtrSet<SDNode *, 16> Visited;
15196
15197 // First, search up the chain, branching to follow all token-factor operands.
15198 // If we find a consecutive load, then we're done, otherwise, record all
15199 // nodes just above the top-level loads and token factors.
15200 while (!Queue.empty()) {
15201 SDNode *ChainNext = Queue.pop_back_val();
15202 if (!Visited.insert(Ptr: ChainNext).second)
15203 continue;
15204
15205 if (MemSDNode *ChainLD = dyn_cast<MemSDNode>(Val: ChainNext)) {
15206 if (isConsecutiveLS(N: ChainLD, Base: LD, Bytes: VT.getStoreSize(), Dist: 1, DAG))
15207 return true;
15208
15209 if (!Visited.count(Ptr: ChainLD->getChain().getNode()))
15210 Queue.push_back(Elt: ChainLD->getChain().getNode());
15211 } else if (ChainNext->getOpcode() == ISD::TokenFactor) {
15212 for (const SDUse &O : ChainNext->ops())
15213 if (!Visited.count(Ptr: O.getNode()))
15214 Queue.push_back(Elt: O.getNode());
15215 } else
15216 LoadRoots.insert(Ptr: ChainNext);
15217 }
15218
15219 // Second, search down the chain, starting from the top-level nodes recorded
15220 // in the first phase. These top-level nodes are the nodes just above all
15221 // loads and token factors. Starting with their uses, recursively look though
15222 // all loads (just the chain uses) and token factors to find a consecutive
15223 // load.
15224 Visited.clear();
15225 Queue.clear();
15226
15227 for (SDNode *I : LoadRoots) {
15228 Queue.push_back(Elt: I);
15229
15230 while (!Queue.empty()) {
15231 SDNode *LoadRoot = Queue.pop_back_val();
15232 if (!Visited.insert(Ptr: LoadRoot).second)
15233 continue;
15234
15235 if (MemSDNode *ChainLD = dyn_cast<MemSDNode>(Val: LoadRoot))
15236 if (isConsecutiveLS(N: ChainLD, Base: LD, Bytes: VT.getStoreSize(), Dist: 1, DAG))
15237 return true;
15238
15239 for (SDNode *U : LoadRoot->users())
15240 if (((isa<MemSDNode>(Val: U) &&
15241 cast<MemSDNode>(Val: U)->getChain().getNode() == LoadRoot) ||
15242 U->getOpcode() == ISD::TokenFactor) &&
15243 !Visited.count(Ptr: U))
15244 Queue.push_back(Elt: U);
15245 }
15246 }
15247
15248 return false;
15249}
15250
15251/// This function is called when we have proved that a SETCC node can be replaced
15252/// by subtraction (and other supporting instructions) so that the result of
15253/// comparison is kept in a GPR instead of CR. This function is purely for
15254/// codegen purposes and has some flags to guide the codegen process.
15255static SDValue generateEquivalentSub(SDNode *N, int Size, bool Complement,
15256 bool Swap, SDLoc &DL, SelectionDAG &DAG) {
15257 assert(N->getOpcode() == ISD::SETCC && "ISD::SETCC Expected.");
15258
15259 // Zero extend the operands to the largest legal integer. Originally, they
15260 // must be of a strictly smaller size.
15261 auto Op0 = DAG.getNode(Opcode: ISD::ZERO_EXTEND, DL, VT: MVT::i64, N1: N->getOperand(Num: 0),
15262 N2: DAG.getConstant(Val: Size, DL, VT: MVT::i32));
15263 auto Op1 = DAG.getNode(Opcode: ISD::ZERO_EXTEND, DL, VT: MVT::i64, N1: N->getOperand(Num: 1),
15264 N2: DAG.getConstant(Val: Size, DL, VT: MVT::i32));
15265
15266 // Swap if needed. Depends on the condition code.
15267 if (Swap)
15268 std::swap(a&: Op0, b&: Op1);
15269
15270 // Subtract extended integers.
15271 auto SubNode = DAG.getNode(Opcode: ISD::SUB, DL, VT: MVT::i64, N1: Op0, N2: Op1);
15272
15273 // Move the sign bit to the least significant position and zero out the rest.
15274 // Now the least significant bit carries the result of original comparison.
15275 auto Shifted = DAG.getNode(Opcode: ISD::SRL, DL, VT: MVT::i64, N1: SubNode,
15276 N2: DAG.getConstant(Val: Size - 1, DL, VT: MVT::i32));
15277 auto Final = Shifted;
15278
15279 // Complement the result if needed. Based on the condition code.
15280 if (Complement)
15281 Final = DAG.getNode(Opcode: ISD::XOR, DL, VT: MVT::i64, N1: Shifted,
15282 N2: DAG.getConstant(Val: 1, DL, VT: MVT::i64));
15283
15284 return DAG.getNode(Opcode: ISD::TRUNCATE, DL, VT: MVT::i1, Operand: Final);
15285}
15286
15287SDValue PPCTargetLowering::ConvertSETCCToSubtract(SDNode *N,
15288 DAGCombinerInfo &DCI) const {
15289 assert(N->getOpcode() == ISD::SETCC && "ISD::SETCC Expected.");
15290
15291 SelectionDAG &DAG = DCI.DAG;
15292 SDLoc DL(N);
15293
15294 // Size of integers being compared has a critical role in the following
15295 // analysis, so we prefer to do this when all types are legal.
15296 if (!DCI.isAfterLegalizeDAG())
15297 return SDValue();
15298
15299 // If all users of SETCC extend its value to a legal integer type
15300 // then we replace SETCC with a subtraction
15301 for (const SDNode *U : N->users())
15302 if (U->getOpcode() != ISD::ZERO_EXTEND)
15303 return SDValue();
15304
15305 ISD::CondCode CC = cast<CondCodeSDNode>(Val: N->getOperand(Num: 2))->get();
15306 auto OpSize = N->getOperand(Num: 0).getValueSizeInBits();
15307
15308 unsigned Size = DAG.getDataLayout().getLargestLegalIntTypeSizeInBits();
15309
15310 if (OpSize < Size) {
15311 switch (CC) {
15312 default: break;
15313 case ISD::SETULT:
15314 return generateEquivalentSub(N, Size, Complement: false, Swap: false, DL, DAG);
15315 case ISD::SETULE:
15316 return generateEquivalentSub(N, Size, Complement: true, Swap: true, DL, DAG);
15317 case ISD::SETUGT:
15318 return generateEquivalentSub(N, Size, Complement: false, Swap: true, DL, DAG);
15319 case ISD::SETUGE:
15320 return generateEquivalentSub(N, Size, Complement: true, Swap: false, DL, DAG);
15321 }
15322 }
15323
15324 return SDValue();
15325}
15326
15327SDValue PPCTargetLowering::DAGCombineTruncBoolExt(SDNode *N,
15328 DAGCombinerInfo &DCI) const {
15329 SelectionDAG &DAG = DCI.DAG;
15330 SDLoc dl(N);
15331
15332 assert(Subtarget.useCRBits() && "Expecting to be tracking CR bits");
15333 // If we're tracking CR bits, we need to be careful that we don't have:
15334 // trunc(binary-ops(zext(x), zext(y)))
15335 // or
15336 // trunc(binary-ops(binary-ops(zext(x), zext(y)), ...)
15337 // such that we're unnecessarily moving things into GPRs when it would be
15338 // better to keep them in CR bits.
15339
15340 // Note that trunc here can be an actual i1 trunc, or can be the effective
15341 // truncation that comes from a setcc or select_cc.
15342 if (N->getOpcode() == ISD::TRUNCATE &&
15343 N->getValueType(ResNo: 0) != MVT::i1)
15344 return SDValue();
15345
15346 if (N->getOperand(Num: 0).getValueType() != MVT::i32 &&
15347 N->getOperand(Num: 0).getValueType() != MVT::i64)
15348 return SDValue();
15349
15350 if (N->getOpcode() == ISD::SETCC ||
15351 N->getOpcode() == ISD::SELECT_CC) {
15352 // If we're looking at a comparison, then we need to make sure that the
15353 // high bits (all except for the first) don't matter the result.
15354 ISD::CondCode CC =
15355 cast<CondCodeSDNode>(Val: N->getOperand(
15356 Num: N->getOpcode() == ISD::SETCC ? 2 : 4))->get();
15357 unsigned OpBits = N->getOperand(Num: 0).getValueSizeInBits();
15358
15359 if (ISD::isSignedIntSetCC(Code: CC)) {
15360 if (DAG.ComputeNumSignBits(Op: N->getOperand(Num: 0)) != OpBits ||
15361 DAG.ComputeNumSignBits(Op: N->getOperand(Num: 1)) != OpBits)
15362 return SDValue();
15363 } else if (ISD::isUnsignedIntSetCC(Code: CC)) {
15364 if (!DAG.MaskedValueIsZero(Op: N->getOperand(Num: 0),
15365 Mask: APInt::getHighBitsSet(numBits: OpBits, hiBitsSet: OpBits-1)) ||
15366 !DAG.MaskedValueIsZero(Op: N->getOperand(Num: 1),
15367 Mask: APInt::getHighBitsSet(numBits: OpBits, hiBitsSet: OpBits-1)))
15368 return (N->getOpcode() == ISD::SETCC ? ConvertSETCCToSubtract(N, DCI)
15369 : SDValue());
15370 } else {
15371 // This is neither a signed nor an unsigned comparison, just make sure
15372 // that the high bits are equal.
15373 KnownBits Op1Known = DAG.computeKnownBits(Op: N->getOperand(Num: 0));
15374 KnownBits Op2Known = DAG.computeKnownBits(Op: N->getOperand(Num: 1));
15375
15376 // We don't really care about what is known about the first bit (if
15377 // anything), so pretend that it is known zero for both to ensure they can
15378 // be compared as constants.
15379 Op1Known.Zero.setBit(0); Op1Known.One.clearBit(BitPosition: 0);
15380 Op2Known.Zero.setBit(0); Op2Known.One.clearBit(BitPosition: 0);
15381
15382 if (!Op1Known.isConstant() || !Op2Known.isConstant() ||
15383 Op1Known.getConstant() != Op2Known.getConstant())
15384 return SDValue();
15385 }
15386 }
15387
15388 // We now know that the higher-order bits are irrelevant, we just need to
15389 // make sure that all of the intermediate operations are bit operations, and
15390 // all inputs are extensions.
15391 if (N->getOperand(Num: 0).getOpcode() != ISD::AND &&
15392 N->getOperand(Num: 0).getOpcode() != ISD::OR &&
15393 N->getOperand(Num: 0).getOpcode() != ISD::XOR &&
15394 N->getOperand(Num: 0).getOpcode() != ISD::SELECT &&
15395 N->getOperand(Num: 0).getOpcode() != ISD::SELECT_CC &&
15396 N->getOperand(Num: 0).getOpcode() != ISD::TRUNCATE &&
15397 N->getOperand(Num: 0).getOpcode() != ISD::SIGN_EXTEND &&
15398 N->getOperand(Num: 0).getOpcode() != ISD::ZERO_EXTEND &&
15399 N->getOperand(Num: 0).getOpcode() != ISD::ANY_EXTEND)
15400 return SDValue();
15401
15402 if ((N->getOpcode() == ISD::SETCC || N->getOpcode() == ISD::SELECT_CC) &&
15403 N->getOperand(Num: 1).getOpcode() != ISD::AND &&
15404 N->getOperand(Num: 1).getOpcode() != ISD::OR &&
15405 N->getOperand(Num: 1).getOpcode() != ISD::XOR &&
15406 N->getOperand(Num: 1).getOpcode() != ISD::SELECT &&
15407 N->getOperand(Num: 1).getOpcode() != ISD::SELECT_CC &&
15408 N->getOperand(Num: 1).getOpcode() != ISD::TRUNCATE &&
15409 N->getOperand(Num: 1).getOpcode() != ISD::SIGN_EXTEND &&
15410 N->getOperand(Num: 1).getOpcode() != ISD::ZERO_EXTEND &&
15411 N->getOperand(Num: 1).getOpcode() != ISD::ANY_EXTEND)
15412 return SDValue();
15413
15414 SmallVector<SDValue, 4> Inputs;
15415 SmallVector<SDValue, 8> BinOps, PromOps;
15416 SmallPtrSet<SDNode *, 16> Visited;
15417
15418 for (unsigned i = 0; i < 2; ++i) {
15419 if (((N->getOperand(Num: i).getOpcode() == ISD::SIGN_EXTEND ||
15420 N->getOperand(Num: i).getOpcode() == ISD::ZERO_EXTEND ||
15421 N->getOperand(Num: i).getOpcode() == ISD::ANY_EXTEND) &&
15422 N->getOperand(Num: i).getOperand(i: 0).getValueType() == MVT::i1) ||
15423 isa<ConstantSDNode>(Val: N->getOperand(Num: i)))
15424 Inputs.push_back(Elt: N->getOperand(Num: i));
15425 else
15426 BinOps.push_back(Elt: N->getOperand(Num: i));
15427
15428 if (N->getOpcode() == ISD::TRUNCATE)
15429 break;
15430 }
15431
15432 // Visit all inputs, collect all binary operations (and, or, xor and
15433 // select) that are all fed by extensions.
15434 while (!BinOps.empty()) {
15435 SDValue BinOp = BinOps.pop_back_val();
15436
15437 if (!Visited.insert(Ptr: BinOp.getNode()).second)
15438 continue;
15439
15440 PromOps.push_back(Elt: BinOp);
15441
15442 for (unsigned i = 0, ie = BinOp.getNumOperands(); i != ie; ++i) {
15443 // The condition of the select is not promoted.
15444 if (BinOp.getOpcode() == ISD::SELECT && i == 0)
15445 continue;
15446 if (BinOp.getOpcode() == ISD::SELECT_CC && i != 2 && i != 3)
15447 continue;
15448
15449 if (((BinOp.getOperand(i).getOpcode() == ISD::SIGN_EXTEND ||
15450 BinOp.getOperand(i).getOpcode() == ISD::ZERO_EXTEND ||
15451 BinOp.getOperand(i).getOpcode() == ISD::ANY_EXTEND) &&
15452 BinOp.getOperand(i).getOperand(i: 0).getValueType() == MVT::i1) ||
15453 isa<ConstantSDNode>(Val: BinOp.getOperand(i))) {
15454 Inputs.push_back(Elt: BinOp.getOperand(i));
15455 } else if (BinOp.getOperand(i).getOpcode() == ISD::AND ||
15456 BinOp.getOperand(i).getOpcode() == ISD::OR ||
15457 BinOp.getOperand(i).getOpcode() == ISD::XOR ||
15458 BinOp.getOperand(i).getOpcode() == ISD::SELECT ||
15459 BinOp.getOperand(i).getOpcode() == ISD::SELECT_CC ||
15460 BinOp.getOperand(i).getOpcode() == ISD::TRUNCATE ||
15461 BinOp.getOperand(i).getOpcode() == ISD::SIGN_EXTEND ||
15462 BinOp.getOperand(i).getOpcode() == ISD::ZERO_EXTEND ||
15463 BinOp.getOperand(i).getOpcode() == ISD::ANY_EXTEND) {
15464 BinOps.push_back(Elt: BinOp.getOperand(i));
15465 } else {
15466 // We have an input that is not an extension or another binary
15467 // operation; we'll abort this transformation.
15468 return SDValue();
15469 }
15470 }
15471 }
15472
15473 // Make sure that this is a self-contained cluster of operations (which
15474 // is not quite the same thing as saying that everything has only one
15475 // use).
15476 for (unsigned i = 0, ie = Inputs.size(); i != ie; ++i) {
15477 if (isa<ConstantSDNode>(Val: Inputs[i]))
15478 continue;
15479
15480 for (const SDNode *User : Inputs[i].getNode()->users()) {
15481 if (User != N && !Visited.count(Ptr: User))
15482 return SDValue();
15483
15484 // Make sure that we're not going to promote the non-output-value
15485 // operand(s) or SELECT or SELECT_CC.
15486 // FIXME: Although we could sometimes handle this, and it does occur in
15487 // practice that one of the condition inputs to the select is also one of
15488 // the outputs, we currently can't deal with this.
15489 if (User->getOpcode() == ISD::SELECT) {
15490 if (User->getOperand(Num: 0) == Inputs[i])
15491 return SDValue();
15492 } else if (User->getOpcode() == ISD::SELECT_CC) {
15493 if (User->getOperand(Num: 0) == Inputs[i] ||
15494 User->getOperand(Num: 1) == Inputs[i])
15495 return SDValue();
15496 }
15497 }
15498 }
15499
15500 for (unsigned i = 0, ie = PromOps.size(); i != ie; ++i) {
15501 for (const SDNode *User : PromOps[i].getNode()->users()) {
15502 if (User != N && !Visited.count(Ptr: User))
15503 return SDValue();
15504
15505 // Make sure that we're not going to promote the non-output-value
15506 // operand(s) or SELECT or SELECT_CC.
15507 // FIXME: Although we could sometimes handle this, and it does occur in
15508 // practice that one of the condition inputs to the select is also one of
15509 // the outputs, we currently can't deal with this.
15510 if (User->getOpcode() == ISD::SELECT) {
15511 if (User->getOperand(Num: 0) == PromOps[i])
15512 return SDValue();
15513 } else if (User->getOpcode() == ISD::SELECT_CC) {
15514 if (User->getOperand(Num: 0) == PromOps[i] ||
15515 User->getOperand(Num: 1) == PromOps[i])
15516 return SDValue();
15517 }
15518 }
15519 }
15520
15521 // Replace all inputs with the extension operand.
15522 for (unsigned i = 0, ie = Inputs.size(); i != ie; ++i) {
15523 // Constants may have users outside the cluster of to-be-promoted nodes,
15524 // and so we need to replace those as we do the promotions.
15525 if (isa<ConstantSDNode>(Val: Inputs[i]))
15526 continue;
15527 else
15528 DAG.ReplaceAllUsesOfValueWith(From: Inputs[i], To: Inputs[i].getOperand(i: 0));
15529 }
15530
15531 std::list<HandleSDNode> PromOpHandles;
15532 for (auto &PromOp : PromOps)
15533 PromOpHandles.emplace_back(args&: PromOp);
15534
15535 // Replace all operations (these are all the same, but have a different
15536 // (i1) return type). DAG.getNode will validate that the types of
15537 // a binary operator match, so go through the list in reverse so that
15538 // we've likely promoted both operands first. Any intermediate truncations or
15539 // extensions disappear.
15540 while (!PromOpHandles.empty()) {
15541 SDValue PromOp = PromOpHandles.back().getValue();
15542 PromOpHandles.pop_back();
15543
15544 if (PromOp.getOpcode() == ISD::TRUNCATE ||
15545 PromOp.getOpcode() == ISD::SIGN_EXTEND ||
15546 PromOp.getOpcode() == ISD::ZERO_EXTEND ||
15547 PromOp.getOpcode() == ISD::ANY_EXTEND) {
15548 if (!isa<ConstantSDNode>(Val: PromOp.getOperand(i: 0)) &&
15549 PromOp.getOperand(i: 0).getValueType() != MVT::i1) {
15550 // The operand is not yet ready (see comment below).
15551 PromOpHandles.emplace_front(args&: PromOp);
15552 continue;
15553 }
15554
15555 SDValue RepValue = PromOp.getOperand(i: 0);
15556 if (isa<ConstantSDNode>(Val: RepValue))
15557 RepValue = DAG.getNode(Opcode: ISD::TRUNCATE, DL: dl, VT: MVT::i1, Operand: RepValue);
15558
15559 DAG.ReplaceAllUsesOfValueWith(From: PromOp, To: RepValue);
15560 continue;
15561 }
15562
15563 unsigned C;
15564 switch (PromOp.getOpcode()) {
15565 default: C = 0; break;
15566 case ISD::SELECT: C = 1; break;
15567 case ISD::SELECT_CC: C = 2; break;
15568 }
15569
15570 if ((!isa<ConstantSDNode>(Val: PromOp.getOperand(i: C)) &&
15571 PromOp.getOperand(i: C).getValueType() != MVT::i1) ||
15572 (!isa<ConstantSDNode>(Val: PromOp.getOperand(i: C+1)) &&
15573 PromOp.getOperand(i: C+1).getValueType() != MVT::i1)) {
15574 // The to-be-promoted operands of this node have not yet been
15575 // promoted (this should be rare because we're going through the
15576 // list backward, but if one of the operands has several users in
15577 // this cluster of to-be-promoted nodes, it is possible).
15578 PromOpHandles.emplace_front(args&: PromOp);
15579 continue;
15580 }
15581
15582 SmallVector<SDValue, 3> Ops(PromOp.getNode()->ops());
15583
15584 // If there are any constant inputs, make sure they're replaced now.
15585 for (unsigned i = 0; i < 2; ++i)
15586 if (isa<ConstantSDNode>(Val: Ops[C+i]))
15587 Ops[C+i] = DAG.getNode(Opcode: ISD::TRUNCATE, DL: dl, VT: MVT::i1, Operand: Ops[C+i]);
15588
15589 DAG.ReplaceAllUsesOfValueWith(From: PromOp,
15590 To: DAG.getNode(Opcode: PromOp.getOpcode(), DL: dl, VT: MVT::i1, Ops));
15591 }
15592
15593 // Now we're left with the initial truncation itself.
15594 if (N->getOpcode() == ISD::TRUNCATE)
15595 return N->getOperand(Num: 0);
15596
15597 // Otherwise, this is a comparison. The operands to be compared have just
15598 // changed type (to i1), but everything else is the same.
15599 return SDValue(N, 0);
15600}
15601
15602SDValue PPCTargetLowering::DAGCombineExtBoolTrunc(SDNode *N,
15603 DAGCombinerInfo &DCI) const {
15604 SelectionDAG &DAG = DCI.DAG;
15605 SDLoc dl(N);
15606
15607 // If we're tracking CR bits, we need to be careful that we don't have:
15608 // zext(binary-ops(trunc(x), trunc(y)))
15609 // or
15610 // zext(binary-ops(binary-ops(trunc(x), trunc(y)), ...)
15611 // such that we're unnecessarily moving things into CR bits that can more
15612 // efficiently stay in GPRs. Note that if we're not certain that the high
15613 // bits are set as required by the final extension, we still may need to do
15614 // some masking to get the proper behavior.
15615
15616 // This same functionality is important on PPC64 when dealing with
15617 // 32-to-64-bit extensions; these occur often when 32-bit values are used as
15618 // the return values of functions. Because it is so similar, it is handled
15619 // here as well.
15620
15621 if (N->getValueType(ResNo: 0) != MVT::i32 &&
15622 N->getValueType(ResNo: 0) != MVT::i64)
15623 return SDValue();
15624
15625 if (!((N->getOperand(Num: 0).getValueType() == MVT::i1 && Subtarget.useCRBits()) ||
15626 (N->getOperand(Num: 0).getValueType() == MVT::i32 && Subtarget.isPPC64())))
15627 return SDValue();
15628
15629 if (N->getOperand(Num: 0).getOpcode() != ISD::AND &&
15630 N->getOperand(Num: 0).getOpcode() != ISD::OR &&
15631 N->getOperand(Num: 0).getOpcode() != ISD::XOR &&
15632 N->getOperand(Num: 0).getOpcode() != ISD::SELECT &&
15633 N->getOperand(Num: 0).getOpcode() != ISD::SELECT_CC)
15634 return SDValue();
15635
15636 SmallVector<SDValue, 4> Inputs;
15637 SmallVector<SDValue, 8> BinOps(1, N->getOperand(Num: 0)), PromOps;
15638 SmallPtrSet<SDNode *, 16> Visited;
15639
15640 // Visit all inputs, collect all binary operations (and, or, xor and
15641 // select) that are all fed by truncations.
15642 while (!BinOps.empty()) {
15643 SDValue BinOp = BinOps.pop_back_val();
15644
15645 if (!Visited.insert(Ptr: BinOp.getNode()).second)
15646 continue;
15647
15648 PromOps.push_back(Elt: BinOp);
15649
15650 for (unsigned i = 0, ie = BinOp.getNumOperands(); i != ie; ++i) {
15651 // The condition of the select is not promoted.
15652 if (BinOp.getOpcode() == ISD::SELECT && i == 0)
15653 continue;
15654 if (BinOp.getOpcode() == ISD::SELECT_CC && i != 2 && i != 3)
15655 continue;
15656
15657 if (BinOp.getOperand(i).getOpcode() == ISD::TRUNCATE ||
15658 isa<ConstantSDNode>(Val: BinOp.getOperand(i))) {
15659 Inputs.push_back(Elt: BinOp.getOperand(i));
15660 } else if (BinOp.getOperand(i).getOpcode() == ISD::AND ||
15661 BinOp.getOperand(i).getOpcode() == ISD::OR ||
15662 BinOp.getOperand(i).getOpcode() == ISD::XOR ||
15663 BinOp.getOperand(i).getOpcode() == ISD::SELECT ||
15664 BinOp.getOperand(i).getOpcode() == ISD::SELECT_CC) {
15665 BinOps.push_back(Elt: BinOp.getOperand(i));
15666 } else {
15667 // We have an input that is not a truncation or another binary
15668 // operation; we'll abort this transformation.
15669 return SDValue();
15670 }
15671 }
15672 }
15673
15674 // The operands of a select that must be truncated when the select is
15675 // promoted because the operand is actually part of the to-be-promoted set.
15676 DenseMap<SDNode *, EVT> SelectTruncOp[2];
15677
15678 // Make sure that this is a self-contained cluster of operations (which
15679 // is not quite the same thing as saying that everything has only one
15680 // use).
15681 for (unsigned i = 0, ie = Inputs.size(); i != ie; ++i) {
15682 if (isa<ConstantSDNode>(Val: Inputs[i]))
15683 continue;
15684
15685 for (SDNode *User : Inputs[i].getNode()->users()) {
15686 if (User != N && !Visited.count(Ptr: User))
15687 return SDValue();
15688
15689 // If we're going to promote the non-output-value operand(s) or SELECT or
15690 // SELECT_CC, record them for truncation.
15691 if (User->getOpcode() == ISD::SELECT) {
15692 if (User->getOperand(Num: 0) == Inputs[i])
15693 SelectTruncOp[0].insert(KV: std::make_pair(x&: User,
15694 y: User->getOperand(Num: 0).getValueType()));
15695 } else if (User->getOpcode() == ISD::SELECT_CC) {
15696 if (User->getOperand(Num: 0) == Inputs[i])
15697 SelectTruncOp[0].insert(KV: std::make_pair(x&: User,
15698 y: User->getOperand(Num: 0).getValueType()));
15699 if (User->getOperand(Num: 1) == Inputs[i])
15700 SelectTruncOp[1].insert(KV: std::make_pair(x&: User,
15701 y: User->getOperand(Num: 1).getValueType()));
15702 }
15703 }
15704 }
15705
15706 for (unsigned i = 0, ie = PromOps.size(); i != ie; ++i) {
15707 for (SDNode *User : PromOps[i].getNode()->users()) {
15708 if (User != N && !Visited.count(Ptr: User))
15709 return SDValue();
15710
15711 // If we're going to promote the non-output-value operand(s) or SELECT or
15712 // SELECT_CC, record them for truncation.
15713 if (User->getOpcode() == ISD::SELECT) {
15714 if (User->getOperand(Num: 0) == PromOps[i])
15715 SelectTruncOp[0].insert(KV: std::make_pair(x&: User,
15716 y: User->getOperand(Num: 0).getValueType()));
15717 } else if (User->getOpcode() == ISD::SELECT_CC) {
15718 if (User->getOperand(Num: 0) == PromOps[i])
15719 SelectTruncOp[0].insert(KV: std::make_pair(x&: User,
15720 y: User->getOperand(Num: 0).getValueType()));
15721 if (User->getOperand(Num: 1) == PromOps[i])
15722 SelectTruncOp[1].insert(KV: std::make_pair(x&: User,
15723 y: User->getOperand(Num: 1).getValueType()));
15724 }
15725 }
15726 }
15727
15728 unsigned PromBits = N->getOperand(Num: 0).getValueSizeInBits();
15729 bool ReallyNeedsExt = false;
15730 if (N->getOpcode() != ISD::ANY_EXTEND) {
15731 // If all of the inputs are not already sign/zero extended, then
15732 // we'll still need to do that at the end.
15733 for (unsigned i = 0, ie = Inputs.size(); i != ie; ++i) {
15734 if (isa<ConstantSDNode>(Val: Inputs[i]))
15735 continue;
15736
15737 unsigned OpBits =
15738 Inputs[i].getOperand(i: 0).getValueSizeInBits();
15739 assert(PromBits < OpBits && "Truncation not to a smaller bit count?");
15740
15741 if ((N->getOpcode() == ISD::ZERO_EXTEND &&
15742 !DAG.MaskedValueIsZero(Op: Inputs[i].getOperand(i: 0),
15743 Mask: APInt::getHighBitsSet(numBits: OpBits,
15744 hiBitsSet: OpBits-PromBits))) ||
15745 (N->getOpcode() == ISD::SIGN_EXTEND &&
15746 DAG.ComputeNumSignBits(Op: Inputs[i].getOperand(i: 0)) <
15747 (OpBits-(PromBits-1)))) {
15748 ReallyNeedsExt = true;
15749 break;
15750 }
15751 }
15752 }
15753
15754 // Convert PromOps to handles before doing any RAUW operations, as these
15755 // may CSE with existing nodes, deleting the originals.
15756 std::list<HandleSDNode> PromOpHandles;
15757 for (auto &PromOp : PromOps)
15758 PromOpHandles.emplace_back(args&: PromOp);
15759
15760 // Replace all inputs, either with the truncation operand, or a
15761 // truncation or extension to the final output type.
15762 for (unsigned i = 0, ie = Inputs.size(); i != ie; ++i) {
15763 // Constant inputs need to be replaced with the to-be-promoted nodes that
15764 // use them because they might have users outside of the cluster of
15765 // promoted nodes.
15766 if (isa<ConstantSDNode>(Val: Inputs[i]))
15767 continue;
15768
15769 SDValue InSrc = Inputs[i].getOperand(i: 0);
15770 if (Inputs[i].getValueType() == N->getValueType(ResNo: 0))
15771 DAG.ReplaceAllUsesOfValueWith(From: Inputs[i], To: InSrc);
15772 else if (N->getOpcode() == ISD::SIGN_EXTEND)
15773 DAG.ReplaceAllUsesOfValueWith(From: Inputs[i],
15774 To: DAG.getSExtOrTrunc(Op: InSrc, DL: dl, VT: N->getValueType(ResNo: 0)));
15775 else if (N->getOpcode() == ISD::ZERO_EXTEND)
15776 DAG.ReplaceAllUsesOfValueWith(From: Inputs[i],
15777 To: DAG.getZExtOrTrunc(Op: InSrc, DL: dl, VT: N->getValueType(ResNo: 0)));
15778 else
15779 DAG.ReplaceAllUsesOfValueWith(From: Inputs[i],
15780 To: DAG.getAnyExtOrTrunc(Op: InSrc, DL: dl, VT: N->getValueType(ResNo: 0)));
15781 }
15782
15783 // Replace all operations (these are all the same, but have a different
15784 // (promoted) return type). DAG.getNode will validate that the types of
15785 // a binary operator match, so go through the list in reverse so that
15786 // we've likely promoted both operands first.
15787 while (!PromOpHandles.empty()) {
15788 SDValue PromOp = PromOpHandles.back().getValue();
15789 PromOpHandles.pop_back();
15790
15791 unsigned C;
15792 switch (PromOp.getOpcode()) {
15793 default: C = 0; break;
15794 case ISD::SELECT: C = 1; break;
15795 case ISD::SELECT_CC: C = 2; break;
15796 }
15797
15798 if ((!isa<ConstantSDNode>(Val: PromOp.getOperand(i: C)) &&
15799 PromOp.getOperand(i: C).getValueType() != N->getValueType(ResNo: 0)) ||
15800 (!isa<ConstantSDNode>(Val: PromOp.getOperand(i: C+1)) &&
15801 PromOp.getOperand(i: C+1).getValueType() != N->getValueType(ResNo: 0))) {
15802 // The to-be-promoted operands of this node have not yet been
15803 // promoted (this should be rare because we're going through the
15804 // list backward, but if one of the operands has several users in
15805 // this cluster of to-be-promoted nodes, it is possible).
15806 PromOpHandles.emplace_front(args&: PromOp);
15807 continue;
15808 }
15809
15810 // For SELECT and SELECT_CC nodes, we do a similar check for any
15811 // to-be-promoted comparison inputs.
15812 if (PromOp.getOpcode() == ISD::SELECT ||
15813 PromOp.getOpcode() == ISD::SELECT_CC) {
15814 if ((SelectTruncOp[0].count(Val: PromOp.getNode()) &&
15815 PromOp.getOperand(i: 0).getValueType() != N->getValueType(ResNo: 0)) ||
15816 (SelectTruncOp[1].count(Val: PromOp.getNode()) &&
15817 PromOp.getOperand(i: 1).getValueType() != N->getValueType(ResNo: 0))) {
15818 PromOpHandles.emplace_front(args&: PromOp);
15819 continue;
15820 }
15821 }
15822
15823 SmallVector<SDValue, 3> Ops(PromOp.getNode()->ops());
15824
15825 // If this node has constant inputs, then they'll need to be promoted here.
15826 for (unsigned i = 0; i < 2; ++i) {
15827 if (!isa<ConstantSDNode>(Val: Ops[C+i]))
15828 continue;
15829 if (Ops[C+i].getValueType() == N->getValueType(ResNo: 0))
15830 continue;
15831
15832 if (N->getOpcode() == ISD::SIGN_EXTEND)
15833 Ops[C+i] = DAG.getSExtOrTrunc(Op: Ops[C+i], DL: dl, VT: N->getValueType(ResNo: 0));
15834 else if (N->getOpcode() == ISD::ZERO_EXTEND)
15835 Ops[C+i] = DAG.getZExtOrTrunc(Op: Ops[C+i], DL: dl, VT: N->getValueType(ResNo: 0));
15836 else
15837 Ops[C+i] = DAG.getAnyExtOrTrunc(Op: Ops[C+i], DL: dl, VT: N->getValueType(ResNo: 0));
15838 }
15839
15840 // If we've promoted the comparison inputs of a SELECT or SELECT_CC,
15841 // truncate them again to the original value type.
15842 if (PromOp.getOpcode() == ISD::SELECT ||
15843 PromOp.getOpcode() == ISD::SELECT_CC) {
15844 auto SI0 = SelectTruncOp[0].find(Val: PromOp.getNode());
15845 if (SI0 != SelectTruncOp[0].end())
15846 Ops[0] = DAG.getNode(Opcode: ISD::TRUNCATE, DL: dl, VT: SI0->second, Operand: Ops[0]);
15847 auto SI1 = SelectTruncOp[1].find(Val: PromOp.getNode());
15848 if (SI1 != SelectTruncOp[1].end())
15849 Ops[1] = DAG.getNode(Opcode: ISD::TRUNCATE, DL: dl, VT: SI1->second, Operand: Ops[1]);
15850 }
15851
15852 DAG.ReplaceAllUsesOfValueWith(From: PromOp,
15853 To: DAG.getNode(Opcode: PromOp.getOpcode(), DL: dl, VT: N->getValueType(ResNo: 0), Ops));
15854 }
15855
15856 // Now we're left with the initial extension itself.
15857 if (!ReallyNeedsExt)
15858 return N->getOperand(Num: 0);
15859
15860 // To zero extend, just mask off everything except for the first bit (in the
15861 // i1 case).
15862 if (N->getOpcode() == ISD::ZERO_EXTEND)
15863 return DAG.getNode(Opcode: ISD::AND, DL: dl, VT: N->getValueType(ResNo: 0), N1: N->getOperand(Num: 0),
15864 N2: DAG.getConstant(Val: APInt::getLowBitsSet(
15865 numBits: N->getValueSizeInBits(ResNo: 0), loBitsSet: PromBits),
15866 DL: dl, VT: N->getValueType(ResNo: 0)));
15867
15868 assert(N->getOpcode() == ISD::SIGN_EXTEND &&
15869 "Invalid extension type");
15870 EVT ShiftAmountTy = getShiftAmountTy(LHSTy: N->getValueType(ResNo: 0), DL: DAG.getDataLayout());
15871 SDValue ShiftCst =
15872 DAG.getConstant(Val: N->getValueSizeInBits(ResNo: 0) - PromBits, DL: dl, VT: ShiftAmountTy);
15873 return DAG.getNode(
15874 Opcode: ISD::SRA, DL: dl, VT: N->getValueType(ResNo: 0),
15875 N1: DAG.getNode(Opcode: ISD::SHL, DL: dl, VT: N->getValueType(ResNo: 0), N1: N->getOperand(Num: 0), N2: ShiftCst),
15876 N2: ShiftCst);
15877}
15878
15879// The function check a i128 load can convert to 16i8 load for Vcmpequb.
15880static bool canConvertToVcmpequb(SDValue &LHS, SDValue &RHS, bool IsPPC64) {
15881
15882 auto isValidForConvert = [IsPPC64](SDValue &Operand) {
15883 if (!Operand.hasOneUse())
15884 return false;
15885
15886 if (Operand.getValueType() != MVT::i128)
15887 return false;
15888
15889 if (Operand.getOpcode() == ISD::Constant) {
15890 auto *C = cast<ConstantSDNode>(Val&: Operand);
15891 const APInt &Val = C->getAPIntValue();
15892 // On PPC64, comparing an i128 value loaded from memory against a
15893 // constant smaller than 2^16 is usually better left to scalar lowering.
15894 // In that case, the compare can be lowered using xori (since xori has a
15895 // 16-bit immediate field), which is cheaper than materializing a vector
15896 // constant and using vcmpequb.
15897 if (IsPPC64 && Val.ult(RHS: 1ULL << 16))
15898 return false;
15899 return true;
15900 }
15901
15902 auto *LoadNode = dyn_cast<LoadSDNode>(Val&: Operand);
15903 if (!LoadNode)
15904 return false;
15905
15906 // If memory operation is volatile, do not perform any
15907 // optimization or transformation. Volatile operations must be preserved
15908 // as written to ensure correct program behavior, so we return an empty
15909 // SDValue to indicate no action.
15910
15911 if (LoadNode->isVolatile())
15912 return false;
15913
15914 // Only combine loads if both use the unindexed addressing mode.
15915 // PowerPC AltiVec/VMX does not support vector loads or stores with
15916 // pre/post-increment addressing. Indexed modes may imply implicit
15917 // pointer updates, which are not compatible with AltiVec vector
15918 // instructions.
15919 if (LoadNode->getAddressingMode() != ISD::UNINDEXED)
15920 return false;
15921
15922 // Only combine loads if both are non-extending loads
15923 // (ISD::NON_EXTLOAD). Extending loads (such as ISD::ZEXTLOAD or
15924 // ISD::SEXTLOAD) perform zero or sign extension, which may change the
15925 // loaded value's semantics and are not compatible with vector loads.
15926 if (LoadNode->getExtensionType() != ISD::NON_EXTLOAD)
15927 return false;
15928
15929 return true;
15930 };
15931
15932 return (isValidForConvert(LHS) && isValidForConvert(RHS));
15933}
15934
15935SDValue convertTwoLoadsAndCmpToVCMPEQUB(SelectionDAG &DAG, SDNode *N,
15936 const SDLoc &DL) {
15937
15938 assert(N->getOpcode() == ISD::SETCC && "Should be called with a SETCC node");
15939
15940 ISD::CondCode CC = cast<CondCodeSDNode>(Val: N->getOperand(Num: 2))->get();
15941 assert((CC == ISD::SETNE || CC == ISD::SETEQ) &&
15942 "CC mus be ISD::SETNE or ISD::SETEQ");
15943
15944 auto getV16i8Load = [&](const SDValue &Operand) {
15945 if (Operand.getOpcode() == ISD::Constant)
15946 return DAG.getBitcast(VT: MVT::v16i8, V: Operand);
15947
15948 assert(Operand.getOpcode() == ISD::LOAD && "Must be LoadSDNode here.");
15949
15950 auto *LoadNode = cast<LoadSDNode>(Val: Operand);
15951 // Create a new MachineMemOperand without range metadata.
15952 // Range metadata is only valid for integer scalar types, not vectors.
15953 // The original i128 load may have range metadata, but when we convert
15954 // to v16i8, that metadata is no longer semantically valid.
15955 MachineMemOperand *MMO = LoadNode->getMemOperand();
15956 MachineFunction &MF = DAG.getMachineFunction();
15957 MachineMemOperand *NewMMO = MF.getMachineMemOperand(
15958 PtrInfo: MMO->getPointerInfo(), F: MMO->getFlags(), Size: MMO->getSize(), BaseAlignment: MMO->getAlign(),
15959 AAInfo: MMO->getAAInfo(), Ranges: nullptr, SSID: MMO->getSyncScopeID(),
15960 Ordering: MMO->getSuccessOrdering(), FailureOrdering: MMO->getFailureOrdering());
15961 SDValue NewLoad = DAG.getLoad(VT: MVT::v16i8, dl: DL, Chain: LoadNode->getChain(),
15962 Ptr: LoadNode->getBasePtr(), MMO: NewMMO);
15963 DAG.ReplaceAllUsesOfValueWith(From: SDValue(LoadNode, 1), To: NewLoad.getValue(R: 1));
15964 return NewLoad;
15965 };
15966
15967 // Following code transforms the DAG
15968 // t0: ch,glue = EntryToken
15969 // t2: i64,ch = CopyFromReg t0, Register:i64 %0
15970 // t3: i128,ch = load<(load (s128) from %ir.a, align 1)> t0, t2,
15971 // undef:i64
15972 // t4: i64,ch = CopyFromReg t0, Register:i64 %1
15973 // t5: i128,ch =
15974 // load<(load (s128) from %ir.b, align 1)> t0, t4, undef:i64 t6: i1 =
15975 // setcc t3, t5, setne:ch
15976 //
15977 // ---->
15978 //
15979 // t0: ch,glue = EntryToken
15980 // t2: i64,ch = CopyFromReg t0, Register:i64 %0
15981 // t3: v16i8,ch = load<(load (s128) from %ir.a, align 1)> t0, t2,
15982 // undef:i64
15983 // t4: i64,ch = CopyFromReg t0, Register:i64 %1
15984 // t5: v16i8,ch =
15985 // load<(load (s128) from %ir.b, align 1)> t0, t4, undef:i64
15986 // t6: i32 =
15987 // llvm.ppc.altivec.vcmpequb.p TargetConstant:i32<10505>,
15988 // Constant:i32<2>, t3, t5
15989 // t7: i1 = setcc t6, Constant:i32<0>, seteq:ch
15990
15991 // Or transforms the DAG
15992 // t5: i128,ch = load<(load (s128) from %ir.X, align 1)> t0, t2, undef:i64
15993 // t8: i1 =
15994 // setcc Constant:i128<237684487579686500932345921536>, t5, setne:ch
15995 //
15996 // --->
15997 //
15998 // t5: v16i8,ch = load<(load (s128) from %ir.X, align 1)> t0, t2, undef:i64
15999 // t6: v16i8 = bitcast Constant:i128<237684487579686500932345921536>
16000 // t7: i32 =
16001 // llvm.ppc.altivec.vcmpequb.p Constant:i32<10962>, Constant:i32<2>, t5, t2
16002
16003 SDValue LHSVec = getV16i8Load(N->getOperand(Num: 0));
16004 SDValue RHSVec = getV16i8Load(N->getOperand(Num: 1));
16005
16006 SDValue IntrID =
16007 DAG.getConstant(Val: Intrinsic::ppc_altivec_vcmpequb_p, DL, VT: MVT::i32);
16008 SDValue CRSel = DAG.getConstant(Val: 2, DL, VT: MVT::i32); // which CR6 predicate field
16009 SDValue PredResult = DAG.getNode(Opcode: ISD::INTRINSIC_WO_CHAIN, DL, VT: MVT::i32,
16010 N1: IntrID, N2: CRSel, N3: LHSVec, N4: RHSVec);
16011 // ppc_altivec_vcmpequb_p returns 1 when two vectors are the same,
16012 // so we need to invert the CC opcode.
16013 return DAG.getSetCC(DL, VT: N->getValueType(ResNo: 0), LHS: PredResult,
16014 RHS: DAG.getConstant(Val: 0, DL, VT: MVT::i32),
16015 Cond: CC == ISD::SETNE ? ISD::SETEQ : ISD::SETNE);
16016}
16017
16018// Detect whether there is a pattern like (setcc (and X, 1), 0, eq).
16019// If it is , return true; otherwise return false.
16020static bool canConvertSETCCToXori(SDNode *N) {
16021 assert(N->getOpcode() == ISD::SETCC && "Should be SETCC SDNode here.");
16022
16023 ISD::CondCode CC = cast<CondCodeSDNode>(Val: N->getOperand(Num: 2))->get();
16024 if (CC != ISD::SETEQ)
16025 return false;
16026
16027 SDValue LHS = N->getOperand(Num: 0);
16028 SDValue RHS = N->getOperand(Num: 1);
16029
16030 // Check the `SDValue &V` is from `and` with `1`.
16031 auto IsAndWithOne = [](SDValue &V) {
16032 if (V.getOpcode() == ISD::AND) {
16033 for (const SDValue &Op : V->ops())
16034 if (auto *C = dyn_cast<ConstantSDNode>(Val: Op))
16035 if (C->isOne())
16036 return true;
16037 }
16038 return false;
16039 };
16040
16041 // Check whether the SETCC compare with zero.
16042 auto IsCompareWithZero = [](SDValue &V) {
16043 if (auto *C = dyn_cast<ConstantSDNode>(Val&: V))
16044 if (C->isZero())
16045 return true;
16046 return false;
16047 };
16048
16049 return (IsAndWithOne(LHS) && IsCompareWithZero(RHS)) ||
16050 (IsAndWithOne(RHS) && IsCompareWithZero(LHS));
16051}
16052
16053// You must check whether the `SDNode* N` can be converted to Xori using
16054// the function `static bool canConvertSETCCToXori(SDNode *N)`
16055// before calling the function; otherwise, it may produce incorrect results.
16056static SDValue ConvertSETCCToXori(SDNode *N, SelectionDAG &DAG) {
16057
16058 assert(N->getOpcode() == ISD::SETCC && "Should be SETCC SDNode here.");
16059 SDValue LHS = N->getOperand(Num: 0);
16060 SDValue RHS = N->getOperand(Num: 1);
16061 SDLoc DL(N);
16062
16063 [[maybe_unused]] ISD::CondCode CC =
16064 cast<CondCodeSDNode>(Val: N->getOperand(Num: 2))->get();
16065 assert((CC == ISD::SETEQ) && "CC must be ISD::SETEQ.");
16066 // Rewrite it as XORI (and X, 1), 1.
16067 auto MakeXor1 = [&](SDValue V) {
16068 EVT VT = V.getValueType();
16069 SDValue One = DAG.getConstant(Val: 1, DL, VT);
16070 SDValue Xor = DAG.getNode(Opcode: ISD::XOR, DL, VT, N1: V, N2: One);
16071 return DAG.getNode(Opcode: ISD::TRUNCATE, DL, VT: MVT::i1, Operand: Xor);
16072 };
16073
16074 if (LHS.getOpcode() == ISD::AND && RHS.getOpcode() != ISD::AND)
16075 return MakeXor1(LHS);
16076
16077 if (RHS.getOpcode() == ISD::AND && LHS.getOpcode() != ISD::AND)
16078 return MakeXor1(RHS);
16079
16080 llvm_unreachable("Should not reach here.");
16081}
16082
16083// Match `sext(setcc X, 0, eq)` and turn it into an ADDIC/SUBFE sequence.
16084//
16085// This generates code for:
16086// X == 0 ? -1 : 0
16087//
16088// On pre-ISA 3.1 targets, this is better than the longer CNTLZW/SRWI/NEG
16089// sequence. This is useful for cases like:
16090// uint8_t f(uint8_t x) { return (x == 0) ? -1 : 0; }
16091//
16092// ISA 3.1+ is skipped because those targets can use SETBC.
16093
16094SDValue PPCTargetLowering::combineSignExtendSetCC(SDNode *N,
16095 DAGCombinerInfo &DCI) const {
16096 if (Subtarget.isISA3_1())
16097 return SDValue();
16098
16099 EVT VT = N->getValueType(ResNo: 0);
16100 if (VT != MVT::i32 && VT != MVT::i64)
16101 return SDValue();
16102
16103 SDValue N0 = N->getOperand(Num: 0);
16104 if (N0.getOpcode() != ISD::SETCC)
16105 return SDValue();
16106
16107 ISD::CondCode CC = cast<CondCodeSDNode>(Val: N0.getOperand(i: 2))->get();
16108 SDValue LHS = N0.getOperand(i: 0);
16109 SDValue RHS = N0.getOperand(i: 1);
16110
16111 // Not match: sext (setcc x, 0, eq) or sext (setcc 0, x, eq)
16112 if (CC != ISD::SETEQ || (!isNullConstant(V: LHS) && !isNullConstant(V: RHS)))
16113 return SDValue();
16114
16115 SDLoc dl(N);
16116 SelectionDAG &DAG = DCI.DAG;
16117 SDValue X = isNullConstant(V: LHS) ? RHS : LHS;
16118 EVT XVT = X.getValueType(); // The type of x in the setcc x, 0, eq.
16119
16120 if ((XVT == MVT::i64 || VT == MVT::i64) && !Subtarget.isPPC64())
16121 return SDValue();
16122
16123 // On PPC64, i32 carry operations use the full 64-bit XER register,
16124 // so we must use i64 operations to avoid incorrect results.
16125 // Use i64 operations and truncate the result if needed.
16126 if (XVT != MVT::i64 && Subtarget.isPPC64())
16127 // Zero-extend if input type is not 64bits.
16128 X = DAG.getNode(Opcode: ISD::ZERO_EXTEND, DL: dl, VT: MVT::i64, Operand: X);
16129
16130 EVT OpVT = Subtarget.isPPC64() ? MVT::i64 : MVT::i32;
16131
16132 // Generate: SUBFE(ADDC(X, -1)).
16133 SDValue MinusOne = DAG.getAllOnesConstant(DL: dl, VT: OpVT);
16134 SDValue Addc =
16135 DAG.getNode(Opcode: PPCISD::ADDC, DL: dl, VTList: DAG.getVTList(VT1: OpVT, VT2: MVT::i32), N1: X, N2: MinusOne);
16136 SDValue Carry = Addc.getValue(R: 1);
16137 SDValue Sube = DAG.getNode(Opcode: PPCISD::SUBE, DL: dl, VTList: DAG.getVTList(VT1: OpVT, VT2: MVT::i32),
16138 N1: Addc, N2: Addc, N3: Carry);
16139
16140 // Truncate back to i32 if we used i64 operations.
16141 if (OpVT == MVT::i64 && VT == MVT::i32)
16142 return DAG.getNode(Opcode: ISD::TRUNCATE, DL: dl, VT, Operand: Sube);
16143
16144 return Sube;
16145}
16146
16147SDValue PPCTargetLowering::combineSetCC(SDNode *N,
16148 DAGCombinerInfo &DCI) const {
16149 assert(N->getOpcode() == ISD::SETCC &&
16150 "Should be called with a SETCC node");
16151
16152 // Check if the pattern (setcc (and X, 1), 0, eq) is present.
16153 // If it is, rewrite it as XORI (and X, 1), 1.
16154 if (canConvertSETCCToXori(N))
16155 return ConvertSETCCToXori(N, DAG&: DCI.DAG);
16156
16157 ISD::CondCode CC = cast<CondCodeSDNode>(Val: N->getOperand(Num: 2))->get();
16158 if (CC == ISD::SETNE || CC == ISD::SETEQ) {
16159 SDValue LHS = N->getOperand(Num: 0);
16160 SDValue RHS = N->getOperand(Num: 1);
16161
16162 // If there is a '0 - y' pattern, canonicalize the pattern to the RHS.
16163 if (LHS.getOpcode() == ISD::SUB && isNullConstant(V: LHS.getOperand(i: 0)) &&
16164 LHS.hasOneUse())
16165 std::swap(a&: LHS, b&: RHS);
16166
16167 // x == 0-y --> x+y == 0
16168 // x != 0-y --> x+y != 0
16169 if (RHS.getOpcode() == ISD::SUB && isNullConstant(V: RHS.getOperand(i: 0)) &&
16170 RHS.hasOneUse()) {
16171 SDLoc DL(N);
16172 SelectionDAG &DAG = DCI.DAG;
16173 EVT VT = N->getValueType(ResNo: 0);
16174 EVT OpVT = LHS.getValueType();
16175 SDValue Add = DAG.getNode(Opcode: ISD::ADD, DL, VT: OpVT, N1: LHS, N2: RHS.getOperand(i: 1));
16176 return DAG.getSetCC(DL, VT, LHS: Add, RHS: DAG.getConstant(Val: 0, DL, VT: OpVT), Cond: CC);
16177 }
16178
16179 // Optimization: Fold i128 equality/inequality compares of two loads into a
16180 // vectorized compare using vcmpequb.p when Altivec is available.
16181 //
16182 // Rationale:
16183 // A scalar i128 SETCC (eq/ne) normally lowers to multiple scalar ops.
16184 // On VSX-capable subtargets, we can instead reinterpret the i128 loads
16185 // as v16i8 vectors and use the Altive vcmpequb.p instruction to
16186 // perform a full 128-bit equality check in a single vector compare.
16187 //
16188 // Example Result:
16189 // This transformation replaces memcmp(a, b, 16) with two vector loads
16190 // and one vector compare instruction.
16191
16192 if (Subtarget.hasAltivec() &&
16193 canConvertToVcmpequb(LHS, RHS, IsPPC64: Subtarget.isPPC64()))
16194 return convertTwoLoadsAndCmpToVCMPEQUB(DAG&: DCI.DAG, N, DL: SDLoc(N));
16195 }
16196
16197 return DAGCombineTruncBoolExt(N, DCI);
16198}
16199
16200// Is this an extending load from an f32 to an f64?
16201static bool isFPExtLoad(SDValue Op) {
16202 if (LoadSDNode *LD = dyn_cast<LoadSDNode>(Val: Op.getNode()))
16203 return LD->getExtensionType() == ISD::EXTLOAD &&
16204 Op.getValueType() == MVT::f64;
16205 return false;
16206}
16207
16208/// Reduces the number of fp-to-int conversion when building a vector.
16209///
16210/// If this vector is built out of floating to integer conversions,
16211/// transform it to a vector built out of floating point values followed by a
16212/// single floating to integer conversion of the vector.
16213/// Namely (build_vector (fptosi $A), (fptosi $B), ...)
16214/// becomes (fptosi (build_vector ($A, $B, ...)))
16215SDValue PPCTargetLowering::
16216combineElementTruncationToVectorTruncation(SDNode *N,
16217 DAGCombinerInfo &DCI) const {
16218 assert(N->getOpcode() == ISD::BUILD_VECTOR &&
16219 "Should be called with a BUILD_VECTOR node");
16220
16221 SelectionDAG &DAG = DCI.DAG;
16222 SDLoc dl(N);
16223
16224 SDValue FirstInput = N->getOperand(Num: 0);
16225 assert(FirstInput.getOpcode() == PPCISD::MFVSR &&
16226 "The input operand must be an fp-to-int conversion.");
16227
16228 // This combine happens after legalization so the fp_to_[su]i nodes are
16229 // already converted to PPCSISD nodes.
16230 unsigned FirstConversion = FirstInput.getOperand(i: 0).getOpcode();
16231 if (FirstConversion == PPCISD::FCTIDZ ||
16232 FirstConversion == PPCISD::FCTIDUZ ||
16233 FirstConversion == PPCISD::FCTIWZ ||
16234 FirstConversion == PPCISD::FCTIWUZ) {
16235 bool IsSplat = true;
16236 bool Is32Bit = FirstConversion == PPCISD::FCTIWZ ||
16237 FirstConversion == PPCISD::FCTIWUZ;
16238 EVT SrcVT = FirstInput.getOperand(i: 0).getValueType();
16239 SmallVector<SDValue, 4> Ops;
16240 EVT TargetVT = N->getValueType(ResNo: 0);
16241 for (int i = 0, e = N->getNumOperands(); i < e; ++i) {
16242 SDValue NextOp = N->getOperand(Num: i);
16243 if (NextOp.getOpcode() != PPCISD::MFVSR)
16244 return SDValue();
16245 unsigned NextConversion = NextOp.getOperand(i: 0).getOpcode();
16246 if (NextConversion != FirstConversion)
16247 return SDValue();
16248 // If we are converting to 32-bit integers, we need to add an FP_ROUND.
16249 // This is not valid if the input was originally double precision. It is
16250 // also not profitable to do unless this is an extending load in which
16251 // case doing this combine will allow us to combine consecutive loads.
16252 if (Is32Bit && !isFPExtLoad(Op: NextOp.getOperand(i: 0).getOperand(i: 0)))
16253 return SDValue();
16254 if (N->getOperand(Num: i) != FirstInput)
16255 IsSplat = false;
16256 }
16257
16258 // If this is a splat, we leave it as-is since there will be only a single
16259 // fp-to-int conversion followed by a splat of the integer. This is better
16260 // for 32-bit and smaller ints and neutral for 64-bit ints.
16261 if (IsSplat)
16262 return SDValue();
16263
16264 // Now that we know we have the right type of node, get its operands
16265 for (int i = 0, e = N->getNumOperands(); i < e; ++i) {
16266 SDValue In = N->getOperand(Num: i).getOperand(i: 0);
16267 if (Is32Bit) {
16268 // For 32-bit values, we need to add an FP_ROUND node (if we made it
16269 // here, we know that all inputs are extending loads so this is safe).
16270 if (In.isUndef())
16271 Ops.push_back(Elt: DAG.getUNDEF(VT: SrcVT));
16272 else {
16273 SDValue Trunc =
16274 DAG.getNode(Opcode: ISD::FP_ROUND, DL: dl, VT: MVT::f32, N1: In.getOperand(i: 0),
16275 N2: DAG.getIntPtrConstant(Val: 1, DL: dl, /*isTarget=*/true));
16276 Ops.push_back(Elt: Trunc);
16277 }
16278 } else
16279 Ops.push_back(Elt: In.isUndef() ? DAG.getUNDEF(VT: SrcVT) : In.getOperand(i: 0));
16280 }
16281
16282 unsigned Opcode;
16283 if (FirstConversion == PPCISD::FCTIDZ ||
16284 FirstConversion == PPCISD::FCTIWZ)
16285 Opcode = ISD::FP_TO_SINT;
16286 else
16287 Opcode = ISD::FP_TO_UINT;
16288
16289 EVT NewVT = TargetVT == MVT::v2i64 ? MVT::v2f64 : MVT::v4f32;
16290 SDValue BV = DAG.getBuildVector(VT: NewVT, DL: dl, Ops);
16291 return DAG.getNode(Opcode, DL: dl, VT: TargetVT, Operand: BV);
16292 }
16293 return SDValue();
16294}
16295
// The LXVKQ instruction loads a VSX vector with a special quadword value
// based on an immediate value. This helper method returns the details of the
// match as a tuple of {LXVKQ unsigned IMM Value, right_shift_amount}
// to help generate the LXVKQ instruction and the subsequent shift instruction
// required to match the original build vector pattern.
16301
16302// LXVKQPattern: {LXVKQ unsigned IMM Value, right_shift_amount}
16303using LXVKQPattern = std::tuple<uint32_t, uint8_t>;
16304
16305static std::optional<LXVKQPattern> getPatternInfo(const APInt &FullVal) {
16306
16307 // LXVKQ instruction loads the Quadword value:
16308 // 0x8000_0000_0000_0000_0000_0000_0000_0000 when imm = 0b10000
16309 static const APInt BasePattern = APInt(128, 0x8000000000000000ULL) << 64;
16310 static const uint32_t Uim = 16;
16311
16312 // Check for direct LXVKQ match (no shift needed)
16313 if (FullVal == BasePattern)
16314 return std::make_tuple(args: Uim, args: uint8_t{0});
16315
16316 // Check if FullValue is 1 (the result of the base pattern >> 127)
16317 if (FullVal == APInt(128, 1))
16318 return std::make_tuple(args: Uim, args: uint8_t{127});
16319
16320 return std::nullopt;
16321}
16322
/// Combine vector loads to a single load (using lxvkq) or splat with shift of a
/// constant (xxspltib + vsrq) by recognising patterns in the Build Vector.
/// The LXVKQ instruction loads a VSX vector with a special quadword value based
/// on an immediate value. If UIM=0b10000 then LXVKQ loads VSR[32×TX+T] with the
/// value 0x8000_0000_0000_0000_0000_0000_0000_0000.
/// This can be used to inline the build vector constants that have the
/// following patterns:
///
/// 0x8000_0000_0000_0000_0000_0000_0000_0000 (MSB set pattern)
/// 0x0000_0000_0000_0000_0000_0000_0000_0001 (LSB set pattern)
/// The MSB pattern can be loaded directly using LXVKQ, while the LSB pattern is
/// loaded using a combination of splatting and right shift instructions.
16335
16336SDValue PPCTargetLowering::combineBVLoadsSpecialValue(SDValue Op,
16337 SelectionDAG &DAG) const {
16338
16339 assert((Op.getNode() && Op.getOpcode() == ISD::BUILD_VECTOR) &&
16340 "Expected a BuildVectorSDNode in combineBVLoadsSpecialValue");
16341
16342 // This transformation is only supported if we are loading either a byte,
16343 // halfword, word, or doubleword.
16344 EVT VT = Op.getValueType();
16345 if (!(VT == MVT::v8i16 || VT == MVT::v16i8 || VT == MVT::v4i32 ||
16346 VT == MVT::v2i64))
16347 return SDValue();
16348
16349 LLVM_DEBUG(llvm::dbgs() << "\ncombineBVLoadsSpecialValue: Build vector ("
16350 << VT.getEVTString() << "): ";
16351 Op->dump());
16352
16353 unsigned NumElems = VT.getVectorNumElements();
16354 unsigned ElemBits = VT.getScalarSizeInBits();
16355
16356 bool IsLittleEndian = DAG.getDataLayout().isLittleEndian();
16357
16358 // Check for Non-constant operand in the build vector.
16359 for (const SDValue &Operand : Op.getNode()->op_values()) {
16360 if (!isa<ConstantSDNode>(Val: Operand))
16361 return SDValue();
16362 }
16363
16364 // Assemble build vector operands as a 128-bit register value
16365 // We need to reconstruct what the 128-bit register pattern would be
16366 // that produces this vector when interpreted with the current endianness
16367 APInt FullVal = APInt::getZero(numBits: 128);
16368
16369 for (unsigned Index = 0; Index < NumElems; ++Index) {
16370 auto *C = cast<ConstantSDNode>(Val: Op.getOperand(i: Index));
16371
16372 // Get element value as raw bits (zero-extended)
16373 uint64_t ElemValue = C->getZExtValue();
16374
16375 // Mask to element size to ensure we only get the relevant bits
16376 if (ElemBits < 64)
16377 ElemValue &= ((1ULL << ElemBits) - 1);
16378
16379 // Calculate bit position for this element in the 128-bit register
16380 unsigned BitPos =
16381 (IsLittleEndian) ? (Index * ElemBits) : (128 - (Index + 1) * ElemBits);
16382
16383 // Create APInt for the element value and shift it to correct position
16384 APInt ElemAPInt(128, ElemValue);
16385 ElemAPInt <<= BitPos;
16386
16387 // Place the element value at the correct bit position
16388 FullVal |= ElemAPInt;
16389 }
16390
16391 if (FullVal.isZero() || FullVal.isAllOnes())
16392 return SDValue();
16393
16394 if (auto UIMOpt = getPatternInfo(FullVal)) {
16395 const auto &[Uim, ShiftAmount] = *UIMOpt;
16396 SDLoc Dl(Op);
16397
16398 // Generate LXVKQ instruction if the shift amount is zero.
16399 if (ShiftAmount == 0) {
16400 SDValue UimVal = DAG.getTargetConstant(Val: Uim, DL: Dl, VT: MVT::i32);
16401 SDValue LxvkqInstr =
16402 SDValue(DAG.getMachineNode(Opcode: PPC::LXVKQ, dl: Dl, VT, Op1: UimVal), 0);
16403 LLVM_DEBUG(llvm::dbgs()
16404 << "combineBVLoadsSpecialValue: Instruction Emitted ";
16405 LxvkqInstr.dump());
16406 return LxvkqInstr;
16407 }
16408
16409 assert(ShiftAmount == 127 && "Unexpected lxvkq shift amount value");
16410
16411 // The right shifted pattern can be constructed using a combination of
16412 // XXSPLTIB and VSRQ instruction. VSRQ uses the shift amount from the lower
16413 // 7 bits of byte 15. This can be specified using XXSPLTIB with immediate
16414 // value 255.
16415 SDValue ShiftAmountVec =
16416 SDValue(DAG.getMachineNode(Opcode: PPC::XXSPLTIB, dl: Dl, VT: MVT::v4i32,
16417 Op1: DAG.getTargetConstant(Val: 255, DL: Dl, VT: MVT::i32)),
16418 0);
16419 // Generate appropriate right shift instruction
16420 SDValue ShiftVec = SDValue(
16421 DAG.getMachineNode(Opcode: PPC::VSRQ, dl: Dl, VT, Op1: ShiftAmountVec, Op2: ShiftAmountVec),
16422 0);
16423 LLVM_DEBUG(llvm::dbgs()
16424 << "\n combineBVLoadsSpecialValue: Instruction Emitted ";
16425 ShiftVec.dump());
16426 return ShiftVec;
16427 }
16428 // No patterns matched for build vectors.
16429 return SDValue();
16430}
16431
16432/// Reduce the number of loads when building a vector.
16433///
16434/// Building a vector out of multiple loads can be converted to a load
16435/// of the vector type if the loads are consecutive. If the loads are
16436/// consecutive but in descending order, a shuffle is added at the end
16437/// to reorder the vector.
16438static SDValue combineBVOfConsecutiveLoads(SDNode *N, SelectionDAG &DAG) {
16439 assert(N->getOpcode() == ISD::BUILD_VECTOR &&
16440 "Should be called with a BUILD_VECTOR node");
16441
16442 SDLoc dl(N);
16443
16444 // Return early for non byte-sized type, as they can't be consecutive.
16445 if (!N->getValueType(ResNo: 0).getVectorElementType().isByteSized())
16446 return SDValue();
16447
16448 bool InputsAreConsecutiveLoads = true;
16449 bool InputsAreReverseConsecutive = true;
16450 unsigned ElemSize = N->getValueType(ResNo: 0).getScalarType().getStoreSize();
16451 SDValue FirstInput = N->getOperand(Num: 0);
16452 bool IsRoundOfExtLoad = false;
16453 LoadSDNode *FirstLoad = nullptr;
16454
16455 if (FirstInput.getOpcode() == ISD::FP_ROUND &&
16456 FirstInput.getOperand(i: 0).getOpcode() == ISD::LOAD) {
16457 FirstLoad = cast<LoadSDNode>(Val: FirstInput.getOperand(i: 0));
16458 IsRoundOfExtLoad = FirstLoad->getExtensionType() == ISD::EXTLOAD;
16459 }
16460 // Not a build vector of (possibly fp_rounded) loads.
16461 if ((!IsRoundOfExtLoad && FirstInput.getOpcode() != ISD::LOAD) ||
16462 N->getNumOperands() == 1)
16463 return SDValue();
16464
16465 if (!IsRoundOfExtLoad)
16466 FirstLoad = cast<LoadSDNode>(Val&: FirstInput);
16467
16468 SmallVector<LoadSDNode *, 4> InputLoads;
16469 InputLoads.push_back(Elt: FirstLoad);
16470 for (int i = 1, e = N->getNumOperands(); i < e; ++i) {
16471 // If any inputs are fp_round(extload), they all must be.
16472 if (IsRoundOfExtLoad && N->getOperand(Num: i).getOpcode() != ISD::FP_ROUND)
16473 return SDValue();
16474
16475 SDValue NextInput = IsRoundOfExtLoad ? N->getOperand(Num: i).getOperand(i: 0) :
16476 N->getOperand(Num: i);
16477 if (NextInput.getOpcode() != ISD::LOAD)
16478 return SDValue();
16479
16480 SDValue PreviousInput =
16481 IsRoundOfExtLoad ? N->getOperand(Num: i-1).getOperand(i: 0) : N->getOperand(Num: i-1);
16482 LoadSDNode *LD1 = cast<LoadSDNode>(Val&: PreviousInput);
16483 LoadSDNode *LD2 = cast<LoadSDNode>(Val&: NextInput);
16484
16485 // If any inputs are fp_round(extload), they all must be.
16486 if (IsRoundOfExtLoad && LD2->getExtensionType() != ISD::EXTLOAD)
16487 return SDValue();
16488
16489 // We only care about regular loads. The PPC-specific load intrinsics
16490 // will not lead to a merge opportunity.
16491 if (!DAG.areNonVolatileConsecutiveLoads(LD: LD2, Base: LD1, Bytes: ElemSize, Dist: 1))
16492 InputsAreConsecutiveLoads = false;
16493 if (!DAG.areNonVolatileConsecutiveLoads(LD: LD1, Base: LD2, Bytes: ElemSize, Dist: 1))
16494 InputsAreReverseConsecutive = false;
16495
16496 // Exit early if the loads are neither consecutive nor reverse consecutive.
16497 if (!InputsAreConsecutiveLoads && !InputsAreReverseConsecutive)
16498 return SDValue();
16499 InputLoads.push_back(Elt: LD2);
16500 }
16501
16502 assert(!(InputsAreConsecutiveLoads && InputsAreReverseConsecutive) &&
16503 "The loads cannot be both consecutive and reverse consecutive.");
16504
16505 SDValue WideLoad;
16506 SDValue ReturnSDVal;
16507 if (InputsAreConsecutiveLoads) {
16508 assert(FirstLoad && "Input needs to be a LoadSDNode.");
16509 WideLoad = DAG.getLoad(VT: N->getValueType(ResNo: 0), dl, Chain: FirstLoad->getChain(),
16510 Ptr: FirstLoad->getBasePtr(), PtrInfo: FirstLoad->getPointerInfo(),
16511 Alignment: FirstLoad->getAlign());
16512 ReturnSDVal = WideLoad;
16513 } else if (InputsAreReverseConsecutive) {
16514 LoadSDNode *LastLoad = InputLoads.back();
16515 assert(LastLoad && "Input needs to be a LoadSDNode.");
16516 WideLoad = DAG.getLoad(VT: N->getValueType(ResNo: 0), dl, Chain: LastLoad->getChain(),
16517 Ptr: LastLoad->getBasePtr(), PtrInfo: LastLoad->getPointerInfo(),
16518 Alignment: LastLoad->getAlign());
16519 SmallVector<int, 16> Ops;
16520 for (int i = N->getNumOperands() - 1; i >= 0; i--)
16521 Ops.push_back(Elt: i);
16522
16523 ReturnSDVal = DAG.getVectorShuffle(VT: N->getValueType(ResNo: 0), dl, N1: WideLoad,
16524 N2: DAG.getUNDEF(VT: N->getValueType(ResNo: 0)), Mask: Ops);
16525 } else
16526 return SDValue();
16527
16528 for (auto *LD : InputLoads)
16529 DAG.makeEquivalentMemoryOrdering(OldLoad: LD, NewMemOp: WideLoad);
16530 return ReturnSDVal;
16531}
16532
16533// This function adds the required vector_shuffle needed to get
16534// the elements of the vector extract in the correct position
16535// as specified by the CorrectElems encoding.
16536static SDValue addShuffleForVecExtend(SDNode *N, SelectionDAG &DAG,
16537 SDValue Input, uint64_t Elems,
16538 uint64_t CorrectElems) {
16539 SDLoc dl(N);
16540
16541 unsigned NumElems = Input.getValueType().getVectorNumElements();
16542 SmallVector<int, 16> ShuffleMask(NumElems, -1);
16543
16544 // Knowing the element indices being extracted from the original
16545 // vector and the order in which they're being inserted, just put
16546 // them at element indices required for the instruction.
16547 for (unsigned i = 0; i < N->getNumOperands(); i++) {
16548 if (DAG.getDataLayout().isLittleEndian())
16549 ShuffleMask[CorrectElems & 0xF] = Elems & 0xF;
16550 else
16551 ShuffleMask[(CorrectElems & 0xF0) >> 4] = (Elems & 0xF0) >> 4;
16552 CorrectElems = CorrectElems >> 8;
16553 Elems = Elems >> 8;
16554 }
16555
16556 SDValue Shuffle =
16557 DAG.getVectorShuffle(VT: Input.getValueType(), dl, N1: Input,
16558 N2: DAG.getUNDEF(VT: Input.getValueType()), Mask: ShuffleMask);
16559
16560 EVT VT = N->getValueType(ResNo: 0);
16561 SDValue Conv = DAG.getBitcast(VT, V: Shuffle);
16562
16563 EVT ExtVT = EVT::getVectorVT(Context&: *DAG.getContext(),
16564 VT: Input.getValueType().getVectorElementType(),
16565 NumElements: VT.getVectorNumElements());
16566 return DAG.getNode(Opcode: ISD::SIGN_EXTEND_INREG, DL: dl, VT, N1: Conv,
16567 N2: DAG.getValueType(ExtVT));
16568}
16569
16570// Look for build vector patterns where input operands come from sign
16571// extended vector_extract elements of specific indices. If the correct indices
16572// aren't used, add a vector shuffle to fix up the indices and create
16573// SIGN_EXTEND_INREG node which selects the vector sign extend instructions
16574// during instruction selection.
16575static SDValue combineBVOfVecSExt(SDNode *N, SelectionDAG &DAG) {
16576 // This array encodes the indices that the vector sign extend instructions
16577 // extract from when extending from one type to another for both BE and LE.
16578 // The right nibble of each byte corresponds to the LE incides.
16579 // and the left nibble of each byte corresponds to the BE incides.
16580 // For example: 0x3074B8FC byte->word
16581 // For LE: the allowed indices are: 0x0,0x4,0x8,0xC
16582 // For BE: the allowed indices are: 0x3,0x7,0xB,0xF
16583 // For example: 0x000070F8 byte->double word
16584 // For LE: the allowed indices are: 0x0,0x8
16585 // For BE: the allowed indices are: 0x7,0xF
16586 uint64_t TargetElems[] = {
16587 0x3074B8FC, // b->w
16588 0x000070F8, // b->d
16589 0x10325476, // h->w
16590 0x00003074, // h->d
16591 0x00001032, // w->d
16592 };
16593
16594 uint64_t Elems = 0;
16595 int Index;
16596 SDValue Input;
16597
16598 auto isSExtOfVecExtract = [&](SDValue Op) -> bool {
16599 if (!Op)
16600 return false;
16601 if (Op.getOpcode() != ISD::SIGN_EXTEND &&
16602 Op.getOpcode() != ISD::SIGN_EXTEND_INREG)
16603 return false;
16604
16605 // A SIGN_EXTEND_INREG might be fed by an ANY_EXTEND to produce a value
16606 // of the right width.
16607 SDValue Extract = Op.getOperand(i: 0);
16608 if (Extract.getOpcode() == ISD::ANY_EXTEND)
16609 Extract = Extract.getOperand(i: 0);
16610 if (Extract.getOpcode() != ISD::EXTRACT_VECTOR_ELT)
16611 return false;
16612
16613 ConstantSDNode *ExtOp = dyn_cast<ConstantSDNode>(Val: Extract.getOperand(i: 1));
16614 if (!ExtOp)
16615 return false;
16616
16617 Index = ExtOp->getZExtValue();
16618 if (Input && Input != Extract.getOperand(i: 0))
16619 return false;
16620
16621 if (!Input)
16622 Input = Extract.getOperand(i: 0);
16623
16624 Elems = Elems << 8;
16625 Index = DAG.getDataLayout().isLittleEndian() ? Index : Index << 4;
16626 Elems |= Index;
16627
16628 return true;
16629 };
16630
16631 // If the build vector operands aren't sign extended vector extracts,
16632 // of the same input vector, then return.
16633 for (unsigned i = 0; i < N->getNumOperands(); i++) {
16634 if (!isSExtOfVecExtract(N->getOperand(Num: i))) {
16635 return SDValue();
16636 }
16637 }
16638
16639 // If the vector extract indices are not correct, add the appropriate
16640 // vector_shuffle.
16641 int TgtElemArrayIdx;
16642 int InputSize = Input.getValueType().getScalarSizeInBits();
16643 int OutputSize = N->getValueType(ResNo: 0).getScalarSizeInBits();
16644 if (InputSize + OutputSize == 40)
16645 TgtElemArrayIdx = 0;
16646 else if (InputSize + OutputSize == 72)
16647 TgtElemArrayIdx = 1;
16648 else if (InputSize + OutputSize == 48)
16649 TgtElemArrayIdx = 2;
16650 else if (InputSize + OutputSize == 80)
16651 TgtElemArrayIdx = 3;
16652 else if (InputSize + OutputSize == 96)
16653 TgtElemArrayIdx = 4;
16654 else
16655 return SDValue();
16656
16657 uint64_t CorrectElems = TargetElems[TgtElemArrayIdx];
16658 CorrectElems = DAG.getDataLayout().isLittleEndian()
16659 ? CorrectElems & 0x0F0F0F0F0F0F0F0F
16660 : CorrectElems & 0xF0F0F0F0F0F0F0F0;
16661 if (Elems != CorrectElems) {
16662 return addShuffleForVecExtend(N, DAG, Input, Elems, CorrectElems);
16663 }
16664
16665 // Regular lowering will catch cases where a shuffle is not needed.
16666 return SDValue();
16667}
16668
16669// Look for the pattern of a load from a narrow width to i128, feeding
16670// into a BUILD_VECTOR of v1i128. Replace this sequence with a PPCISD node
16671// (LXVRZX). This node represents a zero extending load that will be matched
16672// to the Load VSX Vector Rightmost instructions.
16673static SDValue combineBVZEXTLOAD(SDNode *N, SelectionDAG &DAG) {
16674 SDLoc DL(N);
16675
16676 // This combine is only eligible for a BUILD_VECTOR of v1i128.
16677 if (N->getValueType(ResNo: 0) != MVT::v1i128)
16678 return SDValue();
16679
16680 SDValue Operand = N->getOperand(Num: 0);
16681 // Proceed with the transformation if the operand to the BUILD_VECTOR
16682 // is a load instruction.
16683 if (Operand.getOpcode() != ISD::LOAD)
16684 return SDValue();
16685
16686 auto *LD = cast<LoadSDNode>(Val&: Operand);
16687 EVT MemoryType = LD->getMemoryVT();
16688
16689 // This transformation is only valid if the we are loading either a byte,
16690 // halfword, word, or doubleword.
16691 bool ValidLDType = MemoryType == MVT::i8 || MemoryType == MVT::i16 ||
16692 MemoryType == MVT::i32 || MemoryType == MVT::i64;
16693
16694 // Ensure that the load from the narrow width is being zero extended to i128.
16695 if (!ValidLDType ||
16696 (LD->getExtensionType() != ISD::ZEXTLOAD &&
16697 LD->getExtensionType() != ISD::EXTLOAD))
16698 return SDValue();
16699
16700 SDValue LoadOps[] = {
16701 LD->getChain(), LD->getBasePtr(),
16702 DAG.getIntPtrConstant(Val: MemoryType.getScalarSizeInBits(), DL)};
16703
16704 return DAG.getMemIntrinsicNode(Opcode: PPCISD::LXVRZX, dl: DL,
16705 VTList: DAG.getVTList(VT1: MVT::v1i128, VT2: MVT::Other),
16706 Ops: LoadOps, MemVT: MemoryType, MMO: LD->getMemOperand());
16707}
16708
16709SDValue PPCTargetLowering::DAGCombineBuildVector(SDNode *N,
16710 DAGCombinerInfo &DCI) const {
16711 assert(N->getOpcode() == ISD::BUILD_VECTOR &&
16712 "Should be called with a BUILD_VECTOR node");
16713
16714 SelectionDAG &DAG = DCI.DAG;
16715 SDLoc dl(N);
16716
16717 if (!Subtarget.hasVSX())
16718 return SDValue();
16719
16720 // The target independent DAG combiner will leave a build_vector of
16721 // float-to-int conversions intact. We can generate MUCH better code for
16722 // a float-to-int conversion of a vector of floats.
16723 SDValue FirstInput = N->getOperand(Num: 0);
16724 if (FirstInput.getOpcode() == PPCISD::MFVSR) {
16725 SDValue Reduced = combineElementTruncationToVectorTruncation(N, DCI);
16726 if (Reduced)
16727 return Reduced;
16728 }
16729
16730 // If we're building a vector out of consecutive loads, just load that
16731 // vector type.
16732 SDValue Reduced = combineBVOfConsecutiveLoads(N, DAG);
16733 if (Reduced)
16734 return Reduced;
16735
16736 // If we're building a vector out of extended elements from another vector
16737 // we have P9 vector integer extend instructions. The code assumes legal
16738 // input types (i.e. it can't handle things like v4i16) so do not run before
16739 // legalization.
16740 if (Subtarget.hasP9Altivec() && !DCI.isBeforeLegalize()) {
16741 Reduced = combineBVOfVecSExt(N, DAG);
16742 if (Reduced)
16743 return Reduced;
16744 }
16745
16746 // On Power10, the Load VSX Vector Rightmost instructions can be utilized
16747 // if this is a BUILD_VECTOR of v1i128, and if the operand to the BUILD_VECTOR
16748 // is a load from <valid narrow width> to i128.
16749 if (Subtarget.isISA3_1()) {
16750 SDValue BVOfZLoad = combineBVZEXTLOAD(N, DAG);
16751 if (BVOfZLoad)
16752 return BVOfZLoad;
16753 }
16754
16755 if (N->getValueType(ResNo: 0) != MVT::v2f64)
16756 return SDValue();
16757
16758 // Looking for:
16759 // (build_vector ([su]int_to_fp (extractelt 0)), [su]int_to_fp (extractelt 1))
16760 if (FirstInput.getOpcode() != ISD::SINT_TO_FP &&
16761 FirstInput.getOpcode() != ISD::UINT_TO_FP)
16762 return SDValue();
16763 if (N->getOperand(Num: 1).getOpcode() != ISD::SINT_TO_FP &&
16764 N->getOperand(Num: 1).getOpcode() != ISD::UINT_TO_FP)
16765 return SDValue();
16766 if (FirstInput.getOpcode() != N->getOperand(Num: 1).getOpcode())
16767 return SDValue();
16768
16769 SDValue Ext1 = FirstInput.getOperand(i: 0);
16770 SDValue Ext2 = N->getOperand(Num: 1).getOperand(i: 0);
16771 if(Ext1.getOpcode() != ISD::EXTRACT_VECTOR_ELT ||
16772 Ext2.getOpcode() != ISD::EXTRACT_VECTOR_ELT)
16773 return SDValue();
16774
16775 ConstantSDNode *Ext1Op = dyn_cast<ConstantSDNode>(Val: Ext1.getOperand(i: 1));
16776 ConstantSDNode *Ext2Op = dyn_cast<ConstantSDNode>(Val: Ext2.getOperand(i: 1));
16777 if (!Ext1Op || !Ext2Op)
16778 return SDValue();
16779 if (Ext1.getOperand(i: 0).getValueType() != MVT::v4i32 ||
16780 Ext1.getOperand(i: 0) != Ext2.getOperand(i: 0))
16781 return SDValue();
16782
16783 int FirstElem = Ext1Op->getZExtValue();
16784 int SecondElem = Ext2Op->getZExtValue();
16785 int SubvecIdx;
16786 if (FirstElem == 0 && SecondElem == 1)
16787 SubvecIdx = Subtarget.isLittleEndian() ? 1 : 0;
16788 else if (FirstElem == 2 && SecondElem == 3)
16789 SubvecIdx = Subtarget.isLittleEndian() ? 0 : 1;
16790 else
16791 return SDValue();
16792
16793 SDValue SrcVec = Ext1.getOperand(i: 0);
16794 auto NodeType = (N->getOperand(Num: 1).getOpcode() == ISD::SINT_TO_FP) ?
16795 PPCISD::SINT_VEC_TO_FP : PPCISD::UINT_VEC_TO_FP;
16796 return DAG.getNode(Opcode: NodeType, DL: dl, VT: MVT::v2f64,
16797 N1: SrcVec, N2: DAG.getIntPtrConstant(Val: SubvecIdx, DL: dl));
16798}
16799
16800SDValue PPCTargetLowering::combineFPToIntToFP(SDNode *N,
16801 DAGCombinerInfo &DCI) const {
16802 assert((N->getOpcode() == ISD::SINT_TO_FP ||
16803 N->getOpcode() == ISD::UINT_TO_FP) &&
16804 "Need an int -> FP conversion node here");
16805
16806 if (useSoftFloat() || !Subtarget.has64BitSupport())
16807 return SDValue();
16808
16809 SelectionDAG &DAG = DCI.DAG;
16810 SDLoc dl(N);
16811 SDValue Op(N, 0);
16812
16813 // Don't handle ppc_fp128 here or conversions that are out-of-range capable
16814 // from the hardware.
16815 if (Op.getValueType() != MVT::f32 && Op.getValueType() != MVT::f64)
16816 return SDValue();
16817 if (!Op.getOperand(i: 0).getValueType().isSimple())
16818 return SDValue();
16819 if (Op.getOperand(i: 0).getValueType().getSimpleVT() <= MVT(MVT::i1) ||
16820 Op.getOperand(i: 0).getValueType().getSimpleVT() > MVT(MVT::i64))
16821 return SDValue();
16822
16823 SDValue FirstOperand(Op.getOperand(i: 0));
16824 bool SubWordLoad = FirstOperand.getOpcode() == ISD::LOAD &&
16825 (FirstOperand.getValueType() == MVT::i8 ||
16826 FirstOperand.getValueType() == MVT::i16);
16827 if (Subtarget.hasP9Vector() && Subtarget.hasP9Altivec() && SubWordLoad) {
16828 bool Signed = N->getOpcode() == ISD::SINT_TO_FP;
16829 bool DstDouble = Op.getValueType() == MVT::f64;
16830 unsigned ConvOp = Signed ?
16831 (DstDouble ? PPCISD::FCFID : PPCISD::FCFIDS) :
16832 (DstDouble ? PPCISD::FCFIDU : PPCISD::FCFIDUS);
16833 SDValue WidthConst =
16834 DAG.getIntPtrConstant(Val: FirstOperand.getValueType() == MVT::i8 ? 1 : 2,
16835 DL: dl, isTarget: false);
16836 LoadSDNode *LDN = cast<LoadSDNode>(Val: FirstOperand.getNode());
16837 SDValue Ops[] = { LDN->getChain(), LDN->getBasePtr(), WidthConst };
16838 SDValue Ld = DAG.getMemIntrinsicNode(Opcode: PPCISD::LXSIZX, dl,
16839 VTList: DAG.getVTList(VT1: MVT::f64, VT2: MVT::Other),
16840 Ops, MemVT: MVT::i8, MMO: LDN->getMemOperand());
16841 DAG.makeEquivalentMemoryOrdering(OldLoad: LDN, NewMemOp: Ld);
16842
16843 // For signed conversion, we need to sign-extend the value in the VSR
16844 if (Signed) {
16845 SDValue ExtOps[] = { Ld, WidthConst };
16846 SDValue Ext = DAG.getNode(Opcode: PPCISD::VEXTS, DL: dl, VT: MVT::f64, Ops: ExtOps);
16847 return DAG.getNode(Opcode: ConvOp, DL: dl, VT: DstDouble ? MVT::f64 : MVT::f32, Operand: Ext);
16848 } else
16849 return DAG.getNode(Opcode: ConvOp, DL: dl, VT: DstDouble ? MVT::f64 : MVT::f32, Operand: Ld);
16850 }
16851
16852
16853 // For i32 intermediate values, unfortunately, the conversion functions
16854 // leave the upper 32 bits of the value are undefined. Within the set of
16855 // scalar instructions, we have no method for zero- or sign-extending the
16856 // value. Thus, we cannot handle i32 intermediate values here.
16857 if (Op.getOperand(i: 0).getValueType() == MVT::i32)
16858 return SDValue();
16859
16860 assert((Op.getOpcode() == ISD::SINT_TO_FP || Subtarget.hasFPCVT()) &&
16861 "UINT_TO_FP is supported only with FPCVT");
16862
16863 // If we have FCFIDS, then use it when converting to single-precision.
16864 // Otherwise, convert to double-precision and then round.
16865 unsigned FCFOp = (Subtarget.hasFPCVT() && Op.getValueType() == MVT::f32)
16866 ? (Op.getOpcode() == ISD::UINT_TO_FP ? PPCISD::FCFIDUS
16867 : PPCISD::FCFIDS)
16868 : (Op.getOpcode() == ISD::UINT_TO_FP ? PPCISD::FCFIDU
16869 : PPCISD::FCFID);
16870 MVT FCFTy = (Subtarget.hasFPCVT() && Op.getValueType() == MVT::f32)
16871 ? MVT::f32
16872 : MVT::f64;
16873
16874 // If we're converting from a float, to an int, and back to a float again,
16875 // then we don't need the store/load pair at all.
16876 if ((Op.getOperand(i: 0).getOpcode() == ISD::FP_TO_UINT &&
16877 Subtarget.hasFPCVT()) ||
16878 (Op.getOperand(i: 0).getOpcode() == ISD::FP_TO_SINT)) {
16879 SDValue Src = Op.getOperand(i: 0).getOperand(i: 0);
16880 if (Src.getValueType() == MVT::f32) {
16881 Src = DAG.getNode(Opcode: ISD::FP_EXTEND, DL: dl, VT: MVT::f64, Operand: Src);
16882 DCI.AddToWorklist(N: Src.getNode());
16883 } else if (Src.getValueType() != MVT::f64) {
16884 // Make sure that we don't pick up a ppc_fp128 source value.
16885 return SDValue();
16886 }
16887
16888 unsigned FCTOp =
16889 Op.getOperand(i: 0).getOpcode() == ISD::FP_TO_SINT ? PPCISD::FCTIDZ :
16890 PPCISD::FCTIDUZ;
16891
16892 SDValue Tmp = DAG.getNode(Opcode: FCTOp, DL: dl, VT: MVT::f64, Operand: Src);
16893 SDValue FP = DAG.getNode(Opcode: FCFOp, DL: dl, VT: FCFTy, Operand: Tmp);
16894
16895 if (Op.getValueType() == MVT::f32 && !Subtarget.hasFPCVT()) {
16896 FP = DAG.getNode(Opcode: ISD::FP_ROUND, DL: dl, VT: MVT::f32, N1: FP,
16897 N2: DAG.getIntPtrConstant(Val: 0, DL: dl, /*isTarget=*/true));
16898 DCI.AddToWorklist(N: FP.getNode());
16899 }
16900
16901 return FP;
16902 }
16903
16904 return SDValue();
16905}
16906
16907// expandVSXLoadForLE - Convert VSX loads (which may be intrinsics for
16908// builtins) into loads with swaps.
16909SDValue PPCTargetLowering::expandVSXLoadForLE(SDNode *N,
16910 DAGCombinerInfo &DCI) const {
16911 // Delay VSX load for LE combine until after LegalizeOps to prioritize other
16912 // load combines.
16913 if (DCI.isBeforeLegalizeOps())
16914 return SDValue();
16915
16916 SelectionDAG &DAG = DCI.DAG;
16917 SDLoc dl(N);
16918 SDValue Chain;
16919 SDValue Base;
16920 MachineMemOperand *MMO;
16921
16922 switch (N->getOpcode()) {
16923 default:
16924 llvm_unreachable("Unexpected opcode for little endian VSX load");
16925 case ISD::LOAD: {
16926 LoadSDNode *LD = cast<LoadSDNode>(Val: N);
16927 Chain = LD->getChain();
16928 Base = LD->getBasePtr();
16929 MMO = LD->getMemOperand();
16930 // If the MMO suggests this isn't a load of a full vector, leave
16931 // things alone. For a built-in, we have to make the change for
16932 // correctness, so if there is a size problem that will be a bug.
16933 if (!MMO->getSize().hasValue() || MMO->getSize().getValue() < 16)
16934 return SDValue();
16935 break;
16936 }
16937 case ISD::INTRINSIC_W_CHAIN: {
16938 MemIntrinsicSDNode *Intrin = cast<MemIntrinsicSDNode>(Val: N);
16939 Chain = Intrin->getChain();
16940 // Similarly to the store case below, Intrin->getBasePtr() doesn't get
16941 // us what we want. Get operand 2 instead.
16942 Base = Intrin->getOperand(Num: 2);
16943 MMO = Intrin->getMemOperand();
16944 break;
16945 }
16946 }
16947
16948 MVT VecTy = N->getValueType(ResNo: 0).getSimpleVT();
16949
16950 SDValue LoadOps[] = { Chain, Base };
16951 SDValue Load = DAG.getMemIntrinsicNode(Opcode: PPCISD::LXVD2X, dl,
16952 VTList: DAG.getVTList(VT1: MVT::v2f64, VT2: MVT::Other),
16953 Ops: LoadOps, MemVT: MVT::v2f64, MMO);
16954
16955 DCI.AddToWorklist(N: Load.getNode());
16956 Chain = Load.getValue(R: 1);
16957 SDValue Swap = DAG.getNode(
16958 Opcode: PPCISD::XXSWAPD, DL: dl, VTList: DAG.getVTList(VT1: MVT::v2f64, VT2: MVT::Other), N1: Chain, N2: Load);
16959 DCI.AddToWorklist(N: Swap.getNode());
16960
16961 // Add a bitcast if the resulting load type doesn't match v2f64.
16962 if (VecTy != MVT::v2f64) {
16963 SDValue N = DAG.getNode(Opcode: ISD::BITCAST, DL: dl, VT: VecTy, Operand: Swap);
16964 DCI.AddToWorklist(N: N.getNode());
16965 // Package {bitcast value, swap's chain} to match Load's shape.
16966 return DAG.getNode(Opcode: ISD::MERGE_VALUES, DL: dl, VTList: DAG.getVTList(VT1: VecTy, VT2: MVT::Other),
16967 N1: N, N2: Swap.getValue(R: 1));
16968 }
16969
16970 return Swap;
16971}
16972
16973// expandVSXStoreForLE - Convert VSX stores (which may be intrinsics for
16974// builtins) into stores with swaps.
16975SDValue PPCTargetLowering::expandVSXStoreForLE(SDNode *N,
16976 DAGCombinerInfo &DCI) const {
16977 // Delay VSX store for LE combine until after LegalizeOps to prioritize other
16978 // store combines.
16979 if (DCI.isBeforeLegalizeOps())
16980 return SDValue();
16981
16982 SelectionDAG &DAG = DCI.DAG;
16983 SDLoc dl(N);
16984 SDValue Chain;
16985 SDValue Base;
16986 unsigned SrcOpnd;
16987 MachineMemOperand *MMO;
16988
16989 switch (N->getOpcode()) {
16990 default:
16991 llvm_unreachable("Unexpected opcode for little endian VSX store");
16992 case ISD::STORE: {
16993 StoreSDNode *ST = cast<StoreSDNode>(Val: N);
16994 Chain = ST->getChain();
16995 Base = ST->getBasePtr();
16996 MMO = ST->getMemOperand();
16997 SrcOpnd = 1;
16998 // If the MMO suggests this isn't a store of a full vector, leave
16999 // things alone. For a built-in, we have to make the change for
17000 // correctness, so if there is a size problem that will be a bug.
17001 if (!MMO->getSize().hasValue() || MMO->getSize().getValue() < 16)
17002 return SDValue();
17003 break;
17004 }
17005 case ISD::INTRINSIC_VOID: {
17006 MemIntrinsicSDNode *Intrin = cast<MemIntrinsicSDNode>(Val: N);
17007 Chain = Intrin->getChain();
17008 // Intrin->getBasePtr() oddly does not get what we want.
17009 Base = Intrin->getOperand(Num: 3);
17010 MMO = Intrin->getMemOperand();
17011 SrcOpnd = 2;
17012 break;
17013 }
17014 }
17015
17016 SDValue Src = N->getOperand(Num: SrcOpnd);
17017 MVT VecTy = Src.getValueType().getSimpleVT();
17018
17019 // All stores are done as v2f64 and possible bit cast.
17020 if (VecTy != MVT::v2f64) {
17021 Src = DAG.getNode(Opcode: ISD::BITCAST, DL: dl, VT: MVT::v2f64, Operand: Src);
17022 DCI.AddToWorklist(N: Src.getNode());
17023 }
17024
17025 SDValue Swap = DAG.getNode(Opcode: PPCISD::XXSWAPD, DL: dl,
17026 VTList: DAG.getVTList(VT1: MVT::v2f64, VT2: MVT::Other), N1: Chain, N2: Src);
17027 DCI.AddToWorklist(N: Swap.getNode());
17028 Chain = Swap.getValue(R: 1);
17029 SDValue StoreOps[] = { Chain, Swap, Base };
17030 SDValue Store = DAG.getMemIntrinsicNode(Opcode: PPCISD::STXVD2X, dl,
17031 VTList: DAG.getVTList(VT: MVT::Other),
17032 Ops: StoreOps, MemVT: VecTy, MMO);
17033 DCI.AddToWorklist(N: Store.getNode());
17034 return Store;
17035}
17036
17037// Handle DAG combine for STORE (FP_TO_INT F).
17038SDValue PPCTargetLowering::combineStoreFPToInt(SDNode *N,
17039 DAGCombinerInfo &DCI) const {
17040 SelectionDAG &DAG = DCI.DAG;
17041 SDLoc dl(N);
17042 unsigned Opcode = N->getOperand(Num: 1).getOpcode();
17043 (void)Opcode;
17044 bool Strict = N->getOperand(Num: 1)->isStrictFPOpcode();
17045
17046 assert((Opcode == ISD::FP_TO_SINT || Opcode == ISD::FP_TO_UINT ||
17047 Opcode == ISD::STRICT_FP_TO_SINT || Opcode == ISD::STRICT_FP_TO_UINT)
17048 && "Not a FP_TO_INT Instruction!");
17049
17050 SDValue Val = N->getOperand(Num: 1).getOperand(i: Strict ? 1 : 0);
17051 EVT Op1VT = N->getOperand(Num: 1).getValueType();
17052 EVT ResVT = Val.getValueType();
17053
17054 if (!Subtarget.hasVSX() || !Subtarget.hasFPCVT() || !isTypeLegal(VT: ResVT))
17055 return SDValue();
17056
17057 // Only perform combine for conversion to i64/i32 or power9 i16/i8.
17058 bool ValidTypeForStoreFltAsInt =
17059 (Op1VT == MVT::i32 || (Op1VT == MVT::i64 && Subtarget.isPPC64()) ||
17060 (Subtarget.hasP9Vector() && (Op1VT == MVT::i16 || Op1VT == MVT::i8)));
17061
17062 // TODO: Lower conversion from f128 on all VSX targets
17063 if (ResVT == MVT::ppcf128 || (ResVT == MVT::f128 && !Subtarget.hasP9Vector()))
17064 return SDValue();
17065
17066 if ((Op1VT != MVT::i64 && !Subtarget.hasP8Vector()) ||
17067 cast<StoreSDNode>(Val: N)->isTruncatingStore() || !ValidTypeForStoreFltAsInt)
17068 return SDValue();
17069
17070 Val = convertFPToInt(Op: N->getOperand(Num: 1), DAG, Subtarget);
17071
17072 // Set number of bytes being converted.
17073 unsigned ByteSize = Op1VT.getScalarSizeInBits() / 8;
17074 SDValue Ops[] = {N->getOperand(Num: 0), Val, N->getOperand(Num: 2),
17075 DAG.getIntPtrConstant(Val: ByteSize, DL: dl, isTarget: false),
17076 DAG.getValueType(Op1VT)};
17077
17078 Val = DAG.getMemIntrinsicNode(Opcode: PPCISD::ST_VSR_SCAL_INT, dl,
17079 VTList: DAG.getVTList(VT: MVT::Other), Ops,
17080 MemVT: cast<StoreSDNode>(Val: N)->getMemoryVT(),
17081 MMO: cast<StoreSDNode>(Val: N)->getMemOperand());
17082
17083 return Val;
17084}
17085
17086static bool isAlternatingShuffMask(const ArrayRef<int> &Mask, int NumElts) {
17087 // Check that the source of the element keeps flipping
17088 // (i.e. Mask[i] < NumElts -> Mask[i+i] >= NumElts).
17089 bool PrevElemFromFirstVec = Mask[0] < NumElts;
17090 for (int i = 1, e = Mask.size(); i < e; i++) {
17091 if (PrevElemFromFirstVec && Mask[i] < NumElts)
17092 return false;
17093 if (!PrevElemFromFirstVec && Mask[i] >= NumElts)
17094 return false;
17095 PrevElemFromFirstVec = !PrevElemFromFirstVec;
17096 }
17097 return true;
17098}
17099
17100static bool isSplatBV(SDValue Op) {
17101 if (Op.getOpcode() != ISD::BUILD_VECTOR)
17102 return false;
17103 SDValue FirstOp;
17104
17105 // Find first non-undef input.
17106 for (int i = 0, e = Op.getNumOperands(); i < e; i++) {
17107 FirstOp = Op.getOperand(i);
17108 if (!FirstOp.isUndef())
17109 break;
17110 }
17111
17112 // All inputs are undef or the same as the first non-undef input.
17113 for (int i = 1, e = Op.getNumOperands(); i < e; i++)
17114 if (Op.getOperand(i) != FirstOp && !Op.getOperand(i).isUndef())
17115 return false;
17116 return true;
17117}
17118
17119static SDValue isScalarToVec(SDValue Op) {
17120 if (Op.getOpcode() == ISD::SCALAR_TO_VECTOR)
17121 return Op;
17122 if (Op.getOpcode() != ISD::BITCAST)
17123 return SDValue();
17124 Op = Op.getOperand(i: 0);
17125 if (Op.getOpcode() == ISD::SCALAR_TO_VECTOR)
17126 return Op;
17127 return SDValue();
17128}
17129
17130// Fix up the shuffle mask to account for the fact that the result of
17131// scalar_to_vector is not in lane zero. This just takes all values in
17132// the ranges specified by the min/max indices and adds the number of
17133// elements required to ensure each element comes from the respective
17134// position in the valid lane.
17135// On little endian, that's just the corresponding element in the other
17136// half of the vector. On big endian, it is in the same half but right
17137// justified rather than left justified in that half.
17138static void fixupShuffleMaskForPermutedSToV(
17139 SmallVectorImpl<int> &ShuffV, int LHSFirstElt, int LHSLastElt,
17140 int RHSFirstElt, int RHSLastElt, int HalfVec, unsigned LHSNumValidElts,
17141 unsigned RHSNumValidElts, const PPCSubtarget &Subtarget) {
17142 int LHSEltFixup =
17143 Subtarget.isLittleEndian() ? HalfVec : HalfVec - LHSNumValidElts;
17144 int RHSEltFixup =
17145 Subtarget.isLittleEndian() ? HalfVec : HalfVec - RHSNumValidElts;
17146 for (int I = 0, E = ShuffV.size(); I < E; ++I) {
17147 int Idx = ShuffV[I];
17148 if (Idx >= LHSFirstElt && Idx <= LHSLastElt)
17149 ShuffV[I] += LHSEltFixup;
17150 else if (Idx >= RHSFirstElt && Idx <= RHSLastElt)
17151 ShuffV[I] += RHSEltFixup;
17152 }
17153}
17154
17155// Replace a SCALAR_TO_VECTOR with a SCALAR_TO_VECTOR_PERMUTED except if
17156// the original is:
17157// (<n x Ty> (scalar_to_vector (Ty (extract_elt <n x Ty> %a, C))))
17158// In such a case, just change the shuffle mask to extract the element
17159// from the permuted index.
17160static SDValue getSToVPermuted(SDValue OrigSToV, SelectionDAG &DAG,
17161 const PPCSubtarget &Subtarget) {
17162 SDLoc dl(OrigSToV);
17163 EVT VT = OrigSToV.getValueType();
17164 assert(OrigSToV.getOpcode() == ISD::SCALAR_TO_VECTOR &&
17165 "Expecting a SCALAR_TO_VECTOR here");
17166 SDValue Input = OrigSToV.getOperand(i: 0);
17167
17168 if (Input.getOpcode() == ISD::EXTRACT_VECTOR_ELT) {
17169 ConstantSDNode *Idx = dyn_cast<ConstantSDNode>(Val: Input.getOperand(i: 1));
17170 SDValue OrigVector = Input.getOperand(i: 0);
17171
17172 // Can't handle non-const element indices or different vector types
17173 // for the input to the extract and the output of the scalar_to_vector.
17174 if (Idx && VT == OrigVector.getValueType()) {
17175 unsigned NumElts = VT.getVectorNumElements();
17176 assert(
17177 NumElts > 1 &&
17178 "Cannot produce a permuted scalar_to_vector for one element vector");
17179 SmallVector<int, 16> NewMask(NumElts, -1);
17180 unsigned ResultInElt = NumElts / 2;
17181 ResultInElt -= Subtarget.isLittleEndian() ? 0 : 1;
17182 NewMask[ResultInElt] = Idx->getZExtValue();
17183 return DAG.getVectorShuffle(VT, dl, N1: OrigVector, N2: OrigVector, Mask: NewMask);
17184 }
17185 }
17186 return DAG.getNode(Opcode: PPCISD::SCALAR_TO_VECTOR_PERMUTED, DL: dl, VT,
17187 Operand: OrigSToV.getOperand(i: 0));
17188}
17189
17190static bool isShuffleMaskInRange(const SmallVectorImpl<int> &ShuffV,
17191 int HalfVec, int LHSLastElementDefined,
17192 int RHSLastElementDefined) {
17193 for (int Index : ShuffV) {
17194 if (Index < 0) // Skip explicitly undefined mask indices.
17195 continue;
17196 // Handle first input vector of the vector_shuffle.
17197 if ((LHSLastElementDefined >= 0) && (Index < HalfVec) &&
17198 (Index > LHSLastElementDefined))
17199 return false;
17200 // Handle second input vector of the vector_shuffle.
17201 if ((RHSLastElementDefined >= 0) &&
17202 (Index > HalfVec + RHSLastElementDefined))
17203 return false;
17204 }
17205 return true;
17206}
17207
17208static SDValue generateSToVPermutedForVecShuffle(
17209 int ScalarSize, uint64_t ShuffleEltWidth, unsigned &NumValidElts,
17210 int FirstElt, int &LastElt, SDValue VecShuffOperand, SDValue SToVNode,
17211 SelectionDAG &DAG, const PPCSubtarget &Subtarget) {
17212 EVT VecShuffOperandType = VecShuffOperand.getValueType();
17213 // Set up the values for the shuffle vector fixup.
17214 NumValidElts = ScalarSize / VecShuffOperandType.getScalarSizeInBits();
17215 // The last element depends on if the input comes from the LHS or RHS.
17216 //
17217 // For example:
17218 // (shuff (s_to_v i32), (bitcast (s_to_v i64), v4i32), ...)
17219 //
17220 // For the LHS: The last element that comes from the LHS is actually 0, not 3
17221 // because elements 1 and higher of a scalar_to_vector are undefined.
17222 // For the RHS: The last element that comes from the RHS is actually 5, not 7
17223 // because elements 1 and higher of a scalar_to_vector are undefined.
17224 // It is also not 4 because the original scalar_to_vector is wider and
17225 // actually contains two i32 elements.
17226 LastElt = (uint64_t)ScalarSize > ShuffleEltWidth
17227 ? ScalarSize / ShuffleEltWidth - 1 + FirstElt
17228 : FirstElt;
17229 SDValue SToVPermuted = getSToVPermuted(OrigSToV: SToVNode, DAG, Subtarget);
17230 if (SToVPermuted.getValueType() != VecShuffOperandType)
17231 SToVPermuted = DAG.getBitcast(VT: VecShuffOperandType, V: SToVPermuted);
17232 return SToVPermuted;
17233}
17234
17235// On little endian subtargets, combine shuffles such as:
17236// vector_shuffle<16,1,17,3,18,5,19,7,20,9,21,11,22,13,23,15>, <zero>, %b
17237// into:
17238// vector_shuffle<16,0,17,1,18,2,19,3,20,4,21,5,22,6,23,7>, <zero>, %b
17239// because the latter can be matched to a single instruction merge.
17240// Furthermore, SCALAR_TO_VECTOR on little endian always involves a permute
17241// to put the value into element zero. Adjust the shuffle mask so that the
17242// vector can remain in permuted form (to prevent a swap prior to a shuffle).
17243// On big endian targets, this is still useful for SCALAR_TO_VECTOR
17244// nodes with elements smaller than doubleword because all the ways
17245// of getting scalar data into a vector register put the value in the
17246// rightmost element of the left half of the vector.
17247SDValue PPCTargetLowering::combineVectorShuffle(ShuffleVectorSDNode *SVN,
17248 SelectionDAG &DAG) const {
17249 SDValue LHS = SVN->getOperand(Num: 0);
17250 SDValue RHS = SVN->getOperand(Num: 1);
17251 auto Mask = SVN->getMask();
17252 int NumElts = LHS.getValueType().getVectorNumElements();
17253 SDValue Res(SVN, 0);
17254 SDLoc dl(SVN);
17255 bool IsLittleEndian = Subtarget.isLittleEndian();
17256
17257 // On big endian targets this is only useful for subtargets with direct moves.
17258 // On little endian targets it would be useful for all subtargets with VSX.
17259 // However adding special handling for LE subtargets without direct moves
17260 // would be wasted effort since the minimum arch for LE is ISA 2.07 (Power8)
17261 // which includes direct moves.
17262 if (!Subtarget.hasDirectMove())
17263 return Res;
17264
17265 // If this is not a shuffle of a shuffle and the first element comes from
17266 // the second vector, canonicalize to the commuted form. This will make it
17267 // more likely to match one of the single instruction patterns.
17268 if (Mask[0] >= NumElts && LHS.getOpcode() != ISD::VECTOR_SHUFFLE &&
17269 RHS.getOpcode() != ISD::VECTOR_SHUFFLE) {
17270 std::swap(a&: LHS, b&: RHS);
17271 Res = DAG.getCommutedVectorShuffle(SV: *SVN);
17272
17273 if (!isa<ShuffleVectorSDNode>(Val: Res))
17274 return Res;
17275
17276 Mask = cast<ShuffleVectorSDNode>(Val&: Res)->getMask();
17277 }
17278
17279 // Adjust the shuffle mask if either input vector comes from a
17280 // SCALAR_TO_VECTOR and keep the respective input vector in permuted
17281 // form (to prevent the need for a swap).
17282 SmallVector<int, 16> ShuffV(Mask);
17283 SDValue SToVLHS = isScalarToVec(Op: LHS);
17284 SDValue SToVRHS = isScalarToVec(Op: RHS);
17285 if (SToVLHS || SToVRHS) {
17286 EVT VT = SVN->getValueType(ResNo: 0);
17287 uint64_t ShuffleEltWidth = VT.getVectorElementType().getSizeInBits();
17288 int ShuffleNumElts = ShuffV.size();
17289 int HalfVec = ShuffleNumElts / 2;
17290 // The width of the "valid lane" (i.e. the lane that contains the value that
17291 // is vectorized) needs to be expressed in terms of the number of elements
17292 // of the shuffle. It is thereby the ratio of the values before and after
17293 // any bitcast, which will be set later on if the LHS or RHS are
17294 // SCALAR_TO_VECTOR nodes.
17295 unsigned LHSNumValidElts = HalfVec;
17296 unsigned RHSNumValidElts = HalfVec;
17297
17298 // Initially assume that neither input is permuted. These will be adjusted
17299 // accordingly if either input is. Note, that -1 means that all elements
17300 // are undefined.
17301 int LHSFirstElt = 0;
17302 int RHSFirstElt = ShuffleNumElts;
17303 int LHSLastElt = -1;
17304 int RHSLastElt = -1;
17305
17306 // Get the permuted scalar to vector nodes for the source(s) that come from
17307 // ISD::SCALAR_TO_VECTOR.
17308 // On big endian systems, this only makes sense for element sizes smaller
17309 // than 64 bits since for 64-bit elements, all instructions already put
17310 // the value into element zero. Since scalar size of LHS and RHS may differ
17311 // after isScalarToVec, this should be checked using their own sizes.
17312 int LHSScalarSize = 0;
17313 int RHSScalarSize = 0;
17314 if (SToVLHS) {
17315 LHSScalarSize = SToVLHS.getValueType().getScalarSizeInBits();
17316 if (!IsLittleEndian && LHSScalarSize >= 64)
17317 return Res;
17318 }
17319 if (SToVRHS) {
17320 RHSScalarSize = SToVRHS.getValueType().getScalarSizeInBits();
17321 if (!IsLittleEndian && RHSScalarSize >= 64)
17322 return Res;
17323 }
17324 if (LHSScalarSize != 0)
17325 LHS = generateSToVPermutedForVecShuffle(
17326 ScalarSize: LHSScalarSize, ShuffleEltWidth, NumValidElts&: LHSNumValidElts, FirstElt: LHSFirstElt,
17327 LastElt&: LHSLastElt, VecShuffOperand: LHS, SToVNode: SToVLHS, DAG, Subtarget);
17328 if (RHSScalarSize != 0)
17329 RHS = generateSToVPermutedForVecShuffle(
17330 ScalarSize: RHSScalarSize, ShuffleEltWidth, NumValidElts&: RHSNumValidElts, FirstElt: RHSFirstElt,
17331 LastElt&: RHSLastElt, VecShuffOperand: RHS, SToVNode: SToVRHS, DAG, Subtarget);
17332
17333 if (!isShuffleMaskInRange(ShuffV, HalfVec, LHSLastElementDefined: LHSLastElt, RHSLastElementDefined: RHSLastElt))
17334 return Res;
17335
17336 // Fix up the shuffle mask to reflect where the desired element actually is.
17337 // The minimum and maximum indices that correspond to element zero for both
17338 // the LHS and RHS are computed and will control which shuffle mask entries
17339 // are to be changed. For example, if the RHS is permuted, any shuffle mask
17340 // entries in the range [RHSFirstElt,RHSLastElt] will be adjusted.
17341 fixupShuffleMaskForPermutedSToV(
17342 ShuffV, LHSFirstElt, LHSLastElt, RHSFirstElt, RHSLastElt, HalfVec,
17343 LHSNumValidElts, RHSNumValidElts, Subtarget);
17344 Res = DAG.getVectorShuffle(VT: SVN->getValueType(ResNo: 0), dl, N1: LHS, N2: RHS, Mask: ShuffV);
17345
17346 // We may have simplified away the shuffle. We won't be able to do anything
17347 // further with it here.
17348 if (!isa<ShuffleVectorSDNode>(Val: Res))
17349 return Res;
17350 Mask = cast<ShuffleVectorSDNode>(Val&: Res)->getMask();
17351 }
17352
17353 SDValue TheSplat = IsLittleEndian ? RHS : LHS;
17354 // The common case after we commuted the shuffle is that the RHS is a splat
17355 // and we have elements coming in from the splat at indices that are not
17356 // conducive to using a merge.
17357 // Example:
17358 // vector_shuffle<0,17,1,19,2,21,3,23,4,25,5,27,6,29,7,31> t1, <zero>
17359 if (!isSplatBV(Op: TheSplat))
17360 return Res;
17361
17362 // We are looking for a mask such that all even elements are from
17363 // one vector and all odd elements from the other.
17364 if (!isAlternatingShuffMask(Mask, NumElts))
17365 return Res;
17366
17367 // Adjust the mask so we are pulling in the same index from the splat
17368 // as the index from the interesting vector in consecutive elements.
17369 if (IsLittleEndian) {
17370 // Example (even elements from first vector):
17371 // vector_shuffle<0,16,1,17,2,18,3,19,4,20,5,21,6,22,7,23> t1, <zero>
17372 if (Mask[0] < NumElts)
17373 for (int i = 1, e = Mask.size(); i < e; i += 2) {
17374 if (ShuffV[i] < 0)
17375 continue;
17376 // If element from non-splat is undef, pick first element from splat.
17377 ShuffV[i] = (ShuffV[i - 1] >= 0 ? ShuffV[i - 1] : 0) + NumElts;
17378 }
17379 // Example (odd elements from first vector):
17380 // vector_shuffle<16,0,17,1,18,2,19,3,20,4,21,5,22,6,23,7> t1, <zero>
17381 else
17382 for (int i = 0, e = Mask.size(); i < e; i += 2) {
17383 if (ShuffV[i] < 0)
17384 continue;
17385 // If element from non-splat is undef, pick first element from splat.
17386 ShuffV[i] = (ShuffV[i + 1] >= 0 ? ShuffV[i + 1] : 0) + NumElts;
17387 }
17388 } else {
17389 // Example (even elements from first vector):
17390 // vector_shuffle<0,16,1,17,2,18,3,19,4,20,5,21,6,22,7,23> <zero>, t1
17391 if (Mask[0] < NumElts)
17392 for (int i = 0, e = Mask.size(); i < e; i += 2) {
17393 if (ShuffV[i] < 0)
17394 continue;
17395 // If element from non-splat is undef, pick first element from splat.
17396 ShuffV[i] = ShuffV[i + 1] >= 0 ? ShuffV[i + 1] - NumElts : 0;
17397 }
17398 // Example (odd elements from first vector):
17399 // vector_shuffle<16,0,17,1,18,2,19,3,20,4,21,5,22,6,23,7> <zero>, t1
17400 else
17401 for (int i = 1, e = Mask.size(); i < e; i += 2) {
17402 if (ShuffV[i] < 0)
17403 continue;
17404 // If element from non-splat is undef, pick first element from splat.
17405 ShuffV[i] = ShuffV[i - 1] >= 0 ? ShuffV[i - 1] - NumElts : 0;
17406 }
17407 }
17408
17409 // If the RHS has undefs, we need to remove them since we may have created
17410 // a shuffle that adds those instead of the splat value.
17411 SDValue SplatVal =
17412 cast<BuildVectorSDNode>(Val: TheSplat.getNode())->getSplatValue();
17413 TheSplat = DAG.getSplatBuildVector(VT: TheSplat.getValueType(), DL: dl, Op: SplatVal);
17414
17415 if (IsLittleEndian)
17416 RHS = TheSplat;
17417 else
17418 LHS = TheSplat;
17419 return DAG.getVectorShuffle(VT: SVN->getValueType(ResNo: 0), dl, N1: LHS, N2: RHS, Mask: ShuffV);
17420}
17421
17422SDValue PPCTargetLowering::combineVReverseMemOP(ShuffleVectorSDNode *SVN,
17423 LSBaseSDNode *LSBase,
17424 DAGCombinerInfo &DCI) const {
17425 assert((ISD::isNormalLoad(LSBase) || ISD::isNormalStore(LSBase)) &&
17426 "Not a reverse memop pattern!");
17427
17428 auto IsElementReverse = [](const ShuffleVectorSDNode *SVN) -> bool {
17429 auto Mask = SVN->getMask();
17430 int i = 0;
17431 auto I = Mask.rbegin();
17432 auto E = Mask.rend();
17433
17434 for (; I != E; ++I) {
17435 if (*I != i)
17436 return false;
17437 i++;
17438 }
17439 return true;
17440 };
17441
17442 SelectionDAG &DAG = DCI.DAG;
17443 EVT VT = SVN->getValueType(ResNo: 0);
17444
17445 if (!isTypeLegal(VT) || !Subtarget.isLittleEndian() || !Subtarget.hasVSX())
17446 return SDValue();
17447
17448 // Before P9, we have PPCVSXSwapRemoval pass to hack the element order.
17449 // See comment in PPCVSXSwapRemoval.cpp.
17450 // It is conflict with PPCVSXSwapRemoval opt. So we don't do it.
17451 if (!Subtarget.hasP9Vector())
17452 return SDValue();
17453
17454 if(!IsElementReverse(SVN))
17455 return SDValue();
17456
17457 if (LSBase->getOpcode() == ISD::LOAD) {
17458 // If the load return value 0 has more than one user except the
17459 // shufflevector instruction, it is not profitable to replace the
17460 // shufflevector with a reverse load.
17461 for (SDUse &Use : LSBase->uses())
17462 if (Use.getResNo() == 0 &&
17463 Use.getUser()->getOpcode() != ISD::VECTOR_SHUFFLE)
17464 return SDValue();
17465
17466 SDLoc dl(LSBase);
17467 SDValue LoadOps[] = {LSBase->getChain(), LSBase->getBasePtr()};
17468 return DAG.getMemIntrinsicNode(
17469 Opcode: PPCISD::LOAD_VEC_BE, dl, VTList: DAG.getVTList(VT1: VT, VT2: MVT::Other), Ops: LoadOps,
17470 MemVT: LSBase->getMemoryVT(), MMO: LSBase->getMemOperand());
17471 }
17472
17473 if (LSBase->getOpcode() == ISD::STORE) {
17474 // If there are other uses of the shuffle, the swap cannot be avoided.
17475 // Forcing the use of an X-Form (since swapped stores only have
17476 // X-Forms) without removing the swap is unprofitable.
17477 if (!SVN->hasOneUse())
17478 return SDValue();
17479
17480 SDLoc dl(LSBase);
17481 SDValue StoreOps[] = {LSBase->getChain(), SVN->getOperand(Num: 0),
17482 LSBase->getBasePtr()};
17483 return DAG.getMemIntrinsicNode(
17484 Opcode: PPCISD::STORE_VEC_BE, dl, VTList: DAG.getVTList(VT: MVT::Other), Ops: StoreOps,
17485 MemVT: LSBase->getMemoryVT(), MMO: LSBase->getMemOperand());
17486 }
17487
17488 llvm_unreachable("Expected a load or store node here");
17489}
17490
17491static bool isStoreConditional(SDValue Intrin, unsigned &StoreWidth) {
17492 unsigned IntrinsicID = Intrin.getConstantOperandVal(i: 1);
17493 if (IntrinsicID == Intrinsic::ppc_stdcx)
17494 StoreWidth = 8;
17495 else if (IntrinsicID == Intrinsic::ppc_stwcx)
17496 StoreWidth = 4;
17497 else if (IntrinsicID == Intrinsic::ppc_sthcx)
17498 StoreWidth = 2;
17499 else if (IntrinsicID == Intrinsic::ppc_stbcx)
17500 StoreWidth = 1;
17501 else
17502 return false;
17503 return true;
17504}
17505
17506static SDValue DAGCombineAddc(SDNode *N,
17507 llvm::PPCTargetLowering::DAGCombinerInfo &DCI) {
17508 if (N->getOpcode() == PPCISD::ADDC && N->hasAnyUseOfValue(Value: 1)) {
17509 // (ADDC (ADDE 0, 0, C), -1) -> C
17510 SDValue LHS = N->getOperand(Num: 0);
17511 SDValue RHS = N->getOperand(Num: 1);
17512 if (LHS->getOpcode() == PPCISD::ADDE &&
17513 isNullConstant(V: LHS->getOperand(Num: 0)) &&
17514 isNullConstant(V: LHS->getOperand(Num: 1)) && isAllOnesConstant(V: RHS)) {
17515 return DCI.CombineTo(N, Res0: SDValue(N, 0), Res1: LHS->getOperand(Num: 2));
17516 }
17517 }
17518 return SDValue();
17519}
17520
17521/// Optimize the bitfloor(X) pattern for PowerPC.
17522/// Transforms: select_cc X, 0, 0, (srl MinSignedValue, (ctlz X)), seteq
17523/// Into: srl MinSignedValue, (ctlz X)
17524///
17525/// This is safe on PowerPC because the srw instruction returns 0 when the
17526/// shift amount is == bitwidth, which matches the behavior we need for X=0.
17527static SDValue combineSELECT_CCBitFloor(SDNode *N, SelectionDAG &DAG) {
17528 if (N->getOpcode() != ISD::SELECT_CC)
17529 return SDValue();
17530
17531 // SELECT_CC operands: LHS, RHS, TrueVal, FalseVal, CC
17532 SDValue CmpLHS = N->getOperand(Num: 0);
17533 SDValue CmpRHS = N->getOperand(Num: 1);
17534 SDValue TrueVal = N->getOperand(Num: 2);
17535 SDValue FalseVal = N->getOperand(Num: 3);
17536 ISD::CondCode CC = cast<CondCodeSDNode>(Val: N->getOperand(Num: 4))->get();
17537
17538 // Check if condition is (X == 0)
17539 if (CC != ISD::SETEQ || !isNullConstant(V: CmpRHS))
17540 return SDValue();
17541
17542 // Check if TrueVal is constant 0
17543 if (!isNullConstant(V: TrueVal))
17544 return SDValue();
17545
17546 // This combine is replacing a select_cc with a PPC srl, not an srl with a
17547 // PPC srl. If the original srl had multiple uses it would just remain in the
17548 // code. This is at most a performance consideration.
17549 if (FalseVal.getOpcode() != ISD::SRL || !FalseVal.hasOneUse())
17550 return SDValue();
17551
17552 SDValue ShiftVal = FalseVal.getOperand(i: 0);
17553 SDValue ShiftAmt = FalseVal.getOperand(i: 1);
17554
17555 // Check if ShiftVal is MinSignedValue
17556 auto *ShiftConst = dyn_cast<ConstantSDNode>(Val&: ShiftVal);
17557 if (!ShiftConst || !ShiftConst->getAPIntValue().isMinSignedValue())
17558 return SDValue();
17559
17560 SDValue CtlzArg;
17561 // Check if ShiftAmt is (ctlz CmpLHS) or (truncate (ctlz ...))
17562 if (ShiftAmt.getOpcode() != ISD::CTLZ) {
17563 // Look through truncate if present (for i64 ctlz truncated to i32 shift
17564 // amount)
17565 if (ShiftAmt.getOpcode() != ISD::TRUNCATE)
17566 return SDValue();
17567
17568 // Verify the truncate target type is appropriate for shift amount (i32, not
17569 // i1 or other)
17570 if (ShiftAmt.getValueType() != MVT::i32)
17571 return SDValue();
17572
17573 SDValue CtlzNode = ShiftAmt.getOperand(i: 0);
17574
17575 if (CtlzNode.getOpcode() != ISD::CTLZ)
17576 return SDValue();
17577
17578 CtlzArg = CtlzNode.getOperand(i: 0);
17579 } else {
17580 CtlzArg = ShiftAmt.getOperand(i: 0);
17581 }
17582
17583 // Check if ctlz operates on the same value as the comparison
17584 if (CtlzArg != CmpLHS)
17585 return SDValue();
17586
17587 // Using PPCISD::SRL to ensure well-defined behavior.
17588 // On PowerPC, PPCISD::SRL guarantees that shift by bitwidth returns 0,
17589 // which is exactly what we need for the bitfloor(0) case.
17590 SDLoc DL(N);
17591 SDValue PPCSrl =
17592 DAG.getNode(Opcode: PPCISD::SRL, DL, VT: FalseVal.getValueType(), N1: ShiftVal, N2: ShiftAmt);
17593 return PPCSrl;
17594}
17595
17596// Optimize zero-extension of setcc when the compared value is known to be 0
17597// or 1.
17598//
17599// Pattern: zext(setcc(Value, 0, seteq/setne)) where Value is 0 or 1
17600// -> zext(xor(Value, 1)) for seteq
17601// -> zext(Value) for setne
17602//
17603// This optimization avoids the i32 -> i1 -> i32/i64 conversion sequence
17604// by keeping the value in its original i32 type throughout.
17605//
17606// Example:
17607// Before: zext(setcc(test_data_class(...), 0, seteq))
17608// // test_data_class returns 0 or 1 in i32
17609// // setcc converts i32 -> i1
17610// // zext converts i1 -> i64
17611// After: zext(xor(test_data_class(...), 1))
17612// // Stays in i32, then extends to i64
17613//
17614// This is beneficial because:
17615// 1. Eliminates the setcc instruction
17616// 2. Avoids i32 -> i1 truncation
17617// 3. Keeps computation in native integer width
17618
17619static SDValue combineZextSetccWithZero(SDNode *N, SelectionDAG &DAG) {
17620 // Check if this is a zero_extend
17621 if (N->getOpcode() != ISD::ZERO_EXTEND)
17622 return SDValue();
17623
17624 SDValue Src = N->getOperand(Num: 0);
17625
17626 // Check if the source is a setcc
17627 if (Src.getOpcode() != ISD::SETCC)
17628 return SDValue();
17629
17630 SDValue LHS = Src.getOperand(i: 0);
17631 SDValue RHS = Src.getOperand(i: 1);
17632 ISD::CondCode CC = cast<CondCodeSDNode>(Val: Src.getOperand(i: 2))->get();
17633
17634 if (!isNullConstant(V: RHS) && !isNullConstant(V: LHS))
17635 return SDValue();
17636
17637 SDValue NonNullConstant = isNullConstant(V: RHS) ? LHS : RHS;
17638
17639 auto isZeroOrOne = [=](SDValue &V) {
17640 if (V.getOpcode() == ISD::INTRINSIC_WO_CHAIN &&
17641 V.getConstantOperandVal(i: 0) == Intrinsic::ppc_test_data_class)
17642 return true;
17643 return false;
17644 };
17645
17646 if (!isZeroOrOne(NonNullConstant))
17647 return SDValue();
17648
17649 // Check for pattern: zext(setcc (Value), 0, seteq)) or
17650 // zext(setcc (Value), 0, setne))
17651 if (CC == ISD::SETEQ || CC == ISD::SETNE) {
17652 // Replace with: zext(xor(Value, 1)) for seteq
17653 // or: zext(Value) for setne
17654 // This keeps the value in i32 instead of converting to i1
17655 SDLoc DL(N);
17656 EVT VType = N->getValueType(ResNo: 0);
17657 SDValue NewNonNullConstant = DAG.getZExtOrTrunc(Op: NonNullConstant, DL, VT: VType);
17658
17659 if (CC == ISD::SETNE)
17660 return NewNonNullConstant;
17661
17662 SDValue One = DAG.getConstant(Val: 1, DL, VT: VType);
17663 return DAG.getNode(Opcode: ISD::XOR, DL, VT: VType, N1: NewNonNullConstant, N2: One);
17664 }
17665
17666 return SDValue();
17667}
17668
17669// Combine XOR patterns with SELECT_CC_I4/I8, for Example:
17670// 1. XOR(SELECT_CC_I4(cond, 1, 0, cc), 1) -> SELECT_CC_I4(cond, 0, 1, cc)
17671// 2. XOR(ZEXT(SELECT_CC_I4(cond, 1, 0, cc)), 1) -> SELECT_CC_I4/I8(cond, 0,
17672// 1, cc))
17673// 3. XOR(ANYEXT(SELECT_CC_I4(cond, 1, 0, cc)), 1) -> SELECT_CC_I4/I8(cond,
17674// 0, 1, cc))
17675// 4. etc
17676static SDValue combineXorSelectCC(SDNode *N, SelectionDAG &DAG) {
17677 assert(N->getOpcode() == ISD::XOR && "Expected XOR node");
17678
17679 EVT XorVT = N->getValueType(ResNo: 0);
17680 if ((XorVT != MVT::i32 && XorVT != MVT::i64))
17681 return SDValue();
17682
17683 SDValue LHS = N->getOperand(Num: 0);
17684 SDValue RHS = N->getOperand(Num: 1);
17685
17686 // Check for XOR with constant 1
17687 ConstantSDNode *XorConst = dyn_cast<ConstantSDNode>(Val&: RHS);
17688 if (!XorConst || !XorConst->isOne()) {
17689 XorConst = dyn_cast<ConstantSDNode>(Val&: LHS);
17690 if (!XorConst || !XorConst->isOne())
17691 return SDValue();
17692 // Swap so LHS is the SELECT_CC_I4 (or extension) and RHS is the constant
17693 std::swap(a&: LHS, b&: RHS);
17694 }
17695
17696 // Check if LHS has only one use
17697 if (!LHS.hasOneUse())
17698 return SDValue();
17699
17700 // Handle extensions: ZEXT, ANYEXT
17701 SDValue SelectNode = LHS;
17702
17703 if (LHS.getOpcode() == ISD::ZERO_EXTEND ||
17704 LHS.getOpcode() == ISD::ANY_EXTEND) {
17705 SelectNode = LHS.getOperand(i: 0);
17706
17707 // Check if the extension input has only one use
17708 if (!SelectNode.hasOneUse())
17709 return SDValue();
17710 }
17711
17712 // Check if SelectNode is a MachineSDNode with SELECT_CC_I4/I8 opcode
17713 if (!SelectNode.isMachineOpcode())
17714 return SDValue();
17715
17716 unsigned MachineOpc = SelectNode.getMachineOpcode();
17717
17718 // Handle both SELECT_CC_I4 and SELECT_CC_I8
17719 if (MachineOpc != PPC::SELECT_CC_I4 && MachineOpc != PPC::SELECT_CC_I8)
17720 return SDValue();
17721
17722 // SELECT_CC_I4 operands: (cond, true_val, false_val, bropc)
17723 if (SelectNode.getNumOperands() != 4)
17724 return SDValue();
17725
17726 ConstantSDNode *ConstOp1 = dyn_cast<ConstantSDNode>(Val: SelectNode.getOperand(i: 1));
17727 ConstantSDNode *ConstOp2 = dyn_cast<ConstantSDNode>(Val: SelectNode.getOperand(i: 2));
17728
17729 if (!ConstOp1 || !ConstOp2)
17730 return SDValue();
17731
17732 // Only optimize if operands are {0, 1} or {1, 0}
17733 if (!((ConstOp1->isOne() && ConstOp2->isZero()) ||
17734 (ConstOp1->isZero() && ConstOp2->isOne())))
17735 return SDValue();
17736
17737 // Pattern matched! Create new SELECT_CC with swapped 0/1 operands to
17738 // eliminate XOR. If original was SELECT_CC(cond, 1, 0, pred), create
17739 // SELECT_CC(cond, 0, 1, pred). If original was SELECT_CC(cond, 0, 1, pred),
17740 // create SELECT_CC(cond, 1, 0, pred).
17741 SDLoc DL(N);
17742 MachineOpc = (XorVT == MVT::i32) ? PPC::SELECT_CC_I4 : PPC::SELECT_CC_I8;
17743
17744 bool ConstOp1IsOne = ConstOp1->isOne();
17745 return SDValue(
17746 DAG.getMachineNode(Opcode: MachineOpc, dl: DL, VT: XorVT,
17747 Ops: {SelectNode.getOperand(i: 0),
17748 DAG.getConstant(Val: ConstOp1IsOne ? 0 : 1, DL, VT: XorVT),
17749 DAG.getConstant(Val: ConstOp1IsOne ? 1 : 0, DL, VT: XorVT),
17750 SelectNode.getOperand(i: 3)}),
17751 0);
17752}
17753
17754SDValue PPCTargetLowering::PerformDAGCombine(SDNode *N,
17755 DAGCombinerInfo &DCI) const {
17756 SelectionDAG &DAG = DCI.DAG;
17757 SDLoc dl(N);
17758 switch (N->getOpcode()) {
17759 default: break;
17760 case ISD::ADD:
17761 return combineADD(N, DCI);
17762 case ISD::AND: {
17763 // We don't want (and (zext (shift...)), C) if C fits in the width of the
17764 // original input as that will prevent us from selecting optimal rotates.
17765 // This only matters if the input to the extend is i32 widened to i64.
17766 SDValue Op1 = N->getOperand(Num: 0);
17767 SDValue Op2 = N->getOperand(Num: 1);
17768 if ((Op1.getOpcode() != ISD::ZERO_EXTEND &&
17769 Op1.getOpcode() != ISD::ANY_EXTEND) ||
17770 !isa<ConstantSDNode>(Val: Op2) || N->getValueType(ResNo: 0) != MVT::i64 ||
17771 Op1.getOperand(i: 0).getValueType() != MVT::i32)
17772 break;
17773 SDValue NarrowOp = Op1.getOperand(i: 0);
17774 if (NarrowOp.getOpcode() != ISD::SHL && NarrowOp.getOpcode() != ISD::SRL &&
17775 NarrowOp.getOpcode() != ISD::ROTL && NarrowOp.getOpcode() != ISD::ROTR)
17776 break;
17777
17778 uint64_t Imm = Op2->getAsZExtVal();
17779 // Make sure that the constant is narrow enough to fit in the narrow type.
17780 if (!isUInt<32>(x: Imm))
17781 break;
17782 SDValue ConstOp = DAG.getConstant(Val: Imm, DL: dl, VT: MVT::i32);
17783 SDValue NarrowAnd = DAG.getNode(Opcode: ISD::AND, DL: dl, VT: MVT::i32, N1: NarrowOp, N2: ConstOp);
17784 return DAG.getZExtOrTrunc(Op: NarrowAnd, DL: dl, VT: N->getValueType(ResNo: 0));
17785 }
17786 case ISD::XOR: {
17787 // Optimize XOR(ISEL(1,0,CR), 1) -> ISEL(0,1,CR)
17788 if (SDValue V = combineXorSelectCC(N, DAG))
17789 return V;
17790 break;
17791 }
17792 case ISD::SHL:
17793 return combineSHL(N, DCI);
17794 case ISD::SRA:
17795 return combineSRA(N, DCI);
17796 case ISD::SRL:
17797 return combineSRL(N, DCI);
17798 case ISD::MUL:
17799 return combineMUL(N, DCI);
17800 case ISD::FMA:
17801 case PPCISD::FNMSUB:
17802 return combineFMALike(N, DCI);
17803 case PPCISD::SHL:
17804 if (isNullConstant(V: N->getOperand(Num: 0))) // 0 << V -> 0.
17805 return N->getOperand(Num: 0);
17806 break;
17807 case PPCISD::SRL:
17808 if (isNullConstant(V: N->getOperand(Num: 0))) // 0 >>u V -> 0.
17809 return N->getOperand(Num: 0);
17810 break;
17811 case PPCISD::SRA:
17812 if (ConstantSDNode *C = dyn_cast<ConstantSDNode>(Val: N->getOperand(Num: 0))) {
17813 if (C->isZero() || // 0 >>s V -> 0.
17814 C->isAllOnes()) // -1 >>s V -> -1.
17815 return N->getOperand(Num: 0);
17816 }
17817 break;
17818 case ISD::SIGN_EXTEND:
17819 if (SDValue SECC = combineSignExtendSetCC(N, DCI))
17820 return SECC;
17821 [[fallthrough]];
17822 case ISD::ZERO_EXTEND:
17823 if (SDValue RetV = combineZextSetccWithZero(N, DAG&: DCI.DAG))
17824 return RetV;
17825 [[fallthrough]];
17826 case ISD::ANY_EXTEND:
17827 return DAGCombineExtBoolTrunc(N, DCI);
17828 case ISD::TRUNCATE:
17829 return combineTRUNCATE(N, DCI);
17830 case ISD::SETCC:
17831 if (SDValue CSCC = combineSetCC(N, DCI))
17832 return CSCC;
17833 [[fallthrough]];
17834 case ISD::SELECT_CC:
17835 if (SDValue V = combineSELECT_CCBitFloor(N, DAG))
17836 return V;
17837 return DAGCombineTruncBoolExt(N, DCI);
17838 case ISD::SINT_TO_FP:
17839 case ISD::UINT_TO_FP:
17840 return combineFPToIntToFP(N, DCI);
17841 case ISD::VECTOR_SHUFFLE:
17842 if (ISD::isNormalLoad(N: N->getOperand(Num: 0).getNode())) {
17843 LSBaseSDNode* LSBase = cast<LSBaseSDNode>(Val: N->getOperand(Num: 0));
17844 return combineVReverseMemOP(SVN: cast<ShuffleVectorSDNode>(Val: N), LSBase, DCI);
17845 }
17846 return combineVectorShuffle(SVN: cast<ShuffleVectorSDNode>(Val: N), DAG&: DCI.DAG);
17847 case ISD::STORE: {
17848
17849 EVT Op1VT = N->getOperand(Num: 1).getValueType();
17850 unsigned Opcode = N->getOperand(Num: 1).getOpcode();
17851
17852 if (Opcode == ISD::FP_TO_SINT || Opcode == ISD::FP_TO_UINT ||
17853 Opcode == ISD::STRICT_FP_TO_SINT || Opcode == ISD::STRICT_FP_TO_UINT) {
17854 SDValue Val = combineStoreFPToInt(N, DCI);
17855 if (Val)
17856 return Val;
17857 }
17858
17859 if (Opcode == ISD::VECTOR_SHUFFLE && ISD::isNormalStore(N)) {
17860 ShuffleVectorSDNode *SVN = cast<ShuffleVectorSDNode>(Val: N->getOperand(Num: 1));
17861 SDValue Val= combineVReverseMemOP(SVN, LSBase: cast<LSBaseSDNode>(Val: N), DCI);
17862 if (Val)
17863 return Val;
17864 }
17865
17866 // Turn STORE (BSWAP) -> sthbrx/stwbrx.
17867 if (cast<StoreSDNode>(Val: N)->isUnindexed() && Opcode == ISD::BSWAP &&
17868 N->getOperand(Num: 1).getNode()->hasOneUse() &&
17869 (Op1VT == MVT::i32 || Op1VT == MVT::i16 ||
17870 (Subtarget.hasLDBRX() && Subtarget.isPPC64() && Op1VT == MVT::i64))) {
17871
17872 // STBRX can only handle simple types and it makes no sense to store less
17873 // two bytes in byte-reversed order.
17874 EVT mVT = cast<StoreSDNode>(Val: N)->getMemoryVT();
17875 if (mVT.isExtended() || mVT.getSizeInBits() < 16)
17876 break;
17877
17878 SDValue BSwapOp = N->getOperand(Num: 1).getOperand(i: 0);
17879 // Do an any-extend to 32-bits if this is a half-word input.
17880 if (BSwapOp.getValueType() == MVT::i16)
17881 BSwapOp = DAG.getNode(Opcode: ISD::ANY_EXTEND, DL: dl, VT: MVT::i32, Operand: BSwapOp);
17882
17883 // If the type of BSWAP operand is wider than stored memory width
17884 // it need to be shifted to the right side before STBRX.
17885 if (Op1VT.bitsGT(VT: mVT)) {
17886 int Shift = Op1VT.getSizeInBits() - mVT.getSizeInBits();
17887 BSwapOp = DAG.getNode(Opcode: ISD::SRL, DL: dl, VT: Op1VT, N1: BSwapOp,
17888 N2: DAG.getConstant(Val: Shift, DL: dl, VT: MVT::i32));
17889 // Need to truncate if this is a bswap of i64 stored as i32/i16.
17890 if (Op1VT == MVT::i64)
17891 BSwapOp = DAG.getNode(Opcode: ISD::TRUNCATE, DL: dl, VT: MVT::i32, Operand: BSwapOp);
17892 }
17893
17894 SDValue Ops[] = {
17895 N->getOperand(Num: 0), BSwapOp, N->getOperand(Num: 2), DAG.getValueType(mVT)
17896 };
17897 return
17898 DAG.getMemIntrinsicNode(Opcode: PPCISD::STBRX, dl, VTList: DAG.getVTList(VT: MVT::Other),
17899 Ops, MemVT: cast<StoreSDNode>(Val: N)->getMemoryVT(),
17900 MMO: cast<StoreSDNode>(Val: N)->getMemOperand());
17901 }
17902
17903 // STORE Constant:i32<0> -> STORE<trunc to i32> Constant:i64<0>
17904 // So it can increase the chance of CSE constant construction.
17905 if (Subtarget.isPPC64() && !DCI.isBeforeLegalize() &&
17906 isa<ConstantSDNode>(Val: N->getOperand(Num: 1)) && Op1VT == MVT::i32) {
17907 // Need to sign-extended to 64-bits to handle negative values.
17908 EVT MemVT = cast<StoreSDNode>(Val: N)->getMemoryVT();
17909 uint64_t Val64 = SignExtend64(X: N->getConstantOperandVal(Num: 1),
17910 B: MemVT.getSizeInBits());
17911 SDValue Const64 = DAG.getConstant(Val: Val64, DL: dl, VT: MVT::i64);
17912
17913 auto *ST = cast<StoreSDNode>(Val: N);
17914 SDValue NewST = DAG.getStore(Chain: ST->getChain(), dl, Val: Const64,
17915 Ptr: ST->getBasePtr(), Offset: ST->getOffset(), SVT: MemVT,
17916 MMO: ST->getMemOperand(), AM: ST->getAddressingMode(),
17917 /*IsTruncating=*/true);
17918 // Note we use CombineTo here to prevent DAGCombiner from visiting the
17919 // new store which will change the constant by removing non-demanded bits.
17920 return ST->isUnindexed()
17921 ? DCI.CombineTo(N, Res: NewST, /*AddTo=*/false)
17922 : DCI.CombineTo(N, Res0: NewST, Res1: NewST.getValue(R: 1), /*AddTo=*/false);
17923 }
17924
17925 // For little endian, VSX stores require generating xxswapd/lxvd2x.
17926 // Not needed on ISA 3.0 based CPUs since we have a non-permuting store.
17927 if (Op1VT.isSimple()) {
17928 MVT StoreVT = Op1VT.getSimpleVT();
17929 if (Subtarget.needsSwapsForVSXMemOps() &&
17930 (StoreVT == MVT::v2f64 || StoreVT == MVT::v2i64 ||
17931 StoreVT == MVT::v4f32 || StoreVT == MVT::v4i32))
17932 return expandVSXStoreForLE(N, DCI);
17933 }
17934 break;
17935 }
17936 case ISD::LOAD: {
17937 LoadSDNode *LD = cast<LoadSDNode>(Val: N);
17938 EVT VT = LD->getValueType(ResNo: 0);
17939
17940 // For little endian, VSX loads require generating lxvd2x/xxswapd.
17941 // Not needed on ISA 3.0 based CPUs since we have a non-permuting load.
17942 if (VT.isSimple()) {
17943 MVT LoadVT = VT.getSimpleVT();
17944 if (Subtarget.needsSwapsForVSXMemOps() &&
17945 (LoadVT == MVT::v2f64 || LoadVT == MVT::v2i64 ||
17946 LoadVT == MVT::v4f32 || LoadVT == MVT::v4i32))
17947 return expandVSXLoadForLE(N, DCI);
17948 }
17949
17950 // We sometimes end up with a 64-bit integer load, from which we extract
17951 // two single-precision floating-point numbers. This happens with
17952 // std::complex<float>, and other similar structures, because of the way we
17953 // canonicalize structure copies. However, if we lack direct moves,
17954 // then the final bitcasts from the extracted integer values to the
17955 // floating-point numbers turn into store/load pairs. Even with direct moves,
17956 // just loading the two floating-point numbers is likely better.
17957 auto ReplaceTwoFloatLoad = [&]() {
17958 if (VT != MVT::i64)
17959 return false;
17960
17961 if (LD->getExtensionType() != ISD::NON_EXTLOAD ||
17962 LD->isVolatile())
17963 return false;
17964
17965 // We're looking for a sequence like this:
17966 // t13: i64,ch = load<LD8[%ref.tmp]> t0, t6, undef:i64
17967 // t16: i64 = srl t13, Constant:i32<32>
17968 // t17: i32 = truncate t16
17969 // t18: f32 = bitcast t17
17970 // t19: i32 = truncate t13
17971 // t20: f32 = bitcast t19
17972
17973 if (!LD->hasNUsesOfValue(NUses: 2, Value: 0))
17974 return false;
17975
17976 auto UI = LD->user_begin();
17977 while (UI.getUse().getResNo() != 0) ++UI;
17978 SDNode *Trunc = *UI++;
17979 while (UI.getUse().getResNo() != 0) ++UI;
17980 SDNode *RightShift = *UI;
17981 if (Trunc->getOpcode() != ISD::TRUNCATE)
17982 std::swap(a&: Trunc, b&: RightShift);
17983
17984 if (Trunc->getOpcode() != ISD::TRUNCATE ||
17985 Trunc->getValueType(ResNo: 0) != MVT::i32 ||
17986 !Trunc->hasOneUse())
17987 return false;
17988 if (RightShift->getOpcode() != ISD::SRL ||
17989 !isa<ConstantSDNode>(Val: RightShift->getOperand(Num: 1)) ||
17990 RightShift->getConstantOperandVal(Num: 1) != 32 ||
17991 !RightShift->hasOneUse())
17992 return false;
17993
17994 SDNode *Trunc2 = *RightShift->user_begin();
17995 if (Trunc2->getOpcode() != ISD::TRUNCATE ||
17996 Trunc2->getValueType(ResNo: 0) != MVT::i32 ||
17997 !Trunc2->hasOneUse())
17998 return false;
17999
18000 SDNode *Bitcast = *Trunc->user_begin();
18001 SDNode *Bitcast2 = *Trunc2->user_begin();
18002
18003 if (Bitcast->getOpcode() != ISD::BITCAST ||
18004 Bitcast->getValueType(ResNo: 0) != MVT::f32)
18005 return false;
18006 if (Bitcast2->getOpcode() != ISD::BITCAST ||
18007 Bitcast2->getValueType(ResNo: 0) != MVT::f32)
18008 return false;
18009
18010 if (Subtarget.isLittleEndian())
18011 std::swap(a&: Bitcast, b&: Bitcast2);
18012
18013 // Bitcast has the second float (in memory-layout order) and Bitcast2
18014 // has the first one.
18015
18016 SDValue BasePtr = LD->getBasePtr();
18017 if (LD->isIndexed()) {
18018 assert(LD->getAddressingMode() == ISD::PRE_INC &&
18019 "Non-pre-inc AM on PPC?");
18020 BasePtr =
18021 DAG.getNode(Opcode: ISD::ADD, DL: dl, VT: BasePtr.getValueType(), N1: BasePtr,
18022 N2: LD->getOffset());
18023 }
18024
18025 auto MMOFlags =
18026 LD->getMemOperand()->getFlags() & ~MachineMemOperand::MOVolatile;
18027 SDValue FloatLoad = DAG.getLoad(VT: MVT::f32, dl, Chain: LD->getChain(), Ptr: BasePtr,
18028 PtrInfo: LD->getPointerInfo(), Alignment: LD->getAlign(),
18029 MMOFlags, AAInfo: LD->getAAInfo());
18030 SDValue AddPtr =
18031 DAG.getNode(Opcode: ISD::ADD, DL: dl, VT: BasePtr.getValueType(),
18032 N1: BasePtr, N2: DAG.getIntPtrConstant(Val: 4, DL: dl));
18033 SDValue FloatLoad2 = DAG.getLoad(
18034 VT: MVT::f32, dl, Chain: SDValue(FloatLoad.getNode(), 1), Ptr: AddPtr,
18035 PtrInfo: LD->getPointerInfo().getWithOffset(O: 4),
18036 Alignment: commonAlignment(A: LD->getAlign(), Offset: 4), MMOFlags, AAInfo: LD->getAAInfo());
18037
18038 if (LD->isIndexed()) {
18039 // Note that DAGCombine should re-form any pre-increment load(s) from
18040 // what is produced here if that makes sense.
18041 DAG.ReplaceAllUsesOfValueWith(From: SDValue(LD, 1), To: BasePtr);
18042 }
18043
18044 DCI.CombineTo(N: Bitcast2, Res: FloatLoad);
18045 DCI.CombineTo(N: Bitcast, Res: FloatLoad2);
18046
18047 DAG.ReplaceAllUsesOfValueWith(From: SDValue(LD, LD->isIndexed() ? 2 : 1),
18048 To: SDValue(FloatLoad2.getNode(), 1));
18049 return true;
18050 };
18051
18052 if (ReplaceTwoFloatLoad())
18053 return SDValue(N, 0);
18054
18055 EVT MemVT = LD->getMemoryVT();
18056 Type *Ty = MemVT.getTypeForEVT(Context&: *DAG.getContext());
18057 Align ABIAlignment = DAG.getDataLayout().getABITypeAlign(Ty);
18058 if (LD->isUnindexed() && VT.isVector() &&
18059 ((Subtarget.hasAltivec() && ISD::isNON_EXTLoad(N) &&
18060 // P8 and later hardware should just use LOAD.
18061 !Subtarget.hasP8Vector() &&
18062 (VT == MVT::v16i8 || VT == MVT::v8i16 || VT == MVT::v4i32 ||
18063 VT == MVT::v4f32))) &&
18064 LD->getAlign() < ABIAlignment) {
18065 // This is a type-legal unaligned Altivec load.
18066 SDValue Chain = LD->getChain();
18067 SDValue Ptr = LD->getBasePtr();
18068 bool isLittleEndian = Subtarget.isLittleEndian();
18069
18070 // This implements the loading of unaligned vectors as described in
18071 // the venerable Apple Velocity Engine overview. Specifically:
18072 // https://developer.apple.com/hardwaredrivers/ve/alignment.html
18073 // https://developer.apple.com/hardwaredrivers/ve/code_optimization.html
18074 //
18075 // The general idea is to expand a sequence of one or more unaligned
18076 // loads into an alignment-based permutation-control instruction (lvsl
18077 // or lvsr), a series of regular vector loads (which always truncate
18078 // their input address to an aligned address), and a series of
18079 // permutations. The results of these permutations are the requested
18080 // loaded values. The trick is that the last "extra" load is not taken
18081 // from the address you might suspect (sizeof(vector) bytes after the
18082 // last requested load), but rather sizeof(vector) - 1 bytes after the
18083 // last requested vector. The point of this is to avoid a page fault if
18084 // the base address happened to be aligned. This works because if the
18085 // base address is aligned, then adding less than a full vector length
18086 // will cause the last vector in the sequence to be (re)loaded.
18087 // Otherwise, the next vector will be fetched as you might suspect was
18088 // necessary.
18089
18090 // We might be able to reuse the permutation generation from
18091 // a different base address offset from this one by an aligned amount.
18092 // The INTRINSIC_WO_CHAIN DAG combine will attempt to perform this
18093 // optimization later.
18094 Intrinsic::ID Intr, IntrLD, IntrPerm;
18095 MVT PermCntlTy, PermTy, LDTy;
18096 Intr = isLittleEndian ? Intrinsic::ppc_altivec_lvsr
18097 : Intrinsic::ppc_altivec_lvsl;
18098 IntrLD = Intrinsic::ppc_altivec_lvx;
18099 IntrPerm = Intrinsic::ppc_altivec_vperm;
18100 PermCntlTy = MVT::v16i8;
18101 PermTy = MVT::v4i32;
18102 LDTy = MVT::v4i32;
18103
18104 SDValue PermCntl = BuildIntrinsicOp(IID: Intr, Op: Ptr, DAG, dl, DestVT: PermCntlTy);
18105
18106 // Create the new MMO for the new base load. It is like the original MMO,
18107 // but represents an area in memory almost twice the vector size centered
18108 // on the original address. If the address is unaligned, we might start
18109 // reading up to (sizeof(vector)-1) bytes below the address of the
18110 // original unaligned load.
18111 MachineFunction &MF = DAG.getMachineFunction();
18112 MachineMemOperand *BaseMMO =
18113 MF.getMachineMemOperand(MMO: LD->getMemOperand(),
18114 Offset: -(int64_t)MemVT.getStoreSize()+1,
18115 Size: 2*MemVT.getStoreSize()-1);
18116
18117 // Create the new base load.
18118 SDValue LDXIntID =
18119 DAG.getTargetConstant(Val: IntrLD, DL: dl, VT: getPointerTy(DL: MF.getDataLayout()));
18120 SDValue BaseLoadOps[] = { Chain, LDXIntID, Ptr };
18121 SDValue BaseLoad =
18122 DAG.getMemIntrinsicNode(Opcode: ISD::INTRINSIC_W_CHAIN, dl,
18123 VTList: DAG.getVTList(VT1: PermTy, VT2: MVT::Other),
18124 Ops: BaseLoadOps, MemVT: LDTy, MMO: BaseMMO);
18125
18126 // Note that the value of IncOffset (which is provided to the next
18127 // load's pointer info offset value, and thus used to calculate the
18128 // alignment), and the value of IncValue (which is actually used to
18129 // increment the pointer value) are different! This is because we
18130 // require the next load to appear to be aligned, even though it
18131 // is actually offset from the base pointer by a lesser amount.
18132 int IncOffset = VT.getSizeInBits() / 8;
18133 int IncValue = IncOffset;
18134
18135 // Walk (both up and down) the chain looking for another load at the real
18136 // (aligned) offset (the alignment of the other load does not matter in
18137 // this case). If found, then do not use the offset reduction trick, as
18138 // that will prevent the loads from being later combined (as they would
18139 // otherwise be duplicates).
18140 if (!findConsecutiveLoad(LD, DAG))
18141 --IncValue;
18142
18143 SDValue Increment =
18144 DAG.getConstant(Val: IncValue, DL: dl, VT: getPointerTy(DL: MF.getDataLayout()));
18145 Ptr = DAG.getNode(Opcode: ISD::ADD, DL: dl, VT: Ptr.getValueType(), N1: Ptr, N2: Increment);
18146
18147 MachineMemOperand *ExtraMMO =
18148 MF.getMachineMemOperand(MMO: LD->getMemOperand(),
18149 Offset: 1, Size: 2*MemVT.getStoreSize()-1);
18150 SDValue ExtraLoadOps[] = { Chain, LDXIntID, Ptr };
18151 SDValue ExtraLoad =
18152 DAG.getMemIntrinsicNode(Opcode: ISD::INTRINSIC_W_CHAIN, dl,
18153 VTList: DAG.getVTList(VT1: PermTy, VT2: MVT::Other),
18154 Ops: ExtraLoadOps, MemVT: LDTy, MMO: ExtraMMO);
18155
18156 SDValue TF = DAG.getNode(Opcode: ISD::TokenFactor, DL: dl, VT: MVT::Other,
18157 N1: BaseLoad.getValue(R: 1), N2: ExtraLoad.getValue(R: 1));
18158
18159 // Because vperm has a big-endian bias, we must reverse the order
18160 // of the input vectors and complement the permute control vector
18161 // when generating little endian code. We have already handled the
18162 // latter by using lvsr instead of lvsl, so just reverse BaseLoad
18163 // and ExtraLoad here.
18164 SDValue Perm;
18165 if (isLittleEndian)
18166 Perm = BuildIntrinsicOp(IID: IntrPerm,
18167 Op0: ExtraLoad, Op1: BaseLoad, Op2: PermCntl, DAG, dl);
18168 else
18169 Perm = BuildIntrinsicOp(IID: IntrPerm,
18170 Op0: BaseLoad, Op1: ExtraLoad, Op2: PermCntl, DAG, dl);
18171
18172 if (VT != PermTy)
18173 Perm = Subtarget.hasAltivec()
18174 ? DAG.getNode(Opcode: ISD::BITCAST, DL: dl, VT, Operand: Perm)
18175 : DAG.getNode(Opcode: ISD::FP_ROUND, DL: dl, VT, N1: Perm,
18176 N2: DAG.getTargetConstant(Val: 1, DL: dl, VT: MVT::i64));
18177 // second argument is 1 because this rounding
18178 // is always exact.
18179
18180 // The output of the permutation is our loaded result, the TokenFactor is
18181 // our new chain.
18182 DCI.CombineTo(N, Res0: Perm, Res1: TF);
18183 return SDValue(N, 0);
18184 }
18185 }
18186 break;
18187 case ISD::INTRINSIC_WO_CHAIN: {
18188 bool isLittleEndian = Subtarget.isLittleEndian();
18189 unsigned IID = N->getConstantOperandVal(Num: 0);
18190 Intrinsic::ID Intr = (isLittleEndian ? Intrinsic::ppc_altivec_lvsr
18191 : Intrinsic::ppc_altivec_lvsl);
18192 if (IID == Intr && N->getOperand(Num: 1)->getOpcode() == ISD::ADD) {
18193 SDValue Add = N->getOperand(Num: 1);
18194
18195 int Bits = 4 /* 16 byte alignment */;
18196
18197 if (DAG.MaskedValueIsZero(Op: Add->getOperand(Num: 1),
18198 Mask: APInt::getAllOnes(numBits: Bits /* alignment */)
18199 .zext(width: Add.getScalarValueSizeInBits()))) {
18200 SDNode *BasePtr = Add->getOperand(Num: 0).getNode();
18201 for (SDNode *U : BasePtr->users()) {
18202 if (U->getOpcode() == ISD::INTRINSIC_WO_CHAIN &&
18203 U->getConstantOperandVal(Num: 0) == IID) {
18204 // We've found another LVSL/LVSR, and this address is an aligned
18205 // multiple of that one. The results will be the same, so use the
18206 // one we've just found instead.
18207
18208 return SDValue(U, 0);
18209 }
18210 }
18211 }
18212
18213 if (isa<ConstantSDNode>(Val: Add->getOperand(Num: 1))) {
18214 SDNode *BasePtr = Add->getOperand(Num: 0).getNode();
18215 for (SDNode *U : BasePtr->users()) {
18216 if (U->getOpcode() == ISD::ADD &&
18217 isa<ConstantSDNode>(Val: U->getOperand(Num: 1)) &&
18218 (Add->getConstantOperandVal(Num: 1) - U->getConstantOperandVal(Num: 1)) %
18219 (1ULL << Bits) ==
18220 0) {
18221 SDNode *OtherAdd = U;
18222 for (SDNode *V : OtherAdd->users()) {
18223 if (V->getOpcode() == ISD::INTRINSIC_WO_CHAIN &&
18224 V->getConstantOperandVal(Num: 0) == IID) {
18225 return SDValue(V, 0);
18226 }
18227 }
18228 }
18229 }
18230 }
18231 }
18232
18233 // Combine vmaxsw/h/b(a, a's negation) to abs(a)
18234 // Expose the vabsduw/h/b opportunity for down stream
18235 if (!DCI.isAfterLegalizeDAG() && Subtarget.hasP9Altivec() &&
18236 (IID == Intrinsic::ppc_altivec_vmaxsw ||
18237 IID == Intrinsic::ppc_altivec_vmaxsh ||
18238 IID == Intrinsic::ppc_altivec_vmaxsb)) {
18239 SDValue V1 = N->getOperand(Num: 1);
18240 SDValue V2 = N->getOperand(Num: 2);
18241 if ((V1.getSimpleValueType() == MVT::v4i32 ||
18242 V1.getSimpleValueType() == MVT::v8i16 ||
18243 V1.getSimpleValueType() == MVT::v16i8) &&
18244 V1.getSimpleValueType() == V2.getSimpleValueType()) {
18245 // (0-a, a)
18246 if (V1.getOpcode() == ISD::SUB &&
18247 ISD::isBuildVectorAllZeros(N: V1.getOperand(i: 0).getNode()) &&
18248 V1.getOperand(i: 1) == V2) {
18249 return DAG.getNode(Opcode: ISD::ABS, DL: dl, VT: V2.getValueType(), Operand: V2);
18250 }
18251 // (a, 0-a)
18252 if (V2.getOpcode() == ISD::SUB &&
18253 ISD::isBuildVectorAllZeros(N: V2.getOperand(i: 0).getNode()) &&
18254 V2.getOperand(i: 1) == V1) {
18255 return DAG.getNode(Opcode: ISD::ABS, DL: dl, VT: V1.getValueType(), Operand: V1);
18256 }
18257 // (x-y, y-x)
18258 if (V1.getOpcode() == ISD::SUB && V2.getOpcode() == ISD::SUB &&
18259 V1.getOperand(i: 0) == V2.getOperand(i: 1) &&
18260 V1.getOperand(i: 1) == V2.getOperand(i: 0)) {
18261 return DAG.getNode(Opcode: ISD::ABS, DL: dl, VT: V1.getValueType(), Operand: V1);
18262 }
18263 }
18264 }
18265 }
18266
18267 break;
18268 case ISD::INTRINSIC_W_CHAIN:
18269 switch (N->getConstantOperandVal(Num: 1)) {
18270 default:
18271 break;
18272 case Intrinsic::ppc_altivec_vsum4sbs:
18273 case Intrinsic::ppc_altivec_vsum4shs:
18274 case Intrinsic::ppc_altivec_vsum4ubs: {
18275 // These sum-across intrinsics only have a chain due to the side effect
18276 // that they may set the SAT bit. If we know the SAT bit will not be set
18277 // for some inputs, we can replace any uses of their chain with the
18278 // input chain.
18279 if (BuildVectorSDNode *BVN =
18280 dyn_cast<BuildVectorSDNode>(Val: N->getOperand(Num: 3))) {
18281 APInt APSplatBits, APSplatUndef;
18282 unsigned SplatBitSize;
18283 bool HasAnyUndefs;
18284 bool BVNIsConstantSplat = BVN->isConstantSplat(
18285 SplatValue&: APSplatBits, SplatUndef&: APSplatUndef, SplatBitSize, HasAnyUndefs, MinSplatBits: 0,
18286 isBigEndian: !Subtarget.isLittleEndian());
18287 // If the constant splat vector is 0, the SAT bit will not be set.
18288 if (BVNIsConstantSplat && APSplatBits == 0)
18289 DAG.ReplaceAllUsesOfValueWith(From: SDValue(N, 1), To: N->getOperand(Num: 0));
18290 }
18291 return SDValue();
18292 }
18293 case Intrinsic::ppc_vsx_lxvw4x:
18294 case Intrinsic::ppc_vsx_lxvd2x:
18295 // For little endian, VSX loads require generating lxvd2x/xxswapd.
18296 // Not needed on ISA 3.0 based CPUs since we have a non-permuting load.
18297 if (Subtarget.needsSwapsForVSXMemOps())
18298 return expandVSXLoadForLE(N, DCI);
18299 break;
18300 }
18301 break;
18302 case ISD::INTRINSIC_VOID:
18303 // For little endian, VSX stores require generating xxswapd/stxvd2x.
18304 // Not needed on ISA 3.0 based CPUs since we have a non-permuting store.
18305 if (Subtarget.needsSwapsForVSXMemOps()) {
18306 switch (N->getConstantOperandVal(Num: 1)) {
18307 default:
18308 break;
18309 case Intrinsic::ppc_vsx_stxvw4x:
18310 case Intrinsic::ppc_vsx_stxvd2x:
18311 return expandVSXStoreForLE(N, DCI);
18312 }
18313 }
18314 break;
18315 case ISD::BSWAP: {
18316 // Turn BSWAP (LOAD) -> lhbrx/lwbrx.
18317 // For subtargets without LDBRX, we can still do better than the default
18318 // expansion even for 64-bit BSWAP (LOAD).
18319 bool Is64BitBswapOn64BitTgt =
18320 Subtarget.isPPC64() && N->getValueType(ResNo: 0) == MVT::i64;
18321 bool IsSingleUseNormalLd = ISD::isNormalLoad(N: N->getOperand(Num: 0).getNode()) &&
18322 N->getOperand(Num: 0).hasOneUse();
18323 if (IsSingleUseNormalLd &&
18324 (N->getValueType(ResNo: 0) == MVT::i32 || N->getValueType(ResNo: 0) == MVT::i16 ||
18325 (Subtarget.hasLDBRX() && Is64BitBswapOn64BitTgt))) {
18326 SDValue Load = N->getOperand(Num: 0);
18327 LoadSDNode *LD = cast<LoadSDNode>(Val&: Load);
18328 // Create the byte-swapping load.
18329 SDValue Ops[] = {
18330 LD->getChain(), // Chain
18331 LD->getBasePtr(), // Ptr
18332 DAG.getValueType(N->getValueType(ResNo: 0)) // VT
18333 };
18334 SDValue BSLoad =
18335 DAG.getMemIntrinsicNode(Opcode: PPCISD::LBRX, dl,
18336 VTList: DAG.getVTList(VT1: N->getValueType(ResNo: 0) == MVT::i64 ?
18337 MVT::i64 : MVT::i32, VT2: MVT::Other),
18338 Ops, MemVT: LD->getMemoryVT(), MMO: LD->getMemOperand());
18339
18340 // If this is an i16 load, insert the truncate.
18341 SDValue ResVal = BSLoad;
18342 if (N->getValueType(ResNo: 0) == MVT::i16)
18343 ResVal = DAG.getNode(Opcode: ISD::TRUNCATE, DL: dl, VT: MVT::i16, Operand: BSLoad);
18344
18345 // First, combine the bswap away. This makes the value produced by the
18346 // load dead.
18347 DCI.CombineTo(N, Res: ResVal);
18348
18349 // Next, combine the load away, we give it a bogus result value but a real
18350 // chain result. The result value is dead because the bswap is dead.
18351 DCI.CombineTo(N: Load.getNode(), Res0: ResVal, Res1: BSLoad.getValue(R: 1));
18352
18353 // Return N so it doesn't get rechecked!
18354 return SDValue(N, 0);
18355 }
18356 // Convert this to two 32-bit bswap loads and a BUILD_PAIR. Do this only
18357 // before legalization so that the BUILD_PAIR is handled correctly.
18358 if (!DCI.isBeforeLegalize() || !Is64BitBswapOn64BitTgt ||
18359 !IsSingleUseNormalLd)
18360 return SDValue();
18361 LoadSDNode *LD = cast<LoadSDNode>(Val: N->getOperand(Num: 0));
18362
18363 // Can't split volatile or atomic loads.
18364 if (!LD->isSimple())
18365 return SDValue();
18366 SDValue BasePtr = LD->getBasePtr();
18367 SDValue Lo = DAG.getLoad(VT: MVT::i32, dl, Chain: LD->getChain(), Ptr: BasePtr,
18368 PtrInfo: LD->getPointerInfo(), Alignment: LD->getAlign());
18369 Lo = DAG.getNode(Opcode: ISD::BSWAP, DL: dl, VT: MVT::i32, Operand: Lo);
18370 BasePtr = DAG.getNode(Opcode: ISD::ADD, DL: dl, VT: BasePtr.getValueType(), N1: BasePtr,
18371 N2: DAG.getIntPtrConstant(Val: 4, DL: dl));
18372 MachineMemOperand *NewMMO = DAG.getMachineFunction().getMachineMemOperand(
18373 MMO: LD->getMemOperand(), Offset: 4, Size: 4);
18374 SDValue Hi = DAG.getLoad(VT: MVT::i32, dl, Chain: LD->getChain(), Ptr: BasePtr, MMO: NewMMO);
18375 Hi = DAG.getNode(Opcode: ISD::BSWAP, DL: dl, VT: MVT::i32, Operand: Hi);
18376 SDValue Res;
18377 if (Subtarget.isLittleEndian())
18378 Res = DAG.getNode(Opcode: ISD::BUILD_PAIR, DL: dl, VT: MVT::i64, N1: Hi, N2: Lo);
18379 else
18380 Res = DAG.getNode(Opcode: ISD::BUILD_PAIR, DL: dl, VT: MVT::i64, N1: Lo, N2: Hi);
18381 SDValue TF =
18382 DAG.getNode(Opcode: ISD::TokenFactor, DL: dl, VT: MVT::Other,
18383 N1: Hi.getOperand(i: 0).getValue(R: 1), N2: Lo.getOperand(i: 0).getValue(R: 1));
18384 DAG.ReplaceAllUsesOfValueWith(From: SDValue(LD, 1), To: TF);
18385 return Res;
18386 }
18387 case PPCISD::VCMP:
18388 // If a VCMP_rec node already exists with exactly the same operands as this
18389 // node, use its result instead of this node (VCMP_rec computes both a CR6
18390 // and a normal output).
18391 //
18392 if (!N->getOperand(Num: 0).hasOneUse() &&
18393 !N->getOperand(Num: 1).hasOneUse() &&
18394 !N->getOperand(Num: 2).hasOneUse()) {
18395
18396 // Scan all of the users of the LHS, looking for VCMP_rec's that match.
18397 SDNode *VCMPrecNode = nullptr;
18398
18399 SDNode *LHSN = N->getOperand(Num: 0).getNode();
18400 for (SDNode *User : LHSN->users())
18401 if (User->getOpcode() == PPCISD::VCMP_rec &&
18402 User->getOperand(Num: 1) == N->getOperand(Num: 1) &&
18403 User->getOperand(Num: 2) == N->getOperand(Num: 2) &&
18404 User->getOperand(Num: 0) == N->getOperand(Num: 0)) {
18405 VCMPrecNode = User;
18406 break;
18407 }
18408
18409 // If there is no VCMP_rec node, or if its flag value has no uses,
18410 // don't transform this.
18411 if (!VCMPrecNode || VCMPrecNode->hasNUsesOfValue(NUses: 0, Value: 1))
18412 break;
18413
18414 // Look at the (necessarily single) use of the flag value. If it has a
18415 // chain, this transformation is more complex. Note that multiple things
18416 // could use the value result, which we should ignore.
18417 SDNode *FlagUser = nullptr;
18418 for (SDNode::use_iterator UI = VCMPrecNode->use_begin();
18419 FlagUser == nullptr; ++UI) {
18420 assert(UI != VCMPrecNode->use_end() && "Didn't find user!");
18421 SDNode *User = UI->getUser();
18422 for (unsigned i = 0, e = User->getNumOperands(); i != e; ++i) {
18423 if (User->getOperand(Num: i) == SDValue(VCMPrecNode, 1)) {
18424 FlagUser = User;
18425 break;
18426 }
18427 }
18428 }
18429
18430 // If the user is a MFOCRF instruction, we know this is safe.
18431 // Otherwise we give up for right now.
18432 if (FlagUser->getOpcode() == PPCISD::MFOCRF)
18433 return SDValue(VCMPrecNode, 0);
18434 }
18435 break;
18436 case ISD::BR_CC: {
18437 // If this is a branch on an altivec predicate comparison, lower this so
18438 // that we don't have to do a MFOCRF: instead, branch directly on CR6. This
18439 // lowering is done pre-legalize, because the legalizer lowers the predicate
18440 // compare down to code that is difficult to reassemble.
18441 // This code also handles branches that depend on the result of a store
18442 // conditional.
18443 ISD::CondCode CC = cast<CondCodeSDNode>(Val: N->getOperand(Num: 1))->get();
18444 SDValue LHS = N->getOperand(Num: 2), RHS = N->getOperand(Num: 3);
18445
18446 int CompareOpc;
18447 bool isDot;
18448
18449 if (!isa<ConstantSDNode>(Val: RHS) || (CC != ISD::SETEQ && CC != ISD::SETNE))
18450 break;
18451
18452 // Since we are doing this pre-legalize, the RHS can be a constant of
18453 // arbitrary bitwidth which may cause issues when trying to get the value
18454 // from the underlying APInt.
18455 auto RHSAPInt = RHS->getAsAPIntVal();
18456 if (!RHSAPInt.isIntN(N: 64))
18457 break;
18458
18459 unsigned Val = RHSAPInt.getZExtValue();
18460 auto isImpossibleCompare = [&]() {
18461 // If this is a comparison against something other than 0/1, then we know
18462 // that the condition is never/always true.
18463 if (Val != 0 && Val != 1) {
18464 if (CC == ISD::SETEQ) // Cond never true, remove branch.
18465 return N->getOperand(Num: 0);
18466 // Always !=, turn it into an unconditional branch.
18467 return DAG.getNode(Opcode: ISD::BR, DL: dl, VT: MVT::Other,
18468 N1: N->getOperand(Num: 0), N2: N->getOperand(Num: 4));
18469 }
18470 return SDValue();
18471 };
18472 // Combine branches fed by store conditional instructions (st[bhwd]cx).
18473 unsigned StoreWidth = 0;
18474 if (LHS.getOpcode() == ISD::INTRINSIC_W_CHAIN &&
18475 isStoreConditional(Intrin: LHS, StoreWidth)) {
18476 if (SDValue Impossible = isImpossibleCompare())
18477 return Impossible;
18478 PPC::Predicate CompOpc;
18479 // eq 0 => ne
18480 // ne 0 => eq
18481 // eq 1 => eq
18482 // ne 1 => ne
18483 if (Val == 0)
18484 CompOpc = CC == ISD::SETEQ ? PPC::PRED_NE : PPC::PRED_EQ;
18485 else
18486 CompOpc = CC == ISD::SETEQ ? PPC::PRED_EQ : PPC::PRED_NE;
18487
18488 SDValue Ops[] = {LHS.getOperand(i: 0), LHS.getOperand(i: 2), LHS.getOperand(i: 3),
18489 DAG.getConstant(Val: StoreWidth, DL: dl, VT: MVT::i32)};
18490 auto *MemNode = cast<MemSDNode>(Val&: LHS);
18491 SDValue ConstSt = DAG.getMemIntrinsicNode(
18492 Opcode: PPCISD::STORE_COND, dl,
18493 VTList: DAG.getVTList(VT1: MVT::i32, VT2: MVT::Other, VT3: MVT::Glue), Ops,
18494 MemVT: MemNode->getMemoryVT(), MMO: MemNode->getMemOperand());
18495
18496 SDValue InChain;
18497 // Unchain the branch from the original store conditional.
18498 if (N->getOperand(Num: 0) == LHS.getValue(R: 1))
18499 InChain = LHS.getOperand(i: 0);
18500 else if (N->getOperand(Num: 0).getOpcode() == ISD::TokenFactor) {
18501 SmallVector<SDValue, 4> InChains;
18502 SDValue InTF = N->getOperand(Num: 0);
18503 for (int i = 0, e = InTF.getNumOperands(); i < e; i++)
18504 if (InTF.getOperand(i) != LHS.getValue(R: 1))
18505 InChains.push_back(Elt: InTF.getOperand(i));
18506 InChain = DAG.getNode(Opcode: ISD::TokenFactor, DL: dl, VT: MVT::Other, Ops: InChains);
18507 }
18508
18509 return DAG.getNode(Opcode: PPCISD::COND_BRANCH, DL: dl, VT: MVT::Other, N1: InChain,
18510 N2: DAG.getConstant(Val: CompOpc, DL: dl, VT: MVT::i32),
18511 N3: DAG.getRegister(Reg: PPC::CR0, VT: MVT::i32), N4: N->getOperand(Num: 4),
18512 N5: ConstSt.getValue(R: 2));
18513 }
18514
18515 if (LHS.getOpcode() == ISD::INTRINSIC_WO_CHAIN &&
18516 getVectorCompareInfo(Intrin: LHS, CompareOpc, isDot, Subtarget)) {
18517 assert(isDot && "Can't compare against a vector result!");
18518
18519 if (SDValue Impossible = isImpossibleCompare())
18520 return Impossible;
18521
18522 bool BranchOnWhenPredTrue = (CC == ISD::SETEQ) ^ (Val == 0);
18523 // Create the PPCISD altivec 'dot' comparison node.
18524 SDValue Ops[] = {
18525 LHS.getOperand(i: 2), // LHS of compare
18526 LHS.getOperand(i: 3), // RHS of compare
18527 DAG.getConstant(Val: CompareOpc, DL: dl, VT: MVT::i32)
18528 };
18529 EVT VTs[] = { LHS.getOperand(i: 2).getValueType(), MVT::Glue };
18530 SDValue CompNode = DAG.getNode(Opcode: PPCISD::VCMP_rec, DL: dl, ResultTys: VTs, Ops);
18531
18532 // Unpack the result based on how the target uses it.
18533 PPC::Predicate CompOpc;
18534 switch (LHS.getConstantOperandVal(i: 1)) {
18535 default: // Can't happen, don't crash on invalid number though.
18536 case 0: // Branch on the value of the EQ bit of CR6.
18537 CompOpc = BranchOnWhenPredTrue ? PPC::PRED_EQ : PPC::PRED_NE;
18538 break;
18539 case 1: // Branch on the inverted value of the EQ bit of CR6.
18540 CompOpc = BranchOnWhenPredTrue ? PPC::PRED_NE : PPC::PRED_EQ;
18541 break;
18542 case 2: // Branch on the value of the LT bit of CR6.
18543 CompOpc = BranchOnWhenPredTrue ? PPC::PRED_LT : PPC::PRED_GE;
18544 break;
18545 case 3: // Branch on the inverted value of the LT bit of CR6.
18546 CompOpc = BranchOnWhenPredTrue ? PPC::PRED_GE : PPC::PRED_LT;
18547 break;
18548 }
18549
18550 return DAG.getNode(Opcode: PPCISD::COND_BRANCH, DL: dl, VT: MVT::Other, N1: N->getOperand(Num: 0),
18551 N2: DAG.getConstant(Val: CompOpc, DL: dl, VT: MVT::i32),
18552 N3: DAG.getRegister(Reg: PPC::CR6, VT: MVT::i32),
18553 N4: N->getOperand(Num: 4), N5: CompNode.getValue(R: 1));
18554 }
18555 break;
18556 }
18557 case ISD::BUILD_VECTOR:
18558 return DAGCombineBuildVector(N, DCI);
18559 case PPCISD::ADDC:
18560 return DAGCombineAddc(N, DCI);
18561
18562 case ISD::BITCAST:
18563 return DAGCombineBitcast(N, DCI);
18564 }
18565
18566 return SDValue();
18567}
18568
18569SDValue
18570PPCTargetLowering::BuildSDIVPow2(SDNode *N, const APInt &Divisor,
18571 SelectionDAG &DAG,
18572 SmallVectorImpl<SDNode *> &Created) const {
18573 // fold (sdiv X, pow2)
18574 EVT VT = N->getValueType(ResNo: 0);
18575 if (VT == MVT::i64 && !Subtarget.isPPC64())
18576 return SDValue();
18577 if ((VT != MVT::i32 && VT != MVT::i64) ||
18578 !(Divisor.isPowerOf2() || Divisor.isNegatedPowerOf2()))
18579 return SDValue();
18580
18581 SDLoc DL(N);
18582 SDValue N0 = N->getOperand(Num: 0);
18583
18584 bool IsNegPow2 = Divisor.isNegatedPowerOf2();
18585 unsigned Lg2 = (IsNegPow2 ? -Divisor : Divisor).countr_zero();
18586 SDValue ShiftAmt = DAG.getConstant(Val: Lg2, DL, VT);
18587
18588 SDValue Op = DAG.getNode(Opcode: PPCISD::SRA_ADDZE, DL, VT, N1: N0, N2: ShiftAmt);
18589 Created.push_back(Elt: Op.getNode());
18590
18591 if (IsNegPow2) {
18592 Op = DAG.getNode(Opcode: ISD::SUB, DL, VT, N1: DAG.getConstant(Val: 0, DL, VT), N2: Op);
18593 Created.push_back(Elt: Op.getNode());
18594 }
18595
18596 return Op;
18597}
18598
18599//===----------------------------------------------------------------------===//
18600// Inline Assembly Support
18601//===----------------------------------------------------------------------===//
18602
18603void PPCTargetLowering::computeKnownBitsForTargetNode(const SDValue Op,
18604 KnownBits &Known,
18605 const APInt &DemandedElts,
18606 const SelectionDAG &DAG,
18607 unsigned Depth) const {
18608 Known.resetAll();
18609 switch (Op.getOpcode()) {
18610 default: break;
18611 case PPCISD::LBRX: {
18612 // lhbrx is known to have the top bits cleared out.
18613 if (cast<VTSDNode>(Val: Op.getOperand(i: 2))->getVT() == MVT::i16)
18614 Known.Zero = 0xFFFF0000;
18615 break;
18616 }
18617 case PPCISD::ADDE: {
18618 if (Op.getResNo() == 0) {
18619 // (0|1), _ = ADDE 0, 0, CARRY
18620 SDValue LHS = Op.getOperand(i: 0);
18621 SDValue RHS = Op.getOperand(i: 1);
18622 if (isNullConstant(V: LHS) && isNullConstant(V: RHS))
18623 Known.Zero = ~1ULL;
18624 }
18625 break;
18626 }
18627 case ISD::INTRINSIC_WO_CHAIN: {
18628 switch (Op.getConstantOperandVal(i: 0)) {
18629 default: break;
18630 case Intrinsic::ppc_altivec_vcmpbfp_p:
18631 case Intrinsic::ppc_altivec_vcmpeqfp_p:
18632 case Intrinsic::ppc_altivec_vcmpequb_p:
18633 case Intrinsic::ppc_altivec_vcmpequh_p:
18634 case Intrinsic::ppc_altivec_vcmpequw_p:
18635 case Intrinsic::ppc_altivec_vcmpequd_p:
18636 case Intrinsic::ppc_altivec_vcmpequq_p:
18637 case Intrinsic::ppc_altivec_vcmpgefp_p:
18638 case Intrinsic::ppc_altivec_vcmpgtfp_p:
18639 case Intrinsic::ppc_altivec_vcmpgtsb_p:
18640 case Intrinsic::ppc_altivec_vcmpgtsh_p:
18641 case Intrinsic::ppc_altivec_vcmpgtsw_p:
18642 case Intrinsic::ppc_altivec_vcmpgtsd_p:
18643 case Intrinsic::ppc_altivec_vcmpgtsq_p:
18644 case Intrinsic::ppc_altivec_vcmpgtub_p:
18645 case Intrinsic::ppc_altivec_vcmpgtuh_p:
18646 case Intrinsic::ppc_altivec_vcmpgtuw_p:
18647 case Intrinsic::ppc_altivec_vcmpgtud_p:
18648 case Intrinsic::ppc_altivec_vcmpgtuq_p:
18649 Known.Zero = ~1U; // All bits but the low one are known to be zero.
18650 break;
18651 }
18652 break;
18653 }
18654 case ISD::INTRINSIC_W_CHAIN: {
18655 switch (Op.getConstantOperandVal(i: 1)) {
18656 default:
18657 break;
18658 case Intrinsic::ppc_load2r:
18659 // Top bits are cleared for load2r (which is the same as lhbrx).
18660 Known.Zero = 0xFFFF0000;
18661 break;
18662 }
18663 break;
18664 }
18665 }
18666}
18667
18668Align PPCTargetLowering::getPrefLoopAlignment(MachineLoop *ML) const {
18669 switch (Subtarget.getCPUDirective()) {
18670 default: break;
18671 case PPC::DIR_970:
18672 case PPC::DIR_PWR4:
18673 case PPC::DIR_PWR5:
18674 case PPC::DIR_PWR5X:
18675 case PPC::DIR_PWR6:
18676 case PPC::DIR_PWR6X:
18677 case PPC::DIR_PWR7:
18678 case PPC::DIR_PWR8:
18679 case PPC::DIR_PWR9:
18680 case PPC::DIR_PWR10:
18681 case PPC::DIR_PWR11:
18682 case PPC::DIR_PWR_FUTURE: {
18683 if (!ML)
18684 break;
18685
18686 if (!DisableInnermostLoopAlign32) {
18687 // If the nested loop is an innermost loop, prefer to a 32-byte alignment,
18688 // so that we can decrease cache misses and branch-prediction misses.
18689 // Actual alignment of the loop will depend on the hotness check and other
18690 // logic in alignBlocks.
18691 if (ML->getLoopDepth() > 1 && ML->getSubLoops().empty())
18692 return Align(32);
18693 }
18694
18695 const PPCInstrInfo *TII = Subtarget.getInstrInfo();
18696
18697 // For small loops (between 5 and 8 instructions), align to a 32-byte
18698 // boundary so that the entire loop fits in one instruction-cache line.
18699 uint64_t LoopSize = 0;
18700 for (auto I = ML->block_begin(), IE = ML->block_end(); I != IE; ++I)
18701 for (const MachineInstr &J : **I) {
18702 LoopSize += TII->getInstSizeInBytes(MI: J);
18703 if (LoopSize > 32)
18704 break;
18705 }
18706
18707 if (LoopSize > 16 && LoopSize <= 32)
18708 return Align(32);
18709
18710 break;
18711 }
18712 }
18713
18714 return TargetLowering::getPrefLoopAlignment(ML);
18715}
18716
18717/// getConstraintType - Given a constraint, return the type of
18718/// constraint it is for this target.
18719PPCTargetLowering::ConstraintType
18720PPCTargetLowering::getConstraintType(StringRef Constraint) const {
18721 if (Constraint.size() == 1) {
18722 switch (Constraint[0]) {
18723 default: break;
18724 case 'b':
18725 case 'r':
18726 case 'f':
18727 case 'd':
18728 case 'v':
18729 case 'y':
18730 return C_RegisterClass;
18731 case 'Z':
18732 // FIXME: While Z does indicate a memory constraint, it specifically
18733 // indicates an r+r address (used in conjunction with the 'y' modifier
18734 // in the replacement string). Currently, we're forcing the base
18735 // register to be r0 in the asm printer (which is interpreted as zero)
18736 // and forming the complete address in the second register. This is
18737 // suboptimal.
18738 return C_Memory;
18739 }
18740 } else if (Constraint == "wc") { // individual CR bits.
18741 return C_RegisterClass;
18742 } else if (Constraint == "wa" || Constraint == "wd" ||
18743 Constraint == "wf" || Constraint == "ws" ||
18744 Constraint == "wi" || Constraint == "ww") {
18745 return C_RegisterClass; // VSX registers.
18746 }
18747 return TargetLowering::getConstraintType(Constraint);
18748}
18749
18750/// Examine constraint type and operand type and determine a weight value.
18751/// This object must already have been set up with the operand type
18752/// and the current alternative constraint selected.
18753TargetLowering::ConstraintWeight
18754PPCTargetLowering::getSingleConstraintMatchWeight(
18755 AsmOperandInfo &info, const char *constraint) const {
18756 ConstraintWeight weight = CW_Invalid;
18757 Value *CallOperandVal = info.CallOperandVal;
18758 // If we don't have a value, we can't do a match,
18759 // but allow it at the lowest weight.
18760 if (!CallOperandVal)
18761 return CW_Default;
18762 Type *type = CallOperandVal->getType();
18763
18764 // Look at the constraint type.
18765 if (StringRef(constraint) == "wc" && type->isIntegerTy(BitWidth: 1))
18766 return CW_Register; // an individual CR bit.
18767 else if ((StringRef(constraint) == "wa" ||
18768 StringRef(constraint) == "wd" ||
18769 StringRef(constraint) == "wf") &&
18770 type->isVectorTy())
18771 return CW_Register;
18772 else if (StringRef(constraint) == "wi" && type->isIntegerTy(BitWidth: 64))
18773 return CW_Register; // just hold 64-bit integers data.
18774 else if (StringRef(constraint) == "ws" && type->isDoubleTy())
18775 return CW_Register;
18776 else if (StringRef(constraint) == "ww" && type->isFloatTy())
18777 return CW_Register;
18778
18779 switch (*constraint) {
18780 default:
18781 weight = TargetLowering::getSingleConstraintMatchWeight(info, constraint);
18782 break;
18783 case 'b':
18784 if (type->isIntegerTy())
18785 weight = CW_Register;
18786 break;
18787 case 'f':
18788 if (type->isFloatTy())
18789 weight = CW_Register;
18790 break;
18791 case 'd':
18792 if (type->isDoubleTy())
18793 weight = CW_Register;
18794 break;
18795 case 'v':
18796 if (type->isVectorTy())
18797 weight = CW_Register;
18798 break;
18799 case 'y':
18800 weight = CW_Register;
18801 break;
18802 case 'Z':
18803 weight = CW_Memory;
18804 break;
18805 }
18806 return weight;
18807}
18808
18809std::pair<unsigned, const TargetRegisterClass *>
18810PPCTargetLowering::getRegForInlineAsmConstraint(const TargetRegisterInfo *TRI,
18811 StringRef Constraint,
18812 MVT VT) const {
18813 if (Constraint.size() == 1) {
18814 // GCC RS6000 Constraint Letters
18815 switch (Constraint[0]) {
18816 case 'b': // R1-R31
18817 if (VT == MVT::i64 && Subtarget.isPPC64())
18818 return std::make_pair(x: 0U, y: &PPC::G8RC_NOX0RegClass);
18819 return std::make_pair(x: 0U, y: &PPC::GPRC_NOR0RegClass);
18820 case 'r': // R0-R31
18821 if (VT == MVT::i64 && Subtarget.isPPC64())
18822 return std::make_pair(x: 0U, y: &PPC::G8RCRegClass);
18823 return std::make_pair(x: 0U, y: &PPC::GPRCRegClass);
18824 // 'd' and 'f' constraints are both defined to be "the floating point
18825 // registers", where one is for 32-bit and the other for 64-bit. We don't
18826 // really care overly much here so just give them all the same reg classes.
18827 case 'd':
18828 case 'f':
18829 if (Subtarget.hasSPE()) {
18830 if (VT == MVT::f32 || VT == MVT::i32)
18831 return std::make_pair(x: 0U, y: &PPC::GPRCRegClass);
18832 if (VT == MVT::f64 || VT == MVT::i64)
18833 return std::make_pair(x: 0U, y: &PPC::SPERCRegClass);
18834 } else {
18835 if (VT == MVT::f32 || VT == MVT::i32)
18836 return std::make_pair(x: 0U, y: &PPC::F4RCRegClass);
18837 if (VT == MVT::f64 || VT == MVT::i64)
18838 return std::make_pair(x: 0U, y: &PPC::F8RCRegClass);
18839 }
18840 break;
18841 case 'v':
18842 if (Subtarget.hasAltivec() && VT.isVector())
18843 return std::make_pair(x: 0U, y: &PPC::VRRCRegClass);
18844 else if (Subtarget.hasVSX())
18845 // Scalars in Altivec registers only make sense with VSX.
18846 return std::make_pair(x: 0U, y: &PPC::VFRCRegClass);
18847 break;
18848 case 'y': // crrc
18849 return std::make_pair(x: 0U, y: &PPC::CRRCRegClass);
18850 }
18851 } else if (Constraint == "wc" && Subtarget.useCRBits()) {
18852 // An individual CR bit.
18853 return std::make_pair(x: 0U, y: &PPC::CRBITRCRegClass);
18854 } else if ((Constraint == "wa" || Constraint == "wd" ||
18855 Constraint == "wf" || Constraint == "wi") &&
18856 Subtarget.hasVSX()) {
18857 // A VSX register for either a scalar (FP) or vector. There is no
18858 // support for single precision scalars on subtargets prior to Power8.
18859 if (VT.isVector())
18860 return std::make_pair(x: 0U, y: &PPC::VSRCRegClass);
18861 if (VT == MVT::f32 && Subtarget.hasP8Vector())
18862 return std::make_pair(x: 0U, y: &PPC::VSSRCRegClass);
18863 return std::make_pair(x: 0U, y: &PPC::VSFRCRegClass);
18864 } else if ((Constraint == "ws" || Constraint == "ww") && Subtarget.hasVSX()) {
18865 if (VT == MVT::f32 && Subtarget.hasP8Vector())
18866 return std::make_pair(x: 0U, y: &PPC::VSSRCRegClass);
18867 else
18868 return std::make_pair(x: 0U, y: &PPC::VSFRCRegClass);
18869 } else if (Constraint == "lr") {
18870 if (VT == MVT::i64)
18871 return std::make_pair(x: 0U, y: &PPC::LR8RCRegClass);
18872 else
18873 return std::make_pair(x: 0U, y: &PPC::LRRCRegClass);
18874 }
18875
18876 // Handle special cases of physical registers that are not properly handled
18877 // by the base class.
18878 if (Constraint[0] == '{' && Constraint[Constraint.size() - 1] == '}') {
18879 // If we name a VSX register, we can't defer to the base class because it
18880 // will not recognize the correct register (their names will be VSL{0-31}
18881 // and V{0-31} so they won't match). So we match them here.
18882 if (Constraint.size() > 3 && Constraint[1] == 'v' && Constraint[2] == 's') {
18883 int VSNum = atoi(nptr: Constraint.data() + 3);
18884 assert(VSNum >= 0 && VSNum <= 63 &&
18885 "Attempted to access a vsr out of range");
18886 if (VSNum < 32)
18887 return std::make_pair(x: PPC::VSL0 + VSNum, y: &PPC::VSRCRegClass);
18888 return std::make_pair(x: PPC::V0 + VSNum - 32, y: &PPC::VSRCRegClass);
18889 }
18890
18891 // For float registers, we can't defer to the base class as it will match
18892 // the SPILLTOVSRRC class.
18893 if (Constraint.size() > 3 && Constraint[1] == 'f') {
18894 int RegNum = atoi(nptr: Constraint.data() + 2);
18895 if (RegNum > 31 || RegNum < 0)
18896 report_fatal_error(reason: "Invalid floating point register number");
18897 if (VT == MVT::f32 || VT == MVT::i32)
18898 return Subtarget.hasSPE()
18899 ? std::make_pair(x: PPC::R0 + RegNum, y: &PPC::GPRCRegClass)
18900 : std::make_pair(x: PPC::F0 + RegNum, y: &PPC::F4RCRegClass);
18901 if (VT == MVT::f64 || VT == MVT::i64)
18902 return Subtarget.hasSPE()
18903 ? std::make_pair(x: PPC::S0 + RegNum, y: &PPC::SPERCRegClass)
18904 : std::make_pair(x: PPC::F0 + RegNum, y: &PPC::F8RCRegClass);
18905 }
18906 }
18907
18908 std::pair<unsigned, const TargetRegisterClass *> R =
18909 TargetLowering::getRegForInlineAsmConstraint(TRI, Constraint, VT);
18910
18911 // r[0-9]+ are used, on PPC64, to refer to the corresponding 64-bit registers
18912 // (which we call X[0-9]+). If a 64-bit value has been requested, and a
18913 // 32-bit GPR has been selected, then 'upgrade' it to the 64-bit parent
18914 // register.
18915 // FIXME: If TargetLowering::getRegForInlineAsmConstraint could somehow use
18916 // the AsmName field from *RegisterInfo.td, then this would not be necessary.
18917 if (R.first && VT == MVT::i64 && Subtarget.isPPC64() &&
18918 PPC::GPRCRegClass.contains(Reg: R.first))
18919 return std::make_pair(x: TRI->getMatchingSuperReg(Reg: R.first,
18920 SubIdx: PPC::sub_32, RC: &PPC::G8RCRegClass),
18921 y: &PPC::G8RCRegClass);
18922
18923 // GCC accepts 'cc' as an alias for 'cr0', and we need to do the same.
18924 if (!R.second && StringRef("{cc}").equals_insensitive(RHS: Constraint)) {
18925 R.first = PPC::CR0;
18926 R.second = &PPC::CRRCRegClass;
18927 }
18928 // FIXME: This warning should ideally be emitted in the front end.
18929 const auto &TM = getTargetMachine();
18930 if (Subtarget.isAIXABI() && !TM.getAIXExtendedAltivecABI()) {
18931 if (((R.first >= PPC::V20 && R.first <= PPC::V31) ||
18932 (R.first >= PPC::VF20 && R.first <= PPC::VF31)) &&
18933 (R.second == &PPC::VSRCRegClass || R.second == &PPC::VSFRCRegClass))
18934 errs() << "warning: vector registers 20 to 32 are reserved in the "
18935 "default AIX AltiVec ABI and cannot be used\n";
18936 }
18937
18938 return R;
18939}
18940
18941/// LowerAsmOperandForConstraint - Lower the specified operand into the Ops
18942/// vector. If it is invalid, don't add anything to Ops.
18943void PPCTargetLowering::LowerAsmOperandForConstraint(SDValue Op,
18944 StringRef Constraint,
18945 std::vector<SDValue> &Ops,
18946 SelectionDAG &DAG) const {
18947 SDValue Result;
18948
18949 // Only support length 1 constraints.
18950 if (Constraint.size() > 1)
18951 return;
18952
18953 char Letter = Constraint[0];
18954 switch (Letter) {
18955 default: break;
18956 case 'I':
18957 case 'J':
18958 case 'K':
18959 case 'L':
18960 case 'M':
18961 case 'N':
18962 case 'O':
18963 case 'P': {
18964 ConstantSDNode *CST = dyn_cast<ConstantSDNode>(Val&: Op);
18965 if (!CST) return; // Must be an immediate to match.
18966 SDLoc dl(Op);
18967 int64_t Value = CST->getSExtValue();
18968 EVT TCVT = MVT::i64; // All constants taken to be 64 bits so that negative
18969 // numbers are printed as such.
18970 switch (Letter) {
18971 default: llvm_unreachable("Unknown constraint letter!");
18972 case 'I': // "I" is a signed 16-bit constant.
18973 if (isInt<16>(x: Value))
18974 Result = DAG.getTargetConstant(Val: Value, DL: dl, VT: TCVT);
18975 break;
18976 case 'J': // "J" is a constant with only the high-order 16 bits nonzero.
18977 if (isShiftedUInt<16, 16>(x: Value))
18978 Result = DAG.getTargetConstant(Val: Value, DL: dl, VT: TCVT);
18979 break;
18980 case 'L': // "L" is a signed 16-bit constant shifted left 16 bits.
18981 if (isShiftedInt<16, 16>(x: Value))
18982 Result = DAG.getTargetConstant(Val: Value, DL: dl, VT: TCVT);
18983 break;
18984 case 'K': // "K" is a constant with only the low-order 16 bits nonzero.
18985 if (isUInt<16>(x: Value))
18986 Result = DAG.getTargetConstant(Val: Value, DL: dl, VT: TCVT);
18987 break;
18988 case 'M': // "M" is a constant that is greater than 31.
18989 if (Value > 31)
18990 Result = DAG.getTargetConstant(Val: Value, DL: dl, VT: TCVT);
18991 break;
18992 case 'N': // "N" is a positive constant that is an exact power of two.
18993 if (Value > 0 && isPowerOf2_64(Value))
18994 Result = DAG.getTargetConstant(Val: Value, DL: dl, VT: TCVT);
18995 break;
18996 case 'O': // "O" is the constant zero.
18997 if (Value == 0)
18998 Result = DAG.getTargetConstant(Val: Value, DL: dl, VT: TCVT);
18999 break;
19000 case 'P': // "P" is a constant whose negation is a signed 16-bit constant.
19001 if (isInt<16>(x: -Value))
19002 Result = DAG.getTargetConstant(Val: Value, DL: dl, VT: TCVT);
19003 break;
19004 }
19005 break;
19006 }
19007 }
19008
19009 if (Result.getNode()) {
19010 Ops.push_back(x: Result);
19011 return;
19012 }
19013
19014 // Handle standard constraint letters.
19015 TargetLowering::LowerAsmOperandForConstraint(Op, Constraint, Ops, DAG);
19016}
19017
19018void PPCTargetLowering::CollectTargetIntrinsicOperands(const CallInst &I,
19019 SmallVectorImpl<SDValue> &Ops,
19020 SelectionDAG &DAG) const {
19021 if (I.getNumOperands() <= 1)
19022 return;
19023 if (!isa<ConstantSDNode>(Val: Ops[1].getNode()))
19024 return;
19025 auto IntrinsicID = Ops[1].getNode()->getAsZExtVal();
19026 if (IntrinsicID != Intrinsic::ppc_tdw && IntrinsicID != Intrinsic::ppc_tw &&
19027 IntrinsicID != Intrinsic::ppc_trapd && IntrinsicID != Intrinsic::ppc_trap)
19028 return;
19029
19030 if (MDNode *MDN = I.getMetadata(KindID: LLVMContext::MD_annotation))
19031 Ops.push_back(Elt: DAG.getMDNode(MD: MDN));
19032}
19033
19034// isLegalAddressingMode - Return true if the addressing mode represented
19035// by AM is legal for this target, for a load/store of the specified type.
19036bool PPCTargetLowering::isLegalAddressingMode(const DataLayout &DL,
19037 const AddrMode &AM, Type *Ty,
19038 unsigned AS,
19039 Instruction *I) const {
19040 // Vector type r+i form is supported since power9 as DQ form. We don't check
19041 // the offset matching DQ form requirement(off % 16 == 0), because on PowerPC,
19042 // imm form is preferred and the offset can be adjusted to use imm form later
19043 // in pass PPCLoopInstrFormPrep. Also in LSR, for one LSRUse, it uses min and
19044 // max offset to check legal addressing mode, we should be a little aggressive
19045 // to contain other offsets for that LSRUse.
19046 if (Ty->isVectorTy() && AM.BaseOffs != 0 && !Subtarget.hasP9Vector())
19047 return false;
19048
19049 // PPC allows a sign-extended 16-bit immediate field.
19050 if (AM.BaseOffs <= -(1LL << 16) || AM.BaseOffs >= (1LL << 16)-1)
19051 return false;
19052
19053 // No global is ever allowed as a base.
19054 if (AM.BaseGV)
19055 return false;
19056
19057 // PPC only support r+r,
19058 switch (AM.Scale) {
19059 case 0: // "r+i" or just "i", depending on HasBaseReg.
19060 break;
19061 case 1:
19062 if (AM.HasBaseReg && AM.BaseOffs) // "r+r+i" is not allowed.
19063 return false;
19064 // Otherwise we have r+r or r+i.
19065 break;
19066 case 2:
19067 if (AM.HasBaseReg || AM.BaseOffs) // 2*r+r or 2*r+i is not allowed.
19068 return false;
19069 // Allow 2*r as r+r.
19070 break;
19071 default:
19072 // No other scales are supported.
19073 return false;
19074 }
19075
19076 return true;
19077}
19078
19079SDValue PPCTargetLowering::LowerRETURNADDR(SDValue Op,
19080 SelectionDAG &DAG) const {
19081 MachineFunction &MF = DAG.getMachineFunction();
19082 MachineFrameInfo &MFI = MF.getFrameInfo();
19083 MFI.setReturnAddressIsTaken(true);
19084
19085 SDLoc dl(Op);
19086 unsigned Depth = Op.getConstantOperandVal(i: 0);
19087
19088 // Make sure the function does not optimize away the store of the RA to
19089 // the stack.
19090 PPCFunctionInfo *FuncInfo = MF.getInfo<PPCFunctionInfo>();
19091 FuncInfo->setLRStoreRequired();
19092 auto PtrVT = getPointerTy(DL: MF.getDataLayout());
19093
19094 if (Depth > 0) {
19095 // The link register (return address) is saved in the caller's frame
19096 // not the callee's stack frame. So we must get the caller's frame
19097 // address and load the return address at the LR offset from there.
19098 SDValue FrameAddr =
19099 DAG.getLoad(VT: Op.getValueType(), dl, Chain: DAG.getEntryNode(),
19100 Ptr: LowerFRAMEADDR(Op, DAG), PtrInfo: MachinePointerInfo());
19101 SDValue Offset =
19102 DAG.getConstant(Val: Subtarget.getFrameLowering()->getReturnSaveOffset(), DL: dl,
19103 VT: Subtarget.getScalarIntVT());
19104 return DAG.getLoad(VT: PtrVT, dl, Chain: DAG.getEntryNode(),
19105 Ptr: DAG.getNode(Opcode: ISD::ADD, DL: dl, VT: PtrVT, N1: FrameAddr, N2: Offset),
19106 PtrInfo: MachinePointerInfo());
19107 }
19108
19109 // Just load the return address off the stack.
19110 SDValue RetAddrFI = getReturnAddrFrameIndex(DAG);
19111 return DAG.getLoad(VT: PtrVT, dl, Chain: DAG.getEntryNode(), Ptr: RetAddrFI,
19112 PtrInfo: MachinePointerInfo());
19113}
19114
19115SDValue PPCTargetLowering::LowerFRAMEADDR(SDValue Op,
19116 SelectionDAG &DAG) const {
19117 SDLoc dl(Op);
19118 unsigned Depth = Op.getConstantOperandVal(i: 0);
19119
19120 MachineFunction &MF = DAG.getMachineFunction();
19121 MachineFrameInfo &MFI = MF.getFrameInfo();
19122 MFI.setFrameAddressIsTaken(true);
19123
19124 EVT PtrVT = getPointerTy(DL: MF.getDataLayout());
19125 bool isPPC64 = PtrVT == MVT::i64;
19126
19127 // Naked functions never have a frame pointer, and so we use r1. For all
19128 // other functions, this decision must be delayed until during PEI.
19129 unsigned FrameReg;
19130 if (MF.getFunction().hasFnAttribute(Kind: Attribute::Naked))
19131 FrameReg = isPPC64 ? PPC::X1 : PPC::R1;
19132 else
19133 FrameReg = isPPC64 ? PPC::FP8 : PPC::FP;
19134
19135 SDValue FrameAddr = DAG.getCopyFromReg(Chain: DAG.getEntryNode(), dl, Reg: FrameReg,
19136 VT: PtrVT);
19137 while (Depth--)
19138 FrameAddr = DAG.getLoad(VT: Op.getValueType(), dl, Chain: DAG.getEntryNode(),
19139 Ptr: FrameAddr, PtrInfo: MachinePointerInfo());
19140 return FrameAddr;
19141}
19142
19143#define GET_REGISTER_MATCHER
19144#include "PPCGenAsmMatcher.inc"
19145
19146Register PPCTargetLowering::getRegisterByName(const char *RegName, LLT VT,
19147 const MachineFunction &MF) const {
19148 bool IsPPC64 = Subtarget.isPPC64();
19149
19150 bool Is64Bit = IsPPC64 && VT == LLT::scalar(SizeInBits: 64);
19151 if (!Is64Bit && VT != LLT::scalar(SizeInBits: 32))
19152 report_fatal_error(reason: "Invalid register global variable type");
19153
19154 Register Reg = MatchRegisterName(Name: RegName);
19155 if (!Reg)
19156 return Reg;
19157
19158 // FIXME: Unable to generate code for `-O2` but okay for `-O0`.
19159 // Need followup investigation as to why.
19160 if ((IsPPC64 && Reg == PPC::R2) || Reg == PPC::R0)
19161 report_fatal_error(reason: Twine("Trying to reserve an invalid register \"" +
19162 StringRef(RegName) + "\"."));
19163
19164 // Convert GPR to GP8R register for 64bit.
19165 if (Is64Bit && StringRef(RegName).starts_with_insensitive(Prefix: "r"))
19166 Reg = Reg.id() - PPC::R0 + PPC::X0;
19167
19168 return Reg;
19169}
19170
19171bool PPCTargetLowering::isAccessedAsGotIndirect(SDValue GA) const {
19172 // 32-bit SVR4 ABI access everything as got-indirect.
19173 if (Subtarget.is32BitELFABI())
19174 return true;
19175
19176 // AIX accesses everything indirectly through the TOC, which is similar to
19177 // the GOT.
19178 if (Subtarget.isAIXABI())
19179 return true;
19180
19181 CodeModel::Model CModel = getTargetMachine().getCodeModel();
19182 // If it is small or large code model, module locals are accessed
19183 // indirectly by loading their address from .toc/.got.
19184 if (CModel == CodeModel::Small || CModel == CodeModel::Large)
19185 return true;
19186
19187 // JumpTable and BlockAddress are accessed as got-indirect.
19188 if (isa<JumpTableSDNode>(Val: GA) || isa<BlockAddressSDNode>(Val: GA))
19189 return true;
19190
19191 if (GlobalAddressSDNode *G = dyn_cast<GlobalAddressSDNode>(Val&: GA))
19192 return Subtarget.isGVIndirectSymbol(GV: G->getGlobal());
19193
19194 return false;
19195}
19196
19197bool
19198PPCTargetLowering::isOffsetFoldingLegal(const GlobalAddressSDNode *GA) const {
19199 // The PowerPC target isn't yet aware of offsets.
19200 return false;
19201}
19202
19203void PPCTargetLowering::getTgtMemIntrinsic(
19204 SmallVectorImpl<IntrinsicInfo> &Infos, const CallBase &I,
19205 MachineFunction &MF, unsigned Intrinsic) const {
19206 IntrinsicInfo Info;
19207 switch (Intrinsic) {
19208 case Intrinsic::ppc_atomicrmw_xchg_i128:
19209 case Intrinsic::ppc_atomicrmw_add_i128:
19210 case Intrinsic::ppc_atomicrmw_sub_i128:
19211 case Intrinsic::ppc_atomicrmw_nand_i128:
19212 case Intrinsic::ppc_atomicrmw_and_i128:
19213 case Intrinsic::ppc_atomicrmw_or_i128:
19214 case Intrinsic::ppc_atomicrmw_xor_i128:
19215 case Intrinsic::ppc_cmpxchg_i128:
19216 Info.opc = ISD::INTRINSIC_W_CHAIN;
19217 Info.memVT = MVT::i128;
19218 Info.ptrVal = I.getArgOperand(i: 0);
19219 Info.offset = 0;
19220 Info.align = Align(16);
19221 Info.flags = MachineMemOperand::MOLoad | MachineMemOperand::MOStore |
19222 MachineMemOperand::MOVolatile;
19223 Infos.push_back(Elt: Info);
19224 return;
19225 case Intrinsic::ppc_atomic_load_i128:
19226 Info.opc = ISD::INTRINSIC_W_CHAIN;
19227 Info.memVT = MVT::i128;
19228 Info.ptrVal = I.getArgOperand(i: 0);
19229 Info.offset = 0;
19230 Info.align = Align(16);
19231 Info.flags = MachineMemOperand::MOLoad | MachineMemOperand::MOVolatile;
19232 Infos.push_back(Elt: Info);
19233 return;
19234 case Intrinsic::ppc_atomic_store_i128:
19235 Info.opc = ISD::INTRINSIC_VOID;
19236 Info.memVT = MVT::i128;
19237 Info.ptrVal = I.getArgOperand(i: 2);
19238 Info.offset = 0;
19239 Info.align = Align(16);
19240 Info.flags = MachineMemOperand::MOStore | MachineMemOperand::MOVolatile;
19241 Infos.push_back(Elt: Info);
19242 return;
19243 case Intrinsic::ppc_altivec_lvx:
19244 case Intrinsic::ppc_altivec_lvxl:
19245 case Intrinsic::ppc_altivec_lvebx:
19246 case Intrinsic::ppc_altivec_lvehx:
19247 case Intrinsic::ppc_altivec_lvewx:
19248 case Intrinsic::ppc_vsx_lxvd2x:
19249 case Intrinsic::ppc_vsx_lxvw4x:
19250 case Intrinsic::ppc_vsx_lxvd2x_be:
19251 case Intrinsic::ppc_vsx_lxvw4x_be:
19252 case Intrinsic::ppc_vsx_lxvl:
19253 case Intrinsic::ppc_vsx_lxvll: {
19254 EVT VT;
19255 switch (Intrinsic) {
19256 case Intrinsic::ppc_altivec_lvebx:
19257 VT = MVT::i8;
19258 break;
19259 case Intrinsic::ppc_altivec_lvehx:
19260 VT = MVT::i16;
19261 break;
19262 case Intrinsic::ppc_altivec_lvewx:
19263 VT = MVT::i32;
19264 break;
19265 case Intrinsic::ppc_vsx_lxvd2x:
19266 case Intrinsic::ppc_vsx_lxvd2x_be:
19267 VT = MVT::v2f64;
19268 break;
19269 default:
19270 VT = MVT::v4i32;
19271 break;
19272 }
19273
19274 Info.opc = ISD::INTRINSIC_W_CHAIN;
19275 Info.memVT = VT;
19276 Info.ptrVal = I.getArgOperand(i: 0);
19277 Info.offset = -VT.getStoreSize()+1;
19278 Info.size = 2*VT.getStoreSize()-1;
19279 Info.align = Align(1);
19280 Info.flags = MachineMemOperand::MOLoad;
19281 Infos.push_back(Elt: Info);
19282 return;
19283 }
19284 case Intrinsic::ppc_altivec_stvx:
19285 case Intrinsic::ppc_altivec_stvxl:
19286 case Intrinsic::ppc_altivec_stvebx:
19287 case Intrinsic::ppc_altivec_stvehx:
19288 case Intrinsic::ppc_altivec_stvewx:
19289 case Intrinsic::ppc_vsx_stxvd2x:
19290 case Intrinsic::ppc_vsx_stxvw4x:
19291 case Intrinsic::ppc_vsx_stxvd2x_be:
19292 case Intrinsic::ppc_vsx_stxvw4x_be:
19293 case Intrinsic::ppc_vsx_stxvl:
19294 case Intrinsic::ppc_vsx_stxvll: {
19295 EVT VT;
19296 switch (Intrinsic) {
19297 case Intrinsic::ppc_altivec_stvebx:
19298 VT = MVT::i8;
19299 break;
19300 case Intrinsic::ppc_altivec_stvehx:
19301 VT = MVT::i16;
19302 break;
19303 case Intrinsic::ppc_altivec_stvewx:
19304 VT = MVT::i32;
19305 break;
19306 case Intrinsic::ppc_vsx_stxvd2x:
19307 case Intrinsic::ppc_vsx_stxvd2x_be:
19308 VT = MVT::v2f64;
19309 break;
19310 default:
19311 VT = MVT::v4i32;
19312 break;
19313 }
19314
19315 Info.opc = ISD::INTRINSIC_VOID;
19316 Info.memVT = VT;
19317 Info.ptrVal = I.getArgOperand(i: 1);
19318 Info.offset = -VT.getStoreSize()+1;
19319 Info.size = 2*VT.getStoreSize()-1;
19320 Info.align = Align(1);
19321 Info.flags = MachineMemOperand::MOStore;
19322 Infos.push_back(Elt: Info);
19323 return;
19324 }
19325 case Intrinsic::ppc_stdcx:
19326 case Intrinsic::ppc_stwcx:
19327 case Intrinsic::ppc_sthcx:
19328 case Intrinsic::ppc_stbcx: {
19329 EVT VT;
19330 auto Alignment = Align(8);
19331 switch (Intrinsic) {
19332 case Intrinsic::ppc_stdcx:
19333 VT = MVT::i64;
19334 break;
19335 case Intrinsic::ppc_stwcx:
19336 VT = MVT::i32;
19337 Alignment = Align(4);
19338 break;
19339 case Intrinsic::ppc_sthcx:
19340 VT = MVT::i16;
19341 Alignment = Align(2);
19342 break;
19343 case Intrinsic::ppc_stbcx:
19344 VT = MVT::i8;
19345 Alignment = Align(1);
19346 break;
19347 }
19348 Info.opc = ISD::INTRINSIC_W_CHAIN;
19349 Info.memVT = VT;
19350 Info.ptrVal = I.getArgOperand(i: 0);
19351 Info.offset = 0;
19352 Info.align = Alignment;
19353 Info.flags = MachineMemOperand::MOStore | MachineMemOperand::MOVolatile;
19354 Infos.push_back(Elt: Info);
19355 return;
19356 }
19357 default:
19358 break;
19359 }
19360}
19361
19362/// It returns EVT::Other if the type should be determined using generic
19363/// target-independent logic.
19364EVT PPCTargetLowering::getOptimalMemOpType(
19365 LLVMContext &Context, const MemOp &Op,
19366 const AttributeList &FuncAttributes) const {
19367 if (getTargetMachine().getOptLevel() != CodeGenOptLevel::None) {
19368 // We should use Altivec/VSX loads and stores when available. For unaligned
19369 // addresses, unaligned VSX loads are only fast starting with the P8.
19370 if (Subtarget.hasAltivec() && Op.size() >= 16) {
19371 if (Op.isMemset() && Subtarget.hasVSX()) {
19372 uint64_t TailSize = Op.size() % 16;
19373 // For memset lowering, EXTRACT_VECTOR_ELT tries to return constant
19374 // element if vector element type matches tail store. For tail size
19375 // 3/4, the tail store is i32, v4i32 cannot be used, need a legal one.
19376 if (TailSize > 2 && TailSize <= 4) {
19377 return MVT::v8i16;
19378 }
19379 return MVT::v4i32;
19380 }
19381 if (Op.isAligned(AlignCheck: Align(16)) || Subtarget.hasP8Vector())
19382 return MVT::v4i32;
19383 }
19384 }
19385
19386 if (Subtarget.isPPC64()) {
19387 return MVT::i64;
19388 }
19389
19390 return MVT::i32;
19391}
19392
19393/// Returns true if it is beneficial to convert a load of a constant
19394/// to just the constant itself.
19395bool PPCTargetLowering::shouldConvertConstantLoadToIntImm(const APInt &Imm,
19396 Type *Ty) const {
19397 assert(Ty->isIntegerTy());
19398
19399 unsigned BitSize = Ty->getPrimitiveSizeInBits();
19400 return !(BitSize == 0 || BitSize > 64);
19401}
19402
19403bool PPCTargetLowering::isTruncateFree(Type *Ty1, Type *Ty2) const {
19404 if (!Ty1->isIntegerTy() || !Ty2->isIntegerTy())
19405 return false;
19406 unsigned NumBits1 = Ty1->getPrimitiveSizeInBits();
19407 unsigned NumBits2 = Ty2->getPrimitiveSizeInBits();
19408 return NumBits1 == 64 && NumBits2 == 32;
19409}
19410
19411bool PPCTargetLowering::isTruncateFree(EVT VT1, EVT VT2) const {
19412 if (!VT1.isInteger() || !VT2.isInteger())
19413 return false;
19414 unsigned NumBits1 = VT1.getSizeInBits();
19415 unsigned NumBits2 = VT2.getSizeInBits();
19416 return NumBits1 == 64 && NumBits2 == 32;
19417}
19418
19419bool PPCTargetLowering::isZExtFree(SDValue Val, EVT VT2) const {
19420 // Generally speaking, zexts are not free, but they are free when they can be
19421 // folded with other operations.
19422 if (LoadSDNode *LD = dyn_cast<LoadSDNode>(Val)) {
19423 EVT MemVT = LD->getMemoryVT();
19424 if ((MemVT == MVT::i1 || MemVT == MVT::i8 || MemVT == MVT::i16 ||
19425 (Subtarget.isPPC64() && MemVT == MVT::i32)) &&
19426 (LD->getExtensionType() == ISD::NON_EXTLOAD ||
19427 LD->getExtensionType() == ISD::ZEXTLOAD))
19428 return true;
19429 }
19430
19431 // FIXME: Add other cases...
19432 // - 32-bit shifts with a zext to i64
19433 // - zext after ctlz, bswap, etc.
19434 // - zext after and by a constant mask
19435
19436 return TargetLowering::isZExtFree(Val, VT2);
19437}
19438
19439bool PPCTargetLowering::isFPExtFree(EVT DestVT, EVT SrcVT) const {
19440 assert(DestVT.isFloatingPoint() && SrcVT.isFloatingPoint() &&
19441 "invalid fpext types");
19442 // Extending to float128 is not free.
19443 if (DestVT == MVT::f128)
19444 return false;
19445 return true;
19446}
19447
19448bool PPCTargetLowering::isLegalICmpImmediate(int64_t Imm) const {
19449 return isInt<16>(x: Imm) || isUInt<16>(x: Imm);
19450}
19451
19452bool PPCTargetLowering::isLegalAddImmediate(int64_t Imm) const {
19453 return isInt<16>(x: Imm) || isUInt<16>(x: Imm);
19454}
19455
19456bool PPCTargetLowering::allowsMisalignedMemoryAccesses(EVT VT, unsigned, Align,
19457 MachineMemOperand::Flags,
19458 unsigned *Fast) const {
19459 if (DisablePPCUnaligned)
19460 return false;
19461
19462 // PowerPC supports unaligned memory access for simple non-vector types.
19463 // Although accessing unaligned addresses is not as efficient as accessing
19464 // aligned addresses, it is generally more efficient than manual expansion,
19465 // and generally only traps for software emulation when crossing page
19466 // boundaries.
19467
19468 if (!VT.isSimple())
19469 return false;
19470
19471 if (VT.isFloatingPoint() && !VT.isVector() &&
19472 !Subtarget.allowsUnalignedFPAccess())
19473 return false;
19474
19475 if (VT.getSimpleVT().isVector()) {
19476 if (Subtarget.hasVSX()) {
19477 if (VT != MVT::v2f64 && VT != MVT::v2i64 &&
19478 VT != MVT::v4f32 && VT != MVT::v4i32)
19479 return false;
19480 } else {
19481 return false;
19482 }
19483 }
19484
19485 if (VT == MVT::ppcf128)
19486 return false;
19487
19488 if (Fast)
19489 *Fast = 1;
19490
19491 return true;
19492}
19493
19494bool PPCTargetLowering::decomposeMulByConstant(LLVMContext &Context, EVT VT,
19495 SDValue C) const {
19496 // Check integral scalar types.
19497 if (!VT.isScalarInteger())
19498 return false;
19499 if (auto *ConstNode = dyn_cast<ConstantSDNode>(Val: C.getNode())) {
19500 if (!ConstNode->getAPIntValue().isSignedIntN(N: 64))
19501 return false;
19502 // This transformation will generate >= 2 operations. But the following
19503 // cases will generate <= 2 instructions during ISEL. So exclude them.
19504 // 1. If the constant multiplier fits 16 bits, it can be handled by one
19505 // HW instruction, ie. MULLI
19506 // 2. If the multiplier after shifted fits 16 bits, an extra shift
19507 // instruction is needed than case 1, ie. MULLI and RLDICR
19508 int64_t Imm = ConstNode->getSExtValue();
19509 unsigned Shift = llvm::countr_zero<uint64_t>(Val: Imm);
19510 Imm >>= Shift;
19511 if (isInt<16>(x: Imm))
19512 return false;
19513 uint64_t UImm = static_cast<uint64_t>(Imm);
19514 if (isPowerOf2_64(Value: UImm + 1) || isPowerOf2_64(Value: UImm - 1) ||
19515 isPowerOf2_64(Value: 1 - UImm) || isPowerOf2_64(Value: -1 - UImm))
19516 return true;
19517 }
19518 return false;
19519}
19520
19521bool PPCTargetLowering::isFMAFasterThanFMulAndFAdd(const MachineFunction &MF,
19522 EVT VT) const {
19523 return isFMAFasterThanFMulAndFAdd(
19524 F: MF.getFunction(), Ty: VT.getTypeForEVT(Context&: MF.getFunction().getContext()));
19525}
19526
19527bool PPCTargetLowering::isFMAFasterThanFMulAndFAdd(const Function &F,
19528 Type *Ty) const {
19529 if (Subtarget.hasSPE() || Subtarget.useSoftFloat())
19530 return false;
19531 switch (Ty->getScalarType()->getTypeID()) {
19532 case Type::FloatTyID:
19533 case Type::DoubleTyID:
19534 return true;
19535 case Type::FP128TyID:
19536 return Subtarget.hasP9Vector();
19537 default:
19538 return false;
19539 }
19540}
19541
19542// FIXME: add more patterns which are not profitable to hoist.
19543bool PPCTargetLowering::isProfitableToHoist(Instruction *I) const {
19544 if (!I->hasOneUse())
19545 return true;
19546
19547 Instruction *User = I->user_back();
19548 assert(User && "A single use instruction with no uses.");
19549
19550 switch (I->getOpcode()) {
19551 case Instruction::FMul: {
19552 // Don't break FMA, PowerPC prefers FMA.
19553 if (User->getOpcode() != Instruction::FSub &&
19554 User->getOpcode() != Instruction::FAdd)
19555 return true;
19556
19557 const TargetOptions &Options = getTargetMachine().Options;
19558 const Function *F = I->getFunction();
19559 const DataLayout &DL = F->getDataLayout();
19560 Type *Ty = User->getOperand(i: 0)->getType();
19561 bool AllowContract = I->getFastMathFlags().allowContract() &&
19562 User->getFastMathFlags().allowContract();
19563
19564 return !(isFMAFasterThanFMulAndFAdd(F: *F, Ty) &&
19565 isOperationLegalOrCustom(Op: ISD::FMA, VT: getValueType(DL, Ty)) &&
19566 (AllowContract || Options.AllowFPOpFusion == FPOpFusion::Fast));
19567 }
19568 case Instruction::Load: {
19569 // Don't break "store (load float*)" pattern, this pattern will be combined
19570 // to "store (load int32)" in later InstCombine pass. See function
19571 // combineLoadToOperationType. On PowerPC, loading a float point takes more
19572 // cycles than loading a 32 bit integer.
19573 LoadInst *LI = cast<LoadInst>(Val: I);
19574 // For the loads that combineLoadToOperationType does nothing, like
19575 // ordered load, it should be profitable to hoist them.
19576 // For swifterror load, it can only be used for pointer to pointer type, so
19577 // later type check should get rid of this case.
19578 if (!LI->isUnordered())
19579 return true;
19580
19581 if (User->getOpcode() != Instruction::Store)
19582 return true;
19583
19584 if (I->getType()->getTypeID() != Type::FloatTyID)
19585 return true;
19586
19587 return false;
19588 }
19589 default:
19590 return true;
19591 }
19592 return true;
19593}
19594
19595const MCPhysReg *
19596PPCTargetLowering::getScratchRegisters(CallingConv::ID) const {
19597 // LR is a callee-save register, but we must treat it as clobbered by any call
19598 // site. Hence we include LR in the scratch registers, which are in turn added
19599 // as implicit-defs for stackmaps and patchpoints. The same reasoning applies
19600 // to CTR, which is used by any indirect call.
19601 static const MCPhysReg ScratchRegs[] = {
19602 PPC::X12, PPC::LR8, PPC::CTR8, 0
19603 };
19604
19605 return ScratchRegs;
19606}
19607
19608Register PPCTargetLowering::getExceptionPointerRegister(
19609 const Constant *PersonalityFn) const {
19610 return Subtarget.isPPC64() ? PPC::X3 : PPC::R3;
19611}
19612
19613Register PPCTargetLowering::getExceptionSelectorRegister(
19614 const Constant *PersonalityFn) const {
19615 return Subtarget.isPPC64() ? PPC::X4 : PPC::R4;
19616}
19617
19618bool
19619PPCTargetLowering::shouldExpandBuildVectorWithShuffles(
19620 EVT VT , unsigned DefinedValues) const {
19621 if (VT == MVT::v2i64)
19622 return Subtarget.hasDirectMove(); // Don't need stack ops with direct moves
19623
19624 if (Subtarget.hasVSX())
19625 return true;
19626
19627 return TargetLowering::shouldExpandBuildVectorWithShuffles(VT, DefinedValues);
19628}
19629
19630Sched::Preference PPCTargetLowering::getSchedulingPreference(SDNode *N) const {
19631 if (DisableILPPref || Subtarget.enableMachineScheduler())
19632 return TargetLowering::getSchedulingPreference(N);
19633
19634 return Sched::ILP;
19635}
19636
19637// Create a fast isel object.
19638FastISel *PPCTargetLowering::createFastISel(
19639 FunctionLoweringInfo &FuncInfo, const TargetLibraryInfo *LibInfo,
19640 const LibcallLoweringInfo *LibcallLowering) const {
19641 return PPC::createFastISel(FuncInfo, LibInfo, LibcallLowering);
19642}
19643
19644// 'Inverted' means the FMA opcode after negating one multiplicand.
19645// For example, (fma -a b c) = (fnmsub a b c)
19646static unsigned invertFMAOpcode(unsigned Opc) {
19647 switch (Opc) {
19648 default:
19649 llvm_unreachable("Invalid FMA opcode for PowerPC!");
19650 case ISD::FMA:
19651 return PPCISD::FNMSUB;
19652 case PPCISD::FNMSUB:
19653 return ISD::FMA;
19654 }
19655}
19656
19657SDValue PPCTargetLowering::getNegatedExpression(SDValue Op, SelectionDAG &DAG,
19658 bool LegalOps, bool OptForSize,
19659 NegatibleCost &Cost,
19660 unsigned Depth) const {
19661 if (Depth > SelectionDAG::MaxRecursionDepth)
19662 return SDValue();
19663
19664 unsigned Opc = Op.getOpcode();
19665 EVT VT = Op.getValueType();
19666 SDNodeFlags Flags = Op.getNode()->getFlags();
19667
19668 switch (Opc) {
19669 case PPCISD::FNMSUB:
19670 if (!Op.hasOneUse() || !isTypeLegal(VT))
19671 break;
19672
19673 SDValue N0 = Op.getOperand(i: 0);
19674 SDValue N1 = Op.getOperand(i: 1);
19675 SDValue N2 = Op.getOperand(i: 2);
19676 SDLoc Loc(Op);
19677
19678 NegatibleCost N2Cost = NegatibleCost::Expensive;
19679 SDValue NegN2 =
19680 getNegatedExpression(Op: N2, DAG, LegalOps, OptForSize, Cost&: N2Cost, Depth: Depth + 1);
19681
19682 if (!NegN2)
19683 return SDValue();
19684
19685 // (fneg (fnmsub a b c)) => (fnmsub (fneg a) b (fneg c))
19686 // (fneg (fnmsub a b c)) => (fnmsub a (fneg b) (fneg c))
19687 // These transformations may change sign of zeroes. For example,
19688 // -(-ab-(-c))=-0 while -(-(ab-c))=+0 when a=b=c=1.
19689 if (Flags.hasNoSignedZeros()) {
19690 // Try and choose the cheaper one to negate.
19691 NegatibleCost N0Cost = NegatibleCost::Expensive;
19692 SDValue NegN0 = getNegatedExpression(Op: N0, DAG, LegalOps, OptForSize,
19693 Cost&: N0Cost, Depth: Depth + 1);
19694
19695 NegatibleCost N1Cost = NegatibleCost::Expensive;
19696 SDValue NegN1 = getNegatedExpression(Op: N1, DAG, LegalOps, OptForSize,
19697 Cost&: N1Cost, Depth: Depth + 1);
19698
19699 if (NegN0 && N0Cost <= N1Cost) {
19700 Cost = std::min(a: N0Cost, b: N2Cost);
19701 return DAG.getNode(Opcode: Opc, DL: Loc, VT, N1: NegN0, N2: N1, N3: NegN2, Flags);
19702 } else if (NegN1) {
19703 Cost = std::min(a: N1Cost, b: N2Cost);
19704 return DAG.getNode(Opcode: Opc, DL: Loc, VT, N1: N0, N2: NegN1, N3: NegN2, Flags);
19705 }
19706 }
19707
19708 // (fneg (fnmsub a b c)) => (fma a b (fneg c))
19709 if (isOperationLegal(Op: ISD::FMA, VT)) {
19710 Cost = N2Cost;
19711 return DAG.getNode(Opcode: ISD::FMA, DL: Loc, VT, N1: N0, N2: N1, N3: NegN2, Flags);
19712 }
19713
19714 break;
19715 }
19716
19717 return TargetLowering::getNegatedExpression(Op, DAG, LegalOps, OptForSize,
19718 Cost, Depth);
19719}
19720
19721// Override to enable LOAD_STACK_GUARD lowering on Linux.
19722bool PPCTargetLowering::useLoadStackGuardNode(const Module &M) const {
19723 if (M.getStackProtectorGuard() == "tls" || Subtarget.isTargetLinux())
19724 return true;
19725 return TargetLowering::useLoadStackGuardNode(M);
19726}
19727
19728bool PPCTargetLowering::isFPImmLegal(const APFloat &Imm, EVT VT,
19729 bool ForCodeSize) const {
19730 if (!VT.isSimple() || !Subtarget.hasVSX())
19731 return false;
19732
19733 switch(VT.getSimpleVT().SimpleTy) {
19734 default:
19735 // For FP types that are currently not supported by PPC backend, return
19736 // false. Examples: f16, f80.
19737 return false;
19738 case MVT::f32:
19739 case MVT::f64: {
19740 if (Subtarget.hasPrefixInstrs() && Subtarget.hasP10Vector()) {
19741 // we can materialize all immediatess via XXSPLTI32DX and XXSPLTIDP.
19742 return true;
19743 }
19744 bool IsExact;
19745 APSInt IntResult(16, false);
19746 // The rounding mode doesn't really matter because we only care about floats
19747 // that can be converted to integers exactly.
19748 Imm.convertToInteger(Result&: IntResult, RM: APFloat::rmTowardZero, IsExact: &IsExact);
19749 // For exact values in the range [-16, 15] we can materialize the float.
19750 if (IsExact && IntResult <= 15 && IntResult >= -16)
19751 return true;
19752 return Imm.isZero();
19753 }
19754 case MVT::ppcf128:
19755 return Imm.isPosZero();
19756 }
19757}
19758
19759// For vector shift operation op, fold
19760// (op x, (and y, ((1 << numbits(x)) - 1))) -> (target op x, y)
19761static SDValue stripModuloOnShift(const TargetLowering &TLI, SDNode *N,
19762 SelectionDAG &DAG) {
19763 SDValue N0 = N->getOperand(Num: 0);
19764 SDValue N1 = N->getOperand(Num: 1);
19765 EVT VT = N0.getValueType();
19766 unsigned OpSizeInBits = VT.getScalarSizeInBits();
19767 unsigned Opcode = N->getOpcode();
19768 unsigned TargetOpcode;
19769
19770 switch (Opcode) {
19771 default:
19772 llvm_unreachable("Unexpected shift operation");
19773 case ISD::SHL:
19774 TargetOpcode = PPCISD::SHL;
19775 break;
19776 case ISD::SRL:
19777 TargetOpcode = PPCISD::SRL;
19778 break;
19779 case ISD::SRA:
19780 TargetOpcode = PPCISD::SRA;
19781 break;
19782 }
19783
19784 if (VT.isVector() && TLI.isOperationLegal(Op: Opcode, VT) &&
19785 N1->getOpcode() == ISD::AND)
19786 if (ConstantSDNode *Mask = isConstOrConstSplat(N: N1->getOperand(Num: 1)))
19787 if (Mask->getZExtValue() == OpSizeInBits - 1)
19788 return DAG.getNode(Opcode: TargetOpcode, DL: SDLoc(N), VT, N1: N0, N2: N1->getOperand(Num: 0));
19789
19790 return SDValue();
19791}
19792
19793SDValue PPCTargetLowering::combineVectorShift(SDNode *N,
19794 DAGCombinerInfo &DCI) const {
19795 EVT VT = N->getValueType(ResNo: 0);
19796 assert(VT.isVector() && "Vector type expected.");
19797
19798 unsigned Opc = N->getOpcode();
19799 assert((Opc == ISD::SHL || Opc == ISD::SRL || Opc == ISD::SRA) &&
19800 "Unexpected opcode.");
19801
19802 if (!isOperationLegal(Op: Opc, VT))
19803 return SDValue();
19804
19805 EVT EltTy = VT.getScalarType();
19806 unsigned EltBits = EltTy.getSizeInBits();
19807 if (EltTy != MVT::i64 && EltTy != MVT::i32)
19808 return SDValue();
19809
19810 SDValue N1 = N->getOperand(Num: 1);
19811 uint64_t SplatBits = 0;
19812 bool AddSplatCase = false;
19813 unsigned OpcN1 = N1.getOpcode();
19814 if (OpcN1 == PPCISD::VADD_SPLAT &&
19815 N1.getConstantOperandVal(i: 1) == VT.getVectorNumElements()) {
19816 AddSplatCase = true;
19817 SplatBits = N1.getConstantOperandVal(i: 0);
19818 }
19819
19820 if (!AddSplatCase) {
19821 if (OpcN1 != ISD::BUILD_VECTOR)
19822 return SDValue();
19823
19824 unsigned SplatBitSize;
19825 bool HasAnyUndefs;
19826 APInt APSplatBits, APSplatUndef;
19827 BuildVectorSDNode *BVN = cast<BuildVectorSDNode>(Val&: N1);
19828 bool BVNIsConstantSplat =
19829 BVN->isConstantSplat(SplatValue&: APSplatBits, SplatUndef&: APSplatUndef, SplatBitSize,
19830 HasAnyUndefs, MinSplatBits: 0, isBigEndian: !Subtarget.isLittleEndian());
19831 if (!BVNIsConstantSplat || SplatBitSize != EltBits)
19832 return SDValue();
19833 SplatBits = APSplatBits.getZExtValue();
19834 }
19835
19836 SDLoc DL(N);
19837 SDValue N0 = N->getOperand(Num: 0);
19838 // PPC vector shifts by word/double look at only the low 5/6 bits of the
19839 // shift vector, which means the max value is 31/63. A shift vector of all
19840 // 1s will be truncated to 31/63, which is useful as vspltiw is limited to
19841 // -16 to 15 range.
19842 if (SplatBits == (EltBits - 1)) {
19843 unsigned NewOpc;
19844 switch (Opc) {
19845 case ISD::SHL:
19846 NewOpc = PPCISD::SHL;
19847 break;
19848 case ISD::SRL:
19849 NewOpc = PPCISD::SRL;
19850 break;
19851 case ISD::SRA:
19852 NewOpc = PPCISD::SRA;
19853 break;
19854 }
19855 SDValue SplatOnes = getCanonicalConstSplat(Val: 255, SplatSize: 1, VT, DAG&: DCI.DAG, dl: DL);
19856 return DCI.DAG.getNode(Opcode: NewOpc, DL, VT, N1: N0, N2: SplatOnes);
19857 }
19858
19859 if (Opc != ISD::SHL || !isOperationLegal(Op: ISD::ADD, VT))
19860 return SDValue();
19861
19862 // For 64-bit there is no splat immediate so we want to catch shift by 1 here
19863 // before the BUILD_VECTOR is replaced by a load.
19864 if (EltTy != MVT::i64 || SplatBits != 1)
19865 return SDValue();
19866
19867 return DCI.DAG.getNode(Opcode: ISD::ADD, DL: SDLoc(N), VT, N1: N0, N2: N0);
19868}
19869
19870SDValue PPCTargetLowering::combineSHL(SDNode *N, DAGCombinerInfo &DCI) const {
19871 if (auto Value = stripModuloOnShift(TLI: *this, N, DAG&: DCI.DAG))
19872 return Value;
19873
19874 if (N->getValueType(ResNo: 0).isVector())
19875 return combineVectorShift(N, DCI);
19876
19877 SDValue N0 = N->getOperand(Num: 0);
19878 ConstantSDNode *CN1 = dyn_cast<ConstantSDNode>(Val: N->getOperand(Num: 1));
19879 if (!Subtarget.isISA3_0() || !Subtarget.isPPC64() ||
19880 N0.getOpcode() != ISD::SIGN_EXTEND ||
19881 N0.getOperand(i: 0).getValueType() != MVT::i32 || CN1 == nullptr ||
19882 N->getValueType(ResNo: 0) != MVT::i64)
19883 return SDValue();
19884
19885 // We can't save an operation here if the value is already extended, and
19886 // the existing shift is easier to combine.
19887 SDValue ExtsSrc = N0.getOperand(i: 0);
19888 if (ExtsSrc.getOpcode() == ISD::TRUNCATE &&
19889 ExtsSrc.getOperand(i: 0).getOpcode() == ISD::AssertSext)
19890 return SDValue();
19891
19892 SDLoc DL(N0);
19893 SDValue ShiftBy = SDValue(CN1, 0);
19894 // We want the shift amount to be i32 on the extswli, but the shift could
19895 // have an i64.
19896 if (ShiftBy.getValueType() == MVT::i64)
19897 ShiftBy = DCI.DAG.getConstant(Val: CN1->getZExtValue(), DL, VT: MVT::i32);
19898
19899 return DCI.DAG.getNode(Opcode: PPCISD::EXTSWSLI, DL, VT: MVT::i64, N1: N0->getOperand(Num: 0),
19900 N2: ShiftBy);
19901}
19902
19903SDValue PPCTargetLowering::combineSRA(SDNode *N, DAGCombinerInfo &DCI) const {
19904 if (auto Value = stripModuloOnShift(TLI: *this, N, DAG&: DCI.DAG))
19905 return Value;
19906
19907 if (N->getValueType(ResNo: 0).isVector())
19908 return combineVectorShift(N, DCI);
19909
19910 return SDValue();
19911}
19912
19913SDValue PPCTargetLowering::combineSRL(SDNode *N, DAGCombinerInfo &DCI) const {
19914 if (auto Value = stripModuloOnShift(TLI: *this, N, DAG&: DCI.DAG))
19915 return Value;
19916
19917 if (N->getValueType(ResNo: 0).isVector())
19918 return combineVectorShift(N, DCI);
19919
19920 return SDValue();
19921}
19922
19923// Transform (add X, (zext(setne Z, C))) -> (addze X, (addic (addi Z, -C), -1))
19924// Transform (add X, (zext(sete Z, C))) -> (addze X, (subfic (addi Z, -C), 0))
19925// When C is zero, the equation (addi Z, -C) can be simplified to Z
19926// Requirement: -C in [-32768, 32767], X and Z are MVT::i64 types
19927static SDValue combineADDToADDZE(SDNode *N, SelectionDAG &DAG,
19928 const PPCSubtarget &Subtarget) {
19929 if (!Subtarget.isPPC64())
19930 return SDValue();
19931
19932 SDValue LHS = N->getOperand(Num: 0);
19933 SDValue RHS = N->getOperand(Num: 1);
19934
19935 auto isZextOfCompareWithConstant = [](SDValue Op) {
19936 if (Op.getOpcode() != ISD::ZERO_EXTEND || !Op.hasOneUse() ||
19937 Op.getValueType() != MVT::i64)
19938 return false;
19939
19940 SDValue Cmp = Op.getOperand(i: 0);
19941 if (Cmp.getOpcode() != ISD::SETCC || !Cmp.hasOneUse() ||
19942 Cmp.getOperand(i: 0).getValueType() != MVT::i64)
19943 return false;
19944
19945 if (auto *Constant = dyn_cast<ConstantSDNode>(Val: Cmp.getOperand(i: 1))) {
19946 int64_t NegConstant = 0 - Constant->getSExtValue();
19947 // Due to the limitations of the addi instruction,
19948 // -C is required to be [-32768, 32767].
19949 return isInt<16>(x: NegConstant);
19950 }
19951
19952 return false;
19953 };
19954
19955 bool LHSHasPattern = isZextOfCompareWithConstant(LHS);
19956 bool RHSHasPattern = isZextOfCompareWithConstant(RHS);
19957
19958 // If there is a pattern, canonicalize a zext operand to the RHS.
19959 if (LHSHasPattern && !RHSHasPattern)
19960 std::swap(a&: LHS, b&: RHS);
19961 else if (!LHSHasPattern && !RHSHasPattern)
19962 return SDValue();
19963
19964 SDLoc DL(N);
19965 EVT CarryType = Subtarget.useCRBits() ? MVT::i1 : MVT::i32;
19966 SDVTList VTs = DAG.getVTList(VT1: MVT::i64, VT2: CarryType);
19967 SDValue Cmp = RHS.getOperand(i: 0);
19968 SDValue Z = Cmp.getOperand(i: 0);
19969 auto *Constant = cast<ConstantSDNode>(Val: Cmp.getOperand(i: 1));
19970 int64_t NegConstant = 0 - Constant->getSExtValue();
19971
19972 switch(cast<CondCodeSDNode>(Val: Cmp.getOperand(i: 2))->get()) {
19973 default: break;
19974 case ISD::SETNE: {
19975 // when C == 0
19976 // --> addze X, (addic Z, -1).carry
19977 // /
19978 // add X, (zext(setne Z, C))--
19979 // \ when -32768 <= -C <= 32767 && C != 0
19980 // --> addze X, (addic (addi Z, -C), -1).carry
19981 SDValue Add = DAG.getNode(Opcode: ISD::ADD, DL, VT: MVT::i64, N1: Z,
19982 N2: DAG.getConstant(Val: NegConstant, DL, VT: MVT::i64));
19983 SDValue AddOrZ = NegConstant != 0 ? Add : Z;
19984 SDValue Addc =
19985 DAG.getNode(Opcode: ISD::UADDO_CARRY, DL, VTList: DAG.getVTList(VT1: MVT::i64, VT2: CarryType),
19986 N1: AddOrZ, N2: DAG.getAllOnesConstant(DL, VT: MVT::i64),
19987 N3: DAG.getConstant(Val: 0, DL, VT: CarryType));
19988 return DAG.getNode(Opcode: ISD::UADDO_CARRY, DL, VTList: VTs, N1: LHS,
19989 N2: DAG.getConstant(Val: 0, DL, VT: MVT::i64),
19990 N3: SDValue(Addc.getNode(), 1));
19991 }
19992 case ISD::SETEQ: {
19993 // when C == 0
19994 // --> addze X, (subfic Z, 0).carry
19995 // /
19996 // add X, (zext(sete Z, C))--
19997 // \ when -32768 <= -C <= 32767 && C != 0
19998 // --> addze X, (subfic (addi Z, -C), 0).carry
19999 SDValue Add = DAG.getNode(Opcode: ISD::ADD, DL, VT: MVT::i64, N1: Z,
20000 N2: DAG.getConstant(Val: NegConstant, DL, VT: MVT::i64));
20001 SDValue AddOrZ = NegConstant != 0 ? Add : Z;
20002 SDValue Subc =
20003 DAG.getNode(Opcode: ISD::USUBO_CARRY, DL, VTList: DAG.getVTList(VT1: MVT::i64, VT2: CarryType),
20004 N1: DAG.getConstant(Val: 0, DL, VT: MVT::i64), N2: AddOrZ,
20005 N3: DAG.getConstant(Val: 0, DL, VT: CarryType));
20006 SDValue Invert = DAG.getNode(Opcode: ISD::XOR, DL, VT: CarryType, N1: Subc.getValue(R: 1),
20007 N2: DAG.getConstant(Val: 1UL, DL, VT: CarryType));
20008 return DAG.getNode(Opcode: ISD::UADDO_CARRY, DL, VTList: VTs, N1: LHS,
20009 N2: DAG.getConstant(Val: 0, DL, VT: MVT::i64), N3: Invert);
20010 }
20011 }
20012
20013 return SDValue();
20014}
20015
20016// Transform
20017// (add C1, (MAT_PCREL_ADDR GlobalAddr+C2)) to
20018// (MAT_PCREL_ADDR GlobalAddr+(C1+C2))
20019// In this case both C1 and C2 must be known constants.
20020// C1+C2 must fit into a 34 bit signed integer.
20021static SDValue combineADDToMAT_PCREL_ADDR(SDNode *N, SelectionDAG &DAG,
20022 const PPCSubtarget &Subtarget) {
20023 if (!Subtarget.isUsingPCRelativeCalls())
20024 return SDValue();
20025
20026 // Check both Operand 0 and Operand 1 of the ADD node for the PCRel node.
20027 // If we find that node try to cast the Global Address and the Constant.
20028 SDValue LHS = N->getOperand(Num: 0);
20029 SDValue RHS = N->getOperand(Num: 1);
20030
20031 if (LHS.getOpcode() != PPCISD::MAT_PCREL_ADDR)
20032 std::swap(a&: LHS, b&: RHS);
20033
20034 if (LHS.getOpcode() != PPCISD::MAT_PCREL_ADDR)
20035 return SDValue();
20036
20037 // Operand zero of PPCISD::MAT_PCREL_ADDR is the GA node.
20038 GlobalAddressSDNode *GSDN = dyn_cast<GlobalAddressSDNode>(Val: LHS.getOperand(i: 0));
20039 ConstantSDNode* ConstNode = dyn_cast<ConstantSDNode>(Val&: RHS);
20040
20041 // Check that both casts succeeded.
20042 if (!GSDN || !ConstNode)
20043 return SDValue();
20044
20045 int64_t NewOffset = GSDN->getOffset() + ConstNode->getSExtValue();
20046 SDLoc DL(GSDN);
20047
20048 // The signed int offset needs to fit in 34 bits.
20049 if (!isInt<34>(x: NewOffset))
20050 return SDValue();
20051
20052 // The new global address is a copy of the old global address except
20053 // that it has the updated Offset.
20054 SDValue GA =
20055 DAG.getTargetGlobalAddress(GV: GSDN->getGlobal(), DL, VT: GSDN->getValueType(ResNo: 0),
20056 offset: NewOffset, TargetFlags: GSDN->getTargetFlags());
20057 SDValue MatPCRel =
20058 DAG.getNode(Opcode: PPCISD::MAT_PCREL_ADDR, DL, VT: GSDN->getValueType(ResNo: 0), Operand: GA);
20059 return MatPCRel;
20060}
20061
20062// Transform (add X, (build_vector (T 1), (T 1), ...)) -> (sub X, (XXLEQVOnes))
20063// XXLEQVOnes creates an all-1s vector (0xFFFFFFFF...) efficiently via xxleqv
20064// Mathematical identity: X + 1 = X - (-1)
20065// Applies to v4i32, v2i64, v8i16, v16i8 where all elements are constant 1
20066// Requirement: VSX feature for efficient xxleqv generation
20067static SDValue combineADDToSUB(SDNode *N, SelectionDAG &DAG,
20068 const PPCSubtarget &Subtarget) {
20069
20070 EVT VT = N->getValueType(ResNo: 0);
20071 if (!Subtarget.hasVSX())
20072 return SDValue();
20073
20074 // Handle v2i64, v4i32, v8i16 and v16i8 types
20075 if (!(VT == MVT::v8i16 || VT == MVT::v16i8 || VT == MVT::v4i32 ||
20076 VT == MVT::v2i64))
20077 return SDValue();
20078
20079 SDValue LHS = N->getOperand(Num: 0);
20080 SDValue RHS = N->getOperand(Num: 1);
20081
20082 // Check if RHS is BUILD_VECTOR
20083 if (RHS.getOpcode() != ISD::BUILD_VECTOR)
20084 return SDValue();
20085
20086 // Check if all the elements are 1
20087 unsigned NumOfEles = RHS.getNumOperands();
20088 for (unsigned i = 0; i < NumOfEles; ++i) {
20089 auto *CN = dyn_cast<ConstantSDNode>(Val: RHS.getOperand(i));
20090 if (!CN || CN->getSExtValue() != 1)
20091 return SDValue();
20092 }
20093 SDLoc DL(N);
20094
20095 SDValue MinusOne = DAG.getConstant(Val: APInt::getAllOnes(numBits: 32), DL, VT: MVT::i32);
20096 SmallVector<SDValue, 4> Ops(4, MinusOne);
20097 SDValue AllOnesVec = DAG.getBuildVector(VT: MVT::v4i32, DL, Ops);
20098
20099 // Bitcast to the target vector type
20100 SDValue Bitcast = DAG.getNode(Opcode: ISD::BITCAST, DL, VT, Operand: AllOnesVec);
20101
20102 return DAG.getNode(Opcode: ISD::SUB, DL, VT, N1: LHS, N2: Bitcast);
20103}
20104
20105SDValue PPCTargetLowering::combineADD(SDNode *N, DAGCombinerInfo &DCI) const {
20106 if (auto Value = combineADDToADDZE(N, DAG&: DCI.DAG, Subtarget))
20107 return Value;
20108
20109 if (auto Value = combineADDToMAT_PCREL_ADDR(N, DAG&: DCI.DAG, Subtarget))
20110 return Value;
20111
20112 if (auto Value = combineADDToSUB(N, DAG&: DCI.DAG, Subtarget))
20113 return Value;
20114 return SDValue();
20115}
20116
20117// Detect TRUNCATE operations on bitcasts of float128 values.
20118// What we are looking for here is the situtation where we extract a subset
20119// of bits from a 128 bit float.
20120// This can be of two forms:
20121// 1) BITCAST of f128 feeding TRUNCATE
20122// 2) BITCAST of f128 feeding SRL (a shift) feeding TRUNCATE
20123// The reason this is required is because we do not have a legal i128 type
20124// and so we want to prevent having to store the f128 and then reload part
20125// of it.
20126SDValue PPCTargetLowering::combineTRUNCATE(SDNode *N,
20127 DAGCombinerInfo &DCI) const {
20128 // If we are using CRBits then try that first.
20129 if (Subtarget.useCRBits()) {
20130 // Check if CRBits did anything and return that if it did.
20131 if (SDValue CRTruncValue = DAGCombineTruncBoolExt(N, DCI))
20132 return CRTruncValue;
20133 }
20134
20135 SDLoc dl(N);
20136 SDValue Op0 = N->getOperand(Num: 0);
20137
20138 // Looking for a truncate of i128 to i64.
20139 if (Op0.getValueType() != MVT::i128 || N->getValueType(ResNo: 0) != MVT::i64)
20140 return SDValue();
20141
20142 int EltToExtract = DCI.DAG.getDataLayout().isBigEndian() ? 1 : 0;
20143
20144 // SRL feeding TRUNCATE.
20145 if (Op0.getOpcode() == ISD::SRL) {
20146 ConstantSDNode *ConstNode = dyn_cast<ConstantSDNode>(Val: Op0.getOperand(i: 1));
20147 // The right shift has to be by 64 bits.
20148 if (!ConstNode || ConstNode->getZExtValue() != 64)
20149 return SDValue();
20150
20151 // Switch the element number to extract.
20152 EltToExtract = EltToExtract ? 0 : 1;
20153 // Update Op0 past the SRL.
20154 Op0 = Op0.getOperand(i: 0);
20155 }
20156
20157 // BITCAST feeding a TRUNCATE possibly via SRL.
20158 if (Op0.getOpcode() == ISD::BITCAST &&
20159 Op0.getValueType() == MVT::i128 &&
20160 Op0.getOperand(i: 0).getValueType() == MVT::f128) {
20161 SDValue Bitcast = DCI.DAG.getBitcast(VT: MVT::v2i64, V: Op0.getOperand(i: 0));
20162 return DCI.DAG.getNode(
20163 Opcode: ISD::EXTRACT_VECTOR_ELT, DL: dl, VT: MVT::i64, N1: Bitcast,
20164 N2: DCI.DAG.getTargetConstant(Val: EltToExtract, DL: dl, VT: MVT::i32));
20165 }
20166 return SDValue();
20167}
20168
20169SDValue PPCTargetLowering::combineMUL(SDNode *N, DAGCombinerInfo &DCI) const {
20170 SelectionDAG &DAG = DCI.DAG;
20171
20172 ConstantSDNode *ConstOpOrElement = isConstOrConstSplat(N: N->getOperand(Num: 1));
20173 if (!ConstOpOrElement)
20174 return SDValue();
20175
20176 // An imul is usually smaller than the alternative sequence for legal type.
20177 if (DAG.getMachineFunction().getFunction().hasMinSize() &&
20178 isOperationLegal(Op: ISD::MUL, VT: N->getValueType(ResNo: 0)))
20179 return SDValue();
20180
20181 auto IsProfitable = [this](bool IsNeg, bool IsAddOne, EVT VT) -> bool {
20182 switch (this->Subtarget.getCPUDirective()) {
20183 default:
20184 // TODO: enhance the condition for subtarget before pwr8
20185 return false;
20186 case PPC::DIR_PWR8:
20187 // type mul add shl
20188 // scalar 4 1 1
20189 // vector 7 2 2
20190 return true;
20191 case PPC::DIR_PWR9:
20192 case PPC::DIR_PWR10:
20193 case PPC::DIR_PWR11:
20194 case PPC::DIR_PWR_FUTURE:
20195 // type mul add shl
20196 // scalar 5 2 2
20197 // vector 7 2 2
20198
20199 // The cycle RATIO of related operations are showed as a table above.
20200 // Because mul is 5(scalar)/7(vector), add/sub/shl are all 2 for both
20201 // scalar and vector type. For 2 instrs patterns, add/sub + shl
20202 // are 4, it is always profitable; but for 3 instrs patterns
20203 // (mul x, -(2^N + 1)) => -(add (shl x, N), x), sub + add + shl are 6.
20204 // So we should only do it for vector type.
20205 return IsAddOne && IsNeg ? VT.isVector() : true;
20206 }
20207 };
20208
20209 EVT VT = N->getValueType(ResNo: 0);
20210 SDLoc DL(N);
20211
20212 const APInt &MulAmt = ConstOpOrElement->getAPIntValue();
20213 bool IsNeg = MulAmt.isNegative();
20214 APInt MulAmtAbs = MulAmt.abs();
20215
20216 if ((MulAmtAbs - 1).isPowerOf2()) {
20217 // (mul x, 2^N + 1) => (add (shl x, N), x)
20218 // (mul x, -(2^N + 1)) => -(add (shl x, N), x)
20219
20220 if (!IsProfitable(IsNeg, true, VT))
20221 return SDValue();
20222
20223 SDValue Op0 = N->getOperand(Num: 0);
20224 SDValue Op1 =
20225 DAG.getNode(Opcode: ISD::SHL, DL, VT, N1: N->getOperand(Num: 0),
20226 N2: DAG.getConstant(Val: (MulAmtAbs - 1).logBase2(), DL, VT));
20227 SDValue Res = DAG.getNode(Opcode: ISD::ADD, DL, VT, N1: Op0, N2: Op1);
20228
20229 if (!IsNeg)
20230 return Res;
20231
20232 return DAG.getNode(Opcode: ISD::SUB, DL, VT, N1: DAG.getConstant(Val: 0, DL, VT), N2: Res);
20233 } else if ((MulAmtAbs + 1).isPowerOf2()) {
20234 // (mul x, 2^N - 1) => (sub (shl x, N), x)
20235 // (mul x, -(2^N - 1)) => (sub x, (shl x, N))
20236
20237 if (!IsProfitable(IsNeg, false, VT))
20238 return SDValue();
20239
20240 SDValue Op0 = N->getOperand(Num: 0);
20241 SDValue Op1 =
20242 DAG.getNode(Opcode: ISD::SHL, DL, VT, N1: N->getOperand(Num: 0),
20243 N2: DAG.getConstant(Val: (MulAmtAbs + 1).logBase2(), DL, VT));
20244
20245 if (!IsNeg)
20246 return DAG.getNode(Opcode: ISD::SUB, DL, VT, N1: Op1, N2: Op0);
20247 else
20248 return DAG.getNode(Opcode: ISD::SUB, DL, VT, N1: Op0, N2: Op1);
20249
20250 } else {
20251 return SDValue();
20252 }
20253}
20254
20255// Combine fma-like op (like fnmsub) with fnegs to appropriate op. Do this
20256// in combiner since we need to check SD flags and other subtarget features.
20257SDValue PPCTargetLowering::combineFMALike(SDNode *N,
20258 DAGCombinerInfo &DCI) const {
20259 SDValue N0 = N->getOperand(Num: 0);
20260 SDValue N1 = N->getOperand(Num: 1);
20261 SDValue N2 = N->getOperand(Num: 2);
20262 SDNodeFlags Flags = N->getFlags();
20263 EVT VT = N->getValueType(ResNo: 0);
20264 SelectionDAG &DAG = DCI.DAG;
20265 unsigned Opc = N->getOpcode();
20266 bool CodeSize = DAG.getMachineFunction().getFunction().hasOptSize();
20267 bool LegalOps = !DCI.isBeforeLegalizeOps();
20268 SDLoc Loc(N);
20269
20270 if (!isOperationLegal(Op: ISD::FMA, VT))
20271 return SDValue();
20272
20273 // Allowing transformation to FNMSUB may change sign of zeroes when ab-c=0
20274 // since (fnmsub a b c)=-0 while c-ab=+0.
20275 if (!Flags.hasNoSignedZeros())
20276 return SDValue();
20277
20278 // (fma (fneg a) b c) => (fnmsub a b c)
20279 // (fnmsub (fneg a) b c) => (fma a b c)
20280 if (SDValue NegN0 = getCheaperNegatedExpression(Op: N0, DAG, LegalOps, OptForSize: CodeSize))
20281 return DAG.getNode(Opcode: invertFMAOpcode(Opc), DL: Loc, VT, N1: NegN0, N2: N1, N3: N2, Flags);
20282
20283 // (fma a (fneg b) c) => (fnmsub a b c)
20284 // (fnmsub a (fneg b) c) => (fma a b c)
20285 if (SDValue NegN1 = getCheaperNegatedExpression(Op: N1, DAG, LegalOps, OptForSize: CodeSize))
20286 return DAG.getNode(Opcode: invertFMAOpcode(Opc), DL: Loc, VT, N1: N0, N2: NegN1, N3: N2, Flags);
20287
20288 return SDValue();
20289}
20290
20291bool PPCTargetLowering::mayBeEmittedAsTailCall(const CallInst *CI) const {
20292 // Only duplicate to increase tail-calls for the 64bit SysV ABIs.
20293 if (!Subtarget.is64BitELFABI())
20294 return false;
20295
20296 // If not a tail call then no need to proceed.
20297 if (!CI->isTailCall())
20298 return false;
20299
20300 // If sibling calls have been disabled and tail-calls aren't guaranteed
20301 // there is no reason to duplicate.
20302 auto &TM = getTargetMachine();
20303 if (!TM.Options.GuaranteedTailCallOpt && DisableSCO)
20304 return false;
20305
20306 // Can't tail call a function called indirectly, or if it has variadic args.
20307 const Function *Callee = CI->getCalledFunction();
20308 if (!Callee || Callee->isVarArg())
20309 return false;
20310
20311 // Make sure the callee and caller calling conventions are eligible for tco.
20312 const Function *Caller = CI->getParent()->getParent();
20313 if (!areCallingConvEligibleForTCO_64SVR4(CallerCC: Caller->getCallingConv(),
20314 CalleeCC: CI->getCallingConv()))
20315 return false;
20316
20317 // If the function is local then we have a good chance at tail-calling it
20318 return getTargetMachine().shouldAssumeDSOLocal(GV: Callee);
20319}
20320
20321bool PPCTargetLowering::
20322isMaskAndCmp0FoldingBeneficial(const Instruction &AndI) const {
20323 const Value *Mask = AndI.getOperand(i: 1);
20324 // If the mask is suitable for andi. or andis. we should sink the and.
20325 if (const ConstantInt *CI = dyn_cast<ConstantInt>(Val: Mask)) {
20326 // Can't handle constants wider than 64-bits.
20327 if (CI->getBitWidth() > 64)
20328 return false;
20329 int64_t ConstVal = CI->getZExtValue();
20330 return isUInt<16>(x: ConstVal) ||
20331 (isUInt<16>(x: ConstVal >> 16) && !(ConstVal & 0xFFFF));
20332 }
20333
20334 // For non-constant masks, we can always use the record-form and.
20335 return true;
20336}
20337
20338/// getAddrModeForFlags - Based on the set of address flags, select the most
20339/// optimal instruction format to match by.
20340PPC::AddrMode PPCTargetLowering::getAddrModeForFlags(unsigned Flags) const {
20341 // This is not a node we should be handling here.
20342 if (Flags == PPC::MOF_None)
20343 return PPC::AM_None;
20344 // Unaligned D-Forms are tried first, followed by the aligned D-Forms.
20345 for (auto FlagSet : AddrModesMap.at(k: PPC::AM_DForm))
20346 if ((Flags & FlagSet) == FlagSet)
20347 return PPC::AM_DForm;
20348 for (auto FlagSet : AddrModesMap.at(k: PPC::AM_DSForm))
20349 if ((Flags & FlagSet) == FlagSet)
20350 return PPC::AM_DSForm;
20351 for (auto FlagSet : AddrModesMap.at(k: PPC::AM_DQForm))
20352 if ((Flags & FlagSet) == FlagSet)
20353 return PPC::AM_DQForm;
20354 for (auto FlagSet : AddrModesMap.at(k: PPC::AM_PrefixDForm))
20355 if ((Flags & FlagSet) == FlagSet)
20356 return PPC::AM_PrefixDForm;
20357 // If no other forms are selected, return an X-Form as it is the most
20358 // general addressing mode.
20359 return PPC::AM_XForm;
20360}
20361
20362/// Set alignment flags based on whether or not the Frame Index is aligned.
20363/// Utilized when computing flags for address computation when selecting
20364/// load and store instructions.
20365static void setAlignFlagsForFI(SDValue N, unsigned &FlagSet,
20366 SelectionDAG &DAG) {
20367 bool IsAdd = ((N.getOpcode() == ISD::ADD) || (N.getOpcode() == ISD::OR));
20368 FrameIndexSDNode *FI = dyn_cast<FrameIndexSDNode>(Val: IsAdd ? N.getOperand(i: 0) : N);
20369 if (!FI)
20370 return;
20371 const MachineFrameInfo &MFI = DAG.getMachineFunction().getFrameInfo();
20372 unsigned FrameIndexAlign = MFI.getObjectAlign(ObjectIdx: FI->getIndex()).value();
20373 // If this is (add $FI, $S16Imm), the alignment flags are already set
20374 // based on the immediate. We just need to clear the alignment flags
20375 // if the FI alignment is weaker.
20376 if ((FrameIndexAlign % 4) != 0)
20377 FlagSet &= ~PPC::MOF_RPlusSImm16Mult4;
20378 if ((FrameIndexAlign % 16) != 0)
20379 FlagSet &= ~PPC::MOF_RPlusSImm16Mult16;
20380 // If the address is a plain FrameIndex, set alignment flags based on
20381 // FI alignment.
20382 if (!IsAdd) {
20383 if ((FrameIndexAlign % 4) == 0)
20384 FlagSet |= PPC::MOF_RPlusSImm16Mult4;
20385 if ((FrameIndexAlign % 16) == 0)
20386 FlagSet |= PPC::MOF_RPlusSImm16Mult16;
20387 }
20388}
20389
20390/// Given a node, compute flags that are used for address computation when
20391/// selecting load and store instructions. The flags computed are stored in
20392/// FlagSet. This function takes into account whether the node is a constant,
20393/// an ADD, OR, or a constant, and computes the address flags accordingly.
20394static void computeFlagsForAddressComputation(SDValue N, unsigned &FlagSet,
20395 SelectionDAG &DAG) {
20396 // Set the alignment flags for the node depending on if the node is
20397 // 4-byte or 16-byte aligned.
20398 auto SetAlignFlagsForImm = [&](uint64_t Imm) {
20399 if ((Imm & 0x3) == 0)
20400 FlagSet |= PPC::MOF_RPlusSImm16Mult4;
20401 if ((Imm & 0xf) == 0)
20402 FlagSet |= PPC::MOF_RPlusSImm16Mult16;
20403 };
20404
20405 if (ConstantSDNode *CN = dyn_cast<ConstantSDNode>(Val&: N)) {
20406 // All 32-bit constants can be computed as LIS + Disp.
20407 const APInt &ConstImm = CN->getAPIntValue();
20408 if (ConstImm.isSignedIntN(N: 32)) { // Flag to handle 32-bit constants.
20409 FlagSet |= PPC::MOF_AddrIsSImm32;
20410 SetAlignFlagsForImm(ConstImm.getZExtValue());
20411 setAlignFlagsForFI(N, FlagSet, DAG);
20412 }
20413 if (ConstImm.isSignedIntN(N: 34)) // Flag to handle 34-bit constants.
20414 FlagSet |= PPC::MOF_RPlusSImm34;
20415 else // Let constant materialization handle large constants.
20416 FlagSet |= PPC::MOF_NotAddNorCst;
20417 } else if (N.getOpcode() == ISD::ADD || provablyDisjointOr(DAG, N)) {
20418 // This address can be represented as an addition of:
20419 // - Register + Imm16 (possibly a multiple of 4/16)
20420 // - Register + Imm34
20421 // - Register + PPCISD::Lo
20422 // - Register + Register
20423 // In any case, we won't have to match this as Base + Zero.
20424 SDValue RHS = N.getOperand(i: 1);
20425 if (ConstantSDNode *CN = dyn_cast<ConstantSDNode>(Val&: RHS)) {
20426 const APInt &ConstImm = CN->getAPIntValue();
20427 if (ConstImm.isSignedIntN(N: 16)) {
20428 FlagSet |= PPC::MOF_RPlusSImm16; // Signed 16-bit immediates.
20429 SetAlignFlagsForImm(ConstImm.getZExtValue());
20430 setAlignFlagsForFI(N, FlagSet, DAG);
20431 }
20432 if (ConstImm.isSignedIntN(N: 34))
20433 FlagSet |= PPC::MOF_RPlusSImm34; // Signed 34-bit immediates.
20434 else
20435 FlagSet |= PPC::MOF_RPlusR; // Register.
20436 } else if (RHS.getOpcode() == PPCISD::Lo && !RHS.getConstantOperandVal(i: 1))
20437 FlagSet |= PPC::MOF_RPlusLo; // PPCISD::Lo.
20438 else
20439 FlagSet |= PPC::MOF_RPlusR;
20440 } else { // The address computation is not a constant or an addition.
20441 setAlignFlagsForFI(N, FlagSet, DAG);
20442 FlagSet |= PPC::MOF_NotAddNorCst;
20443 }
20444}
20445
20446static bool isPCRelNode(SDValue N) {
20447 return (N.getOpcode() == PPCISD::MAT_PCREL_ADDR ||
20448 isValidPCRelNode<ConstantPoolSDNode>(N) ||
20449 isValidPCRelNode<GlobalAddressSDNode>(N) ||
20450 isValidPCRelNode<JumpTableSDNode>(N) ||
20451 isValidPCRelNode<BlockAddressSDNode>(N));
20452}
20453
20454/// computeMOFlags - Given a node N and it's Parent (a MemSDNode), compute
20455/// the address flags of the load/store instruction that is to be matched.
20456unsigned PPCTargetLowering::computeMOFlags(const SDNode *Parent, SDValue N,
20457 SelectionDAG &DAG) const {
20458 unsigned FlagSet = PPC::MOF_None;
20459
20460 // Compute subtarget flags.
20461 if (!Subtarget.hasP9Vector())
20462 FlagSet |= PPC::MOF_SubtargetBeforeP9;
20463 else
20464 FlagSet |= PPC::MOF_SubtargetP9;
20465
20466 if (Subtarget.hasPrefixInstrs())
20467 FlagSet |= PPC::MOF_SubtargetP10;
20468
20469 if (Subtarget.hasSPE())
20470 FlagSet |= PPC::MOF_SubtargetSPE;
20471
20472 // Check if we have a PCRel node and return early.
20473 if ((FlagSet & PPC::MOF_SubtargetP10) && isPCRelNode(N))
20474 return FlagSet;
20475
20476 // If the node is the paired load/store intrinsics, compute flags for
20477 // address computation and return early.
20478 unsigned ParentOp = Parent->getOpcode();
20479 if (Subtarget.isISA3_1() && ((ParentOp == ISD::INTRINSIC_W_CHAIN) ||
20480 (ParentOp == ISD::INTRINSIC_VOID))) {
20481 unsigned ID = Parent->getConstantOperandVal(Num: 1);
20482 if ((ID == Intrinsic::ppc_vsx_lxvp) || (ID == Intrinsic::ppc_vsx_stxvp)) {
20483 SDValue IntrinOp = (ID == Intrinsic::ppc_vsx_lxvp)
20484 ? Parent->getOperand(Num: 2)
20485 : Parent->getOperand(Num: 3);
20486 computeFlagsForAddressComputation(N: IntrinOp, FlagSet, DAG);
20487 FlagSet |= PPC::MOF_Vector;
20488 return FlagSet;
20489 }
20490 }
20491
20492 // Mark this as something we don't want to handle here if it is atomic
20493 // or pre-increment instruction.
20494 if (const LSBaseSDNode *LSB = dyn_cast<LSBaseSDNode>(Val: Parent))
20495 if (LSB->isIndexed())
20496 return PPC::MOF_None;
20497
20498 // Compute in-memory type flags. This is based on if there are scalars,
20499 // floats or vectors.
20500 const MemSDNode *MN = dyn_cast<MemSDNode>(Val: Parent);
20501 assert(MN && "Parent should be a MemSDNode!");
20502 EVT MemVT = MN->getMemoryVT();
20503 unsigned Size = MemVT.getSizeInBits();
20504 if (MemVT.isScalarInteger()) {
20505 assert(Size <= 128 &&
20506 "Not expecting scalar integers larger than 16 bytes!");
20507 if (Size < 32)
20508 FlagSet |= PPC::MOF_SubWordInt;
20509 else if (Size == 32)
20510 FlagSet |= PPC::MOF_WordInt;
20511 else
20512 FlagSet |= PPC::MOF_DoubleWordInt;
20513 } else if (MemVT.isVector() && !MemVT.isFloatingPoint()) { // Integer vectors.
20514 if (Size == 128)
20515 FlagSet |= PPC::MOF_Vector;
20516 else if (Size == 256) {
20517 assert(Subtarget.pairedVectorMemops() &&
20518 "256-bit vectors are only available when paired vector memops is "
20519 "enabled!");
20520 FlagSet |= PPC::MOF_Vector;
20521 } else
20522 llvm_unreachable("Not expecting illegal vectors!");
20523 } else { // Floating point type: can be scalar, f128 or vector types.
20524 if (Size == 32 || Size == 64)
20525 FlagSet |= PPC::MOF_ScalarFloat;
20526 else if (MemVT == MVT::f128 || MemVT.isVector())
20527 FlagSet |= PPC::MOF_Vector;
20528 else
20529 llvm_unreachable("Not expecting illegal scalar floats!");
20530 }
20531
20532 // Compute flags for address computation.
20533 computeFlagsForAddressComputation(N, FlagSet, DAG);
20534
20535 // Compute type extension flags.
20536 if (const LoadSDNode *LN = dyn_cast<LoadSDNode>(Val: Parent)) {
20537 switch (LN->getExtensionType()) {
20538 case ISD::SEXTLOAD:
20539 FlagSet |= PPC::MOF_SExt;
20540 break;
20541 case ISD::EXTLOAD:
20542 case ISD::ZEXTLOAD:
20543 FlagSet |= PPC::MOF_ZExt;
20544 break;
20545 case ISD::NON_EXTLOAD:
20546 FlagSet |= PPC::MOF_NoExt;
20547 break;
20548 }
20549 } else
20550 FlagSet |= PPC::MOF_NoExt;
20551
20552 // For integers, no extension is the same as zero extension.
20553 // We set the extension mode to zero extension so we don't have
20554 // to add separate entries in AddrModesMap for loads and stores.
20555 if (MemVT.isScalarInteger() && (FlagSet & PPC::MOF_NoExt)) {
20556 FlagSet |= PPC::MOF_ZExt;
20557 FlagSet &= ~PPC::MOF_NoExt;
20558 }
20559
20560 // If we don't have prefixed instructions, 34-bit constants should be
20561 // treated as PPC::MOF_NotAddNorCst so they can match D-Forms.
20562 bool IsNonP1034BitConst =
20563 ((PPC::MOF_RPlusSImm34 | PPC::MOF_AddrIsSImm32 | PPC::MOF_SubtargetP10) &
20564 FlagSet) == PPC::MOF_RPlusSImm34;
20565 if (N.getOpcode() != ISD::ADD && N.getOpcode() != ISD::OR &&
20566 IsNonP1034BitConst)
20567 FlagSet |= PPC::MOF_NotAddNorCst;
20568
20569 return FlagSet;
20570}
20571
20572/// SelectForceXFormMode - Given the specified address, force it to be
20573/// represented as an indexed [r+r] operation (an XForm instruction).
20574PPC::AddrMode PPCTargetLowering::SelectForceXFormMode(SDValue N, SDValue &Disp,
20575 SDValue &Base,
20576 SelectionDAG &DAG) const {
20577
20578 PPC::AddrMode Mode = PPC::AM_XForm;
20579 int16_t ForceXFormImm = 0;
20580 if (provablyDisjointOr(DAG, N) &&
20581 !isIntS16Immediate(Op: N.getOperand(i: 1), Imm&: ForceXFormImm)) {
20582 Disp = N.getOperand(i: 0);
20583 Base = N.getOperand(i: 1);
20584 return Mode;
20585 }
20586
20587 // If the address is the result of an add, we will utilize the fact that the
20588 // address calculation includes an implicit add. However, we can reduce
20589 // register pressure if we do not materialize a constant just for use as the
20590 // index register. We only get rid of the add if it is not an add of a
20591 // value and a 16-bit signed constant and both have a single use.
20592 if (N.getOpcode() == ISD::ADD &&
20593 (!isIntS16Immediate(Op: N.getOperand(i: 1), Imm&: ForceXFormImm) ||
20594 !N.getOperand(i: 1).hasOneUse() || !N.getOperand(i: 0).hasOneUse())) {
20595 Disp = N.getOperand(i: 0);
20596 Base = N.getOperand(i: 1);
20597 return Mode;
20598 }
20599
20600 // Otherwise, use R0 as the base register.
20601 Disp = DAG.getRegister(Reg: Subtarget.isPPC64() ? PPC::ZERO8 : PPC::ZERO,
20602 VT: N.getValueType());
20603 Base = N;
20604
20605 return Mode;
20606}
20607
20608bool PPCTargetLowering::splitValueIntoRegisterParts(
20609 SelectionDAG &DAG, const SDLoc &DL, SDValue Val, SDValue *Parts,
20610 unsigned NumParts, MVT PartVT, std::optional<CallingConv::ID> CC) const {
20611 EVT ValVT = Val.getValueType();
20612 // If we are splitting a scalar integer into f64 parts (i.e. so they
20613 // can be placed into VFRC registers), we need to zero extend and
20614 // bitcast the values. This will ensure the value is placed into a
20615 // VSR using direct moves or stack operations as needed.
20616 if (PartVT == MVT::f64 &&
20617 (ValVT == MVT::i32 || ValVT == MVT::i16 || ValVT == MVT::i8)) {
20618 Val = DAG.getNode(Opcode: ISD::ZERO_EXTEND, DL, VT: MVT::i64, Operand: Val);
20619 Val = DAG.getNode(Opcode: ISD::BITCAST, DL, VT: MVT::f64, Operand: Val);
20620 Parts[0] = Val;
20621 return true;
20622 }
20623 return false;
20624}
20625
20626SDValue PPCTargetLowering::lowerToLibCall(const char *LibCallName, SDValue Op,
20627 SelectionDAG &DAG) const {
20628 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
20629 TargetLowering::CallLoweringInfo CLI(DAG);
20630 EVT RetVT = Op.getValueType();
20631 Type *RetTy = RetVT.getTypeForEVT(Context&: *DAG.getContext());
20632 SDValue Callee =
20633 DAG.getExternalSymbol(Sym: LibCallName, VT: TLI.getPointerTy(DL: DAG.getDataLayout()));
20634 bool SignExtend = TLI.shouldSignExtendTypeInLibCall(Ty: RetTy, IsSigned: false);
20635 TargetLowering::ArgListTy Args;
20636 for (const SDValue &N : Op->op_values()) {
20637 EVT ArgVT = N.getValueType();
20638 Type *ArgTy = ArgVT.getTypeForEVT(Context&: *DAG.getContext());
20639 TargetLowering::ArgListEntry Entry(N, ArgTy);
20640 Entry.IsSExt = TLI.shouldSignExtendTypeInLibCall(Ty: ArgTy, IsSigned: SignExtend);
20641 Entry.IsZExt = !Entry.IsSExt;
20642 Args.push_back(x: Entry);
20643 }
20644
20645 SDValue InChain = DAG.getEntryNode();
20646 SDValue TCChain = InChain;
20647 const Function &F = DAG.getMachineFunction().getFunction();
20648 bool isTailCall =
20649 TLI.isInTailCallPosition(DAG, Node: Op.getNode(), Chain&: TCChain) &&
20650 (RetTy == F.getReturnType() || F.getReturnType()->isVoidTy());
20651 if (isTailCall)
20652 InChain = TCChain;
20653 CLI.setDebugLoc(SDLoc(Op))
20654 .setChain(InChain)
20655 .setLibCallee(CC: CallingConv::C, ResultType: RetTy, Target: Callee, ArgsList: std::move(Args))
20656 .setTailCall(isTailCall)
20657 .setSExtResult(SignExtend)
20658 .setZExtResult(!SignExtend)
20659 .setIsPostTypeLegalization(true);
20660 return TLI.LowerCallTo(CLI).first;
20661}
20662
20663SDValue PPCTargetLowering::lowerLibCallBasedOnType(
20664 const char *LibCallFloatName, const char *LibCallDoubleName, SDValue Op,
20665 SelectionDAG &DAG) const {
20666 if (Op.getValueType() == MVT::f32)
20667 return lowerToLibCall(LibCallName: LibCallFloatName, Op, DAG);
20668
20669 if (Op.getValueType() == MVT::f64)
20670 return lowerToLibCall(LibCallName: LibCallDoubleName, Op, DAG);
20671
20672 return SDValue();
20673}
20674
20675bool PPCTargetLowering::isLowringToMASSFiniteSafe(SDValue Op) const {
20676 SDNodeFlags Flags = Op.getNode()->getFlags();
20677 return isLowringToMASSSafe(Op) && Flags.hasNoSignedZeros() &&
20678 Flags.hasNoNaNs() && Flags.hasNoInfs();
20679}
20680
20681bool PPCTargetLowering::isLowringToMASSSafe(SDValue Op) const {
20682 return Op.getNode()->getFlags().hasApproximateFuncs();
20683}
20684
20685bool PPCTargetLowering::isScalarMASSConversionEnabled() const {
20686 return getTargetMachine().Options.PPCGenScalarMASSEntries;
20687}
20688
20689SDValue PPCTargetLowering::lowerLibCallBase(const char *LibCallDoubleName,
20690 const char *LibCallFloatName,
20691 const char *LibCallDoubleNameFinite,
20692 const char *LibCallFloatNameFinite,
20693 SDValue Op,
20694 SelectionDAG &DAG) const {
20695 if (!isScalarMASSConversionEnabled() || !isLowringToMASSSafe(Op))
20696 return SDValue();
20697
20698 if (!isLowringToMASSFiniteSafe(Op))
20699 return lowerLibCallBasedOnType(LibCallFloatName, LibCallDoubleName, Op,
20700 DAG);
20701
20702 return lowerLibCallBasedOnType(LibCallFloatName: LibCallFloatNameFinite,
20703 LibCallDoubleName: LibCallDoubleNameFinite, Op, DAG);
20704}
20705
20706SDValue PPCTargetLowering::lowerPow(SDValue Op, SelectionDAG &DAG) const {
20707 return lowerLibCallBase(LibCallDoubleName: "__xl_pow", LibCallFloatName: "__xl_powf", LibCallDoubleNameFinite: "__xl_pow_finite",
20708 LibCallFloatNameFinite: "__xl_powf_finite", Op, DAG);
20709}
20710
20711SDValue PPCTargetLowering::lowerSin(SDValue Op, SelectionDAG &DAG) const {
20712 return lowerLibCallBase(LibCallDoubleName: "__xl_sin", LibCallFloatName: "__xl_sinf", LibCallDoubleNameFinite: "__xl_sin_finite",
20713 LibCallFloatNameFinite: "__xl_sinf_finite", Op, DAG);
20714}
20715
20716SDValue PPCTargetLowering::lowerCos(SDValue Op, SelectionDAG &DAG) const {
20717 return lowerLibCallBase(LibCallDoubleName: "__xl_cos", LibCallFloatName: "__xl_cosf", LibCallDoubleNameFinite: "__xl_cos_finite",
20718 LibCallFloatNameFinite: "__xl_cosf_finite", Op, DAG);
20719}
20720
20721SDValue PPCTargetLowering::lowerLog(SDValue Op, SelectionDAG &DAG) const {
20722 return lowerLibCallBase(LibCallDoubleName: "__xl_log", LibCallFloatName: "__xl_logf", LibCallDoubleNameFinite: "__xl_log_finite",
20723 LibCallFloatNameFinite: "__xl_logf_finite", Op, DAG);
20724}
20725
20726SDValue PPCTargetLowering::lowerLog10(SDValue Op, SelectionDAG &DAG) const {
20727 return lowerLibCallBase(LibCallDoubleName: "__xl_log10", LibCallFloatName: "__xl_log10f", LibCallDoubleNameFinite: "__xl_log10_finite",
20728 LibCallFloatNameFinite: "__xl_log10f_finite", Op, DAG);
20729}
20730
20731SDValue PPCTargetLowering::lowerExp(SDValue Op, SelectionDAG &DAG) const {
20732 return lowerLibCallBase(LibCallDoubleName: "__xl_exp", LibCallFloatName: "__xl_expf", LibCallDoubleNameFinite: "__xl_exp_finite",
20733 LibCallFloatNameFinite: "__xl_expf_finite", Op, DAG);
20734}
20735
20736// If we happen to match to an aligned D-Form, check if the Frame Index is
20737// adequately aligned. If it is not, reset the mode to match to X-Form.
20738static void setXFormForUnalignedFI(SDValue N, unsigned Flags,
20739 PPC::AddrMode &Mode) {
20740 if (!isa<FrameIndexSDNode>(Val: N))
20741 return;
20742 if ((Mode == PPC::AM_DSForm && !(Flags & PPC::MOF_RPlusSImm16Mult4)) ||
20743 (Mode == PPC::AM_DQForm && !(Flags & PPC::MOF_RPlusSImm16Mult16)))
20744 Mode = PPC::AM_XForm;
20745}
20746
20747/// SelectOptimalAddrMode - Based on a node N and it's Parent (a MemSDNode),
20748/// compute the address flags of the node, get the optimal address mode based
20749/// on the flags, and set the Base and Disp based on the address mode.
20750PPC::AddrMode PPCTargetLowering::SelectOptimalAddrMode(const SDNode *Parent,
20751 SDValue N, SDValue &Disp,
20752 SDValue &Base,
20753 SelectionDAG &DAG,
20754 MaybeAlign Align) const {
20755 SDLoc DL(Parent);
20756
20757 // Compute the address flags.
20758 unsigned Flags = computeMOFlags(Parent, N, DAG);
20759
20760 // Get the optimal address mode based on the Flags.
20761 PPC::AddrMode Mode = getAddrModeForFlags(Flags);
20762
20763 // If the address mode is DS-Form or DQ-Form, check if the FI is aligned.
20764 // Select an X-Form load if it is not.
20765 setXFormForUnalignedFI(N, Flags, Mode);
20766
20767 // Set the mode to PC-Relative addressing mode if we have a valid PC-Rel node.
20768 if ((Mode == PPC::AM_XForm) && isPCRelNode(N)) {
20769 assert(Subtarget.isUsingPCRelativeCalls() &&
20770 "Must be using PC-Relative calls when a valid PC-Relative node is "
20771 "present!");
20772 Mode = PPC::AM_PCRel;
20773 }
20774
20775 // Set Base and Disp accordingly depending on the address mode.
20776 switch (Mode) {
20777 case PPC::AM_DForm:
20778 case PPC::AM_DSForm:
20779 case PPC::AM_DQForm: {
20780 // This is a register plus a 16-bit immediate. The base will be the
20781 // register and the displacement will be the immediate unless it
20782 // isn't sufficiently aligned.
20783 if (Flags & PPC::MOF_RPlusSImm16) {
20784 SDValue Op0 = N.getOperand(i: 0);
20785 SDValue Op1 = N.getOperand(i: 1);
20786 int16_t Imm = Op1->getAsZExtVal();
20787 if (!Align || isAligned(Lhs: *Align, SizeInBytes: Imm)) {
20788 Disp = DAG.getSignedTargetConstant(Val: Imm, DL, VT: N.getValueType());
20789 Base = Op0;
20790 if (FrameIndexSDNode *FI = dyn_cast<FrameIndexSDNode>(Val&: Op0)) {
20791 Base = DAG.getTargetFrameIndex(FI: FI->getIndex(), VT: N.getValueType());
20792 fixupFuncForFI(DAG, FrameIdx: FI->getIndex(), VT: N.getValueType());
20793 }
20794 break;
20795 }
20796 }
20797 // This is a register plus the @lo relocation. The base is the register
20798 // and the displacement is the global address.
20799 else if (Flags & PPC::MOF_RPlusLo) {
20800 Disp = N.getOperand(i: 1).getOperand(i: 0); // The global address.
20801 assert(Disp.getOpcode() == ISD::TargetGlobalAddress ||
20802 Disp.getOpcode() == ISD::TargetGlobalTLSAddress ||
20803 Disp.getOpcode() == ISD::TargetConstantPool ||
20804 Disp.getOpcode() == ISD::TargetJumpTable);
20805 Base = N.getOperand(i: 0);
20806 break;
20807 }
20808 // This is a constant address at most 32 bits. The base will be
20809 // zero or load-immediate-shifted and the displacement will be
20810 // the low 16 bits of the address.
20811 else if (Flags & PPC::MOF_AddrIsSImm32) {
20812 auto *CN = cast<ConstantSDNode>(Val&: N);
20813 EVT CNType = CN->getValueType(ResNo: 0);
20814 uint64_t CNImm = CN->getZExtValue();
20815 // If this address fits entirely in a 16-bit sext immediate field, codegen
20816 // this as "d, 0".
20817 int16_t Imm;
20818 if (isIntS16Immediate(N: CN, Imm) && (!Align || isAligned(Lhs: *Align, SizeInBytes: Imm))) {
20819 Disp = DAG.getSignedTargetConstant(Val: Imm, DL, VT: CNType);
20820 Base = DAG.getRegister(Reg: Subtarget.isPPC64() ? PPC::ZERO8 : PPC::ZERO,
20821 VT: CNType);
20822 break;
20823 }
20824 // Handle 32-bit sext immediate with LIS + Addr mode.
20825 if ((CNType == MVT::i32 || isInt<32>(x: CNImm)) &&
20826 (!Align || isAligned(Lhs: *Align, SizeInBytes: CNImm))) {
20827 int32_t Addr = (int32_t)CNImm;
20828 // Otherwise, break this down into LIS + Disp.
20829 Disp = DAG.getSignedTargetConstant(Val: (int16_t)Addr, DL, VT: MVT::i32);
20830 Base = DAG.getSignedTargetConstant(Val: (Addr - (int16_t)Addr) >> 16, DL,
20831 VT: MVT::i32);
20832 uint32_t LIS = CNType == MVT::i32 ? PPC::LIS : PPC::LIS8;
20833 Base = SDValue(DAG.getMachineNode(Opcode: LIS, dl: DL, VT: CNType, Op1: Base), 0);
20834 break;
20835 }
20836 }
20837 // Otherwise, the PPC::MOF_NotAdd flag is set. Load/Store is Non-foldable.
20838 Disp = DAG.getTargetConstant(Val: 0, DL, VT: getPointerTy(DL: DAG.getDataLayout()));
20839 if (FrameIndexSDNode *FI = dyn_cast<FrameIndexSDNode>(Val&: N)) {
20840 Base = DAG.getTargetFrameIndex(FI: FI->getIndex(), VT: N.getValueType());
20841 fixupFuncForFI(DAG, FrameIdx: FI->getIndex(), VT: N.getValueType());
20842 } else
20843 Base = N;
20844 break;
20845 }
20846 case PPC::AM_PrefixDForm: {
20847 int64_t Imm34 = 0;
20848 unsigned Opcode = N.getOpcode();
20849 if (((Opcode == ISD::ADD) || (Opcode == ISD::OR)) &&
20850 (isIntS34Immediate(Op: N.getOperand(i: 1), Imm&: Imm34))) {
20851 // N is an Add/OR Node, and its operand is a 34-bit signed immediate.
20852 Disp = DAG.getSignedTargetConstant(Val: Imm34, DL, VT: N.getValueType());
20853 if (FrameIndexSDNode *FI = dyn_cast<FrameIndexSDNode>(Val: N.getOperand(i: 0)))
20854 Base = DAG.getTargetFrameIndex(FI: FI->getIndex(), VT: N.getValueType());
20855 else
20856 Base = N.getOperand(i: 0);
20857 } else if (isIntS34Immediate(Op: N, Imm&: Imm34)) {
20858 // The address is a 34-bit signed immediate.
20859 Disp = DAG.getSignedTargetConstant(Val: Imm34, DL, VT: N.getValueType());
20860 Base = DAG.getRegister(Reg: PPC::ZERO8, VT: N.getValueType());
20861 }
20862 break;
20863 }
20864 case PPC::AM_PCRel: {
20865 // When selecting PC-Relative instructions, "Base" is not utilized as
20866 // we select the address as [PC+imm].
20867 Disp = N;
20868 break;
20869 }
20870 case PPC::AM_None:
20871 break;
20872 default: { // By default, X-Form is always available to be selected.
20873 // When a frame index is not aligned, we also match by XForm.
20874 FrameIndexSDNode *FI = dyn_cast<FrameIndexSDNode>(Val&: N);
20875 Base = FI ? N : N.getOperand(i: 1);
20876 Disp = FI ? DAG.getRegister(Reg: Subtarget.isPPC64() ? PPC::ZERO8 : PPC::ZERO,
20877 VT: N.getValueType())
20878 : N.getOperand(i: 0);
20879 break;
20880 }
20881 }
20882 return Mode;
20883}
20884
20885CCAssignFn *PPCTargetLowering::ccAssignFnForCall(CallingConv::ID CC,
20886 bool Return,
20887 bool IsVarArg) const {
20888 switch (CC) {
20889 case CallingConv::Cold:
20890 return (Return ? RetCC_PPC_Cold : CC_PPC64_ELF);
20891 default:
20892 return CC_PPC64_ELF;
20893 }
20894}
20895
20896bool PPCTargetLowering::shouldInlineQuadwordAtomics() const {
20897 return Subtarget.isPPC64() && Subtarget.hasQuadwordAtomics();
20898}
20899
20900TargetLowering::AtomicExpansionKind
20901PPCTargetLowering::shouldExpandAtomicRMWInIR(const AtomicRMWInst *AI) const {
20902 unsigned Size = AI->getType()->getPrimitiveSizeInBits();
20903 if (shouldInlineQuadwordAtomics() && Size == 128)
20904 return AtomicExpansionKind::MaskedIntrinsic;
20905
20906 switch (AI->getOperation()) {
20907 case AtomicRMWInst::UIncWrap:
20908 case AtomicRMWInst::UDecWrap:
20909 case AtomicRMWInst::USubCond:
20910 case AtomicRMWInst::USubSat:
20911 return AtomicExpansionKind::CmpXChg;
20912 default:
20913 return TargetLowering::shouldExpandAtomicRMWInIR(RMW: AI);
20914 }
20915
20916 llvm_unreachable("unreachable atomicrmw operation");
20917}
20918
20919TargetLowering::AtomicExpansionKind
20920PPCTargetLowering::shouldExpandAtomicCmpXchgInIR(
20921 const AtomicCmpXchgInst *AI) const {
20922 unsigned Size = AI->getNewValOperand()->getType()->getPrimitiveSizeInBits();
20923 if (shouldInlineQuadwordAtomics() && Size == 128)
20924 return AtomicExpansionKind::MaskedIntrinsic;
20925 return AtomicExpansionKind::LLSC;
20926}
20927
20928static Intrinsic::ID
20929getIntrinsicForAtomicRMWBinOp128(AtomicRMWInst::BinOp BinOp) {
20930 switch (BinOp) {
20931 default:
20932 llvm_unreachable("Unexpected AtomicRMW BinOp");
20933 case AtomicRMWInst::Xchg:
20934 return Intrinsic::ppc_atomicrmw_xchg_i128;
20935 case AtomicRMWInst::Add:
20936 return Intrinsic::ppc_atomicrmw_add_i128;
20937 case AtomicRMWInst::Sub:
20938 return Intrinsic::ppc_atomicrmw_sub_i128;
20939 case AtomicRMWInst::And:
20940 return Intrinsic::ppc_atomicrmw_and_i128;
20941 case AtomicRMWInst::Or:
20942 return Intrinsic::ppc_atomicrmw_or_i128;
20943 case AtomicRMWInst::Xor:
20944 return Intrinsic::ppc_atomicrmw_xor_i128;
20945 case AtomicRMWInst::Nand:
20946 return Intrinsic::ppc_atomicrmw_nand_i128;
20947 }
20948}
20949
20950Value *PPCTargetLowering::emitMaskedAtomicRMWIntrinsic(
20951 IRBuilderBase &Builder, AtomicRMWInst *AI, Value *AlignedAddr, Value *Incr,
20952 Value *Mask, Value *ShiftAmt, AtomicOrdering Ord) const {
20953 assert(shouldInlineQuadwordAtomics() && "Only support quadword now");
20954 Module *M = Builder.GetInsertBlock()->getParent()->getParent();
20955 Type *ValTy = Incr->getType();
20956 assert(ValTy->getPrimitiveSizeInBits() == 128);
20957 Type *Int64Ty = Type::getInt64Ty(C&: M->getContext());
20958 Value *IncrLo = Builder.CreateTrunc(V: Incr, DestTy: Int64Ty, Name: "incr_lo");
20959 Value *IncrHi =
20960 Builder.CreateTrunc(V: Builder.CreateLShr(LHS: Incr, RHS: 64), DestTy: Int64Ty, Name: "incr_hi");
20961 Value *LoHi = Builder.CreateIntrinsic(
20962 ID: getIntrinsicForAtomicRMWBinOp128(BinOp: AI->getOperation()), OverloadTypes: {},
20963 Args: {AlignedAddr, IncrLo, IncrHi});
20964 Value *Lo = Builder.CreateExtractValue(Agg: LoHi, Idxs: 0, Name: "lo");
20965 Value *Hi = Builder.CreateExtractValue(Agg: LoHi, Idxs: 1, Name: "hi");
20966 Lo = Builder.CreateZExt(V: Lo, DestTy: ValTy, Name: "lo64");
20967 Hi = Builder.CreateZExt(V: Hi, DestTy: ValTy, Name: "hi64");
20968 return Builder.CreateOr(
20969 LHS: Lo, RHS: Builder.CreateShl(LHS: Hi, RHS: ConstantInt::get(Ty: ValTy, V: 64)), Name: "val64");
20970}
20971
20972Value *PPCTargetLowering::emitMaskedAtomicCmpXchgIntrinsic(
20973 IRBuilderBase &Builder, AtomicCmpXchgInst *CI, Value *AlignedAddr,
20974 Value *CmpVal, Value *NewVal, Value *Mask, AtomicOrdering Ord) const {
20975 assert(shouldInlineQuadwordAtomics() && "Only support quadword now");
20976 Module *M = Builder.GetInsertBlock()->getParent()->getParent();
20977 Type *ValTy = CmpVal->getType();
20978 assert(ValTy->getPrimitiveSizeInBits() == 128);
20979 Function *IntCmpXchg =
20980 Intrinsic::getOrInsertDeclaration(M, id: Intrinsic::ppc_cmpxchg_i128);
20981 Type *Int64Ty = Type::getInt64Ty(C&: M->getContext());
20982 Value *CmpLo = Builder.CreateTrunc(V: CmpVal, DestTy: Int64Ty, Name: "cmp_lo");
20983 Value *CmpHi =
20984 Builder.CreateTrunc(V: Builder.CreateLShr(LHS: CmpVal, RHS: 64), DestTy: Int64Ty, Name: "cmp_hi");
20985 Value *NewLo = Builder.CreateTrunc(V: NewVal, DestTy: Int64Ty, Name: "new_lo");
20986 Value *NewHi =
20987 Builder.CreateTrunc(V: Builder.CreateLShr(LHS: NewVal, RHS: 64), DestTy: Int64Ty, Name: "new_hi");
20988 emitLeadingFence(Builder, Inst: CI, Ord);
20989 Value *LoHi =
20990 Builder.CreateCall(Callee: IntCmpXchg, Args: {AlignedAddr, CmpLo, CmpHi, NewLo, NewHi});
20991 emitTrailingFence(Builder, Inst: CI, Ord);
20992 Value *Lo = Builder.CreateExtractValue(Agg: LoHi, Idxs: 0, Name: "lo");
20993 Value *Hi = Builder.CreateExtractValue(Agg: LoHi, Idxs: 1, Name: "hi");
20994 Lo = Builder.CreateZExt(V: Lo, DestTy: ValTy, Name: "lo64");
20995 Hi = Builder.CreateZExt(V: Hi, DestTy: ValTy, Name: "hi64");
20996 return Builder.CreateOr(
20997 LHS: Lo, RHS: Builder.CreateShl(LHS: Hi, RHS: ConstantInt::get(Ty: ValTy, V: 64)), Name: "val64");
20998}
20999
21000bool PPCTargetLowering::hasMultipleConditionRegisters(EVT VT) const {
21001 return Subtarget.useCRBits();
21002}
21003
21004/// Shuffle masks for vectors of bits are not legal as such vectors are
21005/// reserved for MMA/DM.
21006bool PPCTargetLowering::isShuffleMaskLegal(ArrayRef<int> Mask, EVT VT) const {
21007 if (VT.getScalarType() == MVT::i1)
21008 return false;
21009 return TargetLowering::isShuffleMaskLegal(Mask, VT);
21010}
21011
21012// Optimize the following patterns using vbpermq/vbpermd:
21013// i16 = bitcast(v16i1 truncate(v16i8))
21014// i8 = bitcast(v8i1 truncate(v8i16))
21015// i8 = bitcast(v8i1 truncate(v8i8))
21016SDValue PPCTargetLowering::DAGCombineBitcast(SDNode *N,
21017 DAGCombinerInfo &DCI) const {
21018 SDValue Op0 = N->getOperand(Num: 0);
21019 if (Op0.getOpcode() != ISD::TRUNCATE)
21020 return SDValue();
21021 SDValue Src = Op0.getOperand(i: 0);
21022 EVT ResVT = N->getValueType(ResNo: 0);
21023 EVT TruncResVT = Op0.getValueType();
21024 EVT SrcVT = Src.getValueType();
21025 SDLoc dl(N);
21026 SelectionDAG &DAG = DCI.DAG;
21027 bool IsLittleEndian = Subtarget.isLittleEndian();
21028
21029 if (ResVT != MVT::i16 && ResVT != MVT::i8)
21030 return SDValue();
21031 SDValue VBPerm =
21032 GenerateVBPERM(DAG, dl, Src, SrcVT, ResVT: TruncResVT, IsLE: IsLittleEndian);
21033 if (!VBPerm)
21034 return SDValue();
21035 SDValue ForExtract = DAG.getBitcast(VT: MVT::v4i32, V: VBPerm);
21036 SDValue Extracted =
21037 DAG.getNode(Opcode: ISD::EXTRACT_VECTOR_ELT, DL: dl, VT: MVT::i32, N1: ForExtract,
21038 N2: DAG.getIntPtrConstant(Val: IsLittleEndian ? 2 : 1, DL: dl));
21039 return DAG.getNode(Opcode: ISD::TRUNCATE, DL: dl, VT: ResVT, Operand: Extracted);
21040}
21041
21042SDValue PPCTargetLowering::GenerateVBPERM(SelectionDAG &DAG, SDLoc dl,
21043 SDValue Src, EVT SrcVT, EVT ResVT,
21044 bool IsLE) const {
21045 bool IsV16i8 = (ResVT == MVT::v16i1 && SrcVT == MVT::v16i8);
21046 bool IsV8i16 = (ResVT == MVT::v8i1 && SrcVT == MVT::v8i16);
21047 bool IsV8i8 = (ResVT == MVT::v8i1 && SrcVT == MVT::v8i8);
21048
21049 if (!IsV16i8 && !IsV8i16 && !IsV8i8)
21050 return SDValue();
21051
21052 if (IsV8i8) {
21053 Src = DAG.getNode(Opcode: ISD::INSERT_SUBVECTOR, DL: dl, VT: MVT::v16i8,
21054 N1: DAG.getUNDEF(VT: MVT::v16i8), N2: Src,
21055 N3: DAG.getIntPtrConstant(Val: 0, DL: dl));
21056 }
21057 SmallVector<int, 16> BitIndices(16, 128);
21058 unsigned NumElts = SrcVT.getVectorNumElements();
21059 unsigned EltSize = SrcVT.getScalarType().getSizeInBits();
21060 for (int Idx = 0, End = SrcVT.getVectorNumElements(); Idx < End; Idx++) {
21061 BitIndices[Idx] = EltSize * (NumElts - Idx) - 1;
21062 if (IsV8i8 && IsLE)
21063 BitIndices[Idx] += 64;
21064 }
21065 if (!IsLE)
21066 std::reverse(first: BitIndices.begin(), last: BitIndices.end());
21067 SmallVector<SDValue, 16> BVOps;
21068 for (auto Idx : BitIndices)
21069 BVOps.push_back(Elt: DAG.getConstant(Val: Idx, DL: dl, VT: MVT::i8));
21070 SDValue VRB = DAG.getBuildVector(VT: MVT::v16i8, DL: dl, Ops: BVOps);
21071 return DAG.getNode(
21072 Opcode: ISD::INTRINSIC_WO_CHAIN, DL: dl, VT: MVT::v16i8,
21073 N1: DAG.getConstant(Val: Intrinsic::ppc_altivec_vbpermq, DL: dl, VT: MVT::i32),
21074 N2: DAG.getBitcast(VT: MVT::v16i8, V: Src), N3: VRB);
21075}
21076
21077// For Power8/9, optimize vec splats of small FP values that can be
21078// represented as integers. Use vspltisw + xvcvsxwdp/xvcvsxwsp instead of
21079// loading from constant pool.
21080SDValue PPCTargetLowering::LowerVecSplatSmallFP(SDValue Op, SelectionDAG &DAG,
21081 bool BVNIsConstantSplat,
21082 unsigned SplatBitSize) const {
21083
21084 if (!BVNIsConstantSplat || !Subtarget.hasVSX() || !Subtarget.hasP8Vector() ||
21085 Subtarget.hasP10Vector())
21086 return SDValue();
21087
21088 EVT VT = Op->getValueType(ResNo: 0);
21089 if (!((SplatBitSize == 64 && VT == MVT::v2f64) ||
21090 (SplatBitSize == 32 && VT == MVT::v4f32)))
21091 return SDValue();
21092
21093 auto *CN = dyn_cast<ConstantFPSDNode>(Val: Op.getOperand(i: 0));
21094 if (!CN)
21095 return SDValue();
21096
21097 APFloat APFloatVal = CN->getValueAPF();
21098 bool IsExact;
21099 APSInt IntResult(16, false);
21100 APFloatVal.convertToInteger(Result&: IntResult, RM: APFloat::rmTowardZero, IsExact: &IsExact);
21101
21102 if (!(IsExact && IntResult <= 15 && IntResult >= -16 && !APFloatVal.isZero()))
21103 return SDValue();
21104
21105 int64_t IntVal = IntResult.getSExtValue();
21106
21107 SDLoc dl(Op);
21108 SDValue IntSplat = getCanonicalConstSplat(Val: IntVal, SplatSize: 4, VT: MVT::v4i32, DAG, dl);
21109
21110 if (SplatBitSize == 64)
21111 return DAG.getNode(
21112 Opcode: ISD::INTRINSIC_WO_CHAIN, DL: dl, VT: MVT::v2f64,
21113 N1: DAG.getConstant(Val: Intrinsic::ppc_vsx_xvcvsxwdp, DL: dl, VT: MVT::i32), N2: IntSplat);
21114
21115 return DAG.getNode(Opcode: PPCISD::XVCVSXWSP, DL: dl, VT: MVT::v4f32, Operand: IntSplat);
21116}
21117