//===-- PPCISelLowering.cpp - PPC DAG Lowering Implementation -------------===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
//
// This file implements the PPCISelLowering class.
//
//===----------------------------------------------------------------------===//

#include "PPCISelLowering.h"
#include "MCTargetDesc/PPCMCTargetDesc.h"
#include "MCTargetDesc/PPCPredicates.h"
#include "PPC.h"
#include "PPCCallingConv.h"
#include "PPCFrameLowering.h"
#include "PPCInstrInfo.h"
#include "PPCMachineFunctionInfo.h"
#include "PPCPerfectShuffle.h"
#include "PPCRegisterInfo.h"
#include "PPCSelectionDAGInfo.h"
#include "PPCSubtarget.h"
#include "PPCTargetMachine.h"
#include "llvm/ADT/APFloat.h"
#include "llvm/ADT/APInt.h"
#include "llvm/ADT/APSInt.h"
#include "llvm/ADT/ArrayRef.h"
#include "llvm/ADT/DenseMap.h"
#include "llvm/ADT/STLExtras.h"
#include "llvm/ADT/SmallPtrSet.h"
#include "llvm/ADT/SmallVector.h"
#include "llvm/ADT/Statistic.h"
#include "llvm/ADT/StringRef.h"
#include "llvm/CodeGen/CallingConvLower.h"
#include "llvm/CodeGen/ISDOpcodes.h"
#include "llvm/CodeGen/LivePhysRegs.h"
#include "llvm/CodeGen/MachineBasicBlock.h"
#include "llvm/CodeGen/MachineFrameInfo.h"
#include "llvm/CodeGen/MachineFunction.h"
#include "llvm/CodeGen/MachineInstr.h"
#include "llvm/CodeGen/MachineInstrBuilder.h"
#include "llvm/CodeGen/MachineJumpTableInfo.h"
#include "llvm/CodeGen/MachineLoopInfo.h"
#include "llvm/CodeGen/MachineMemOperand.h"
#include "llvm/CodeGen/MachineModuleInfo.h"
#include "llvm/CodeGen/MachineOperand.h"
#include "llvm/CodeGen/MachineRegisterInfo.h"
#include "llvm/CodeGen/SelectionDAG.h"
#include "llvm/CodeGen/SelectionDAGNodes.h"
#include "llvm/CodeGen/TargetInstrInfo.h"
#include "llvm/CodeGen/TargetLowering.h"
#include "llvm/CodeGen/TargetLoweringObjectFileImpl.h"
#include "llvm/CodeGen/TargetRegisterInfo.h"
#include "llvm/CodeGen/ValueTypes.h"
#include "llvm/CodeGenTypes/MachineValueType.h"
#include "llvm/IR/CallingConv.h"
#include "llvm/IR/Constant.h"
#include "llvm/IR/Constants.h"
#include "llvm/IR/DataLayout.h"
#include "llvm/IR/DebugLoc.h"
#include "llvm/IR/DerivedTypes.h"
#include "llvm/IR/Function.h"
#include "llvm/IR/GlobalValue.h"
#include "llvm/IR/IRBuilder.h"
#include "llvm/IR/Instructions.h"
#include "llvm/IR/Intrinsics.h"
#include "llvm/IR/IntrinsicsPowerPC.h"
#include "llvm/IR/Module.h"
#include "llvm/IR/Type.h"
#include "llvm/IR/Use.h"
#include "llvm/IR/Value.h"
#include "llvm/MC/MCContext.h"
#include "llvm/MC/MCExpr.h"
#include "llvm/MC/MCSectionXCOFF.h"
#include "llvm/MC/MCSymbolXCOFF.h"
#include "llvm/Support/AtomicOrdering.h"
#include "llvm/Support/BranchProbability.h"
#include "llvm/Support/Casting.h"
#include "llvm/Support/CodeGen.h"
#include "llvm/Support/CommandLine.h"
#include "llvm/Support/Compiler.h"
#include "llvm/Support/Debug.h"
#include "llvm/Support/ErrorHandling.h"
#include "llvm/Support/Format.h"
#include "llvm/Support/KnownBits.h"
#include "llvm/Support/MathExtras.h"
#include "llvm/Support/raw_ostream.h"
#include "llvm/Target/TargetMachine.h"
#include "llvm/Target/TargetOptions.h"
#include <algorithm>
#include <cassert>
#include <cstdint>
#include <iterator>
#include <list>
#include <optional>
#include <utility>
#include <vector>

101using namespace llvm;
102
103#define DEBUG_TYPE "ppc-lowering"
104
105static cl::opt<bool> DisableP10StoreForward(
106 "disable-p10-store-forward",
107 cl::desc("disable P10 store forward-friendly conversion"), cl::Hidden,
108 cl::init(Val: false));
109
110static cl::opt<bool> DisablePPCPreinc("disable-ppc-preinc",
111cl::desc("disable preincrement load/store generation on PPC"), cl::Hidden);
112
113static cl::opt<bool> DisableILPPref("disable-ppc-ilp-pref",
114cl::desc("disable setting the node scheduling preference to ILP on PPC"), cl::Hidden);
115
116static cl::opt<bool> DisablePPCUnaligned("disable-ppc-unaligned",
117cl::desc("disable unaligned load/store generation on PPC"), cl::Hidden);
118
119static cl::opt<bool> DisableSCO("disable-ppc-sco",
120cl::desc("disable sibling call optimization on ppc"), cl::Hidden);
121
122static cl::opt<bool> DisableInnermostLoopAlign32("disable-ppc-innermost-loop-align32",
123cl::desc("don't always align innermost loop to 32 bytes on ppc"), cl::Hidden);
124
125static cl::opt<bool> UseAbsoluteJumpTables("ppc-use-absolute-jumptables",
126cl::desc("use absolute jump tables on ppc"), cl::Hidden);
127
128static cl::opt<bool>
129 DisablePerfectShuffle("ppc-disable-perfect-shuffle",
130 cl::desc("disable vector permute decomposition"),
131 cl::init(Val: true), cl::Hidden);
132
133cl::opt<bool> DisableAutoPairedVecSt(
134 "disable-auto-paired-vec-st",
135 cl::desc("disable automatically generated 32byte paired vector stores"),
136 cl::init(Val: true), cl::Hidden);
137
138static cl::opt<unsigned> PPCMinimumJumpTableEntries(
139 "ppc-min-jump-table-entries", cl::init(Val: 64), cl::Hidden,
140 cl::desc("Set minimum number of entries to use a jump table on PPC"));
141
142static cl::opt<unsigned> PPCMinimumBitTestCmps(
143 "ppc-min-bit-test-cmps", cl::init(Val: 3), cl::Hidden,
144 cl::desc("Set minimum of largest number of comparisons to use bit test for "
145 "switch on PPC."));
146
147static cl::opt<unsigned> PPCGatherAllAliasesMaxDepth(
148 "ppc-gather-alias-max-depth", cl::init(Val: 18), cl::Hidden,
149 cl::desc("max depth when checking alias info in GatherAllAliases()"));
150
151static cl::opt<unsigned> PPCAIXTLSModelOptUseIEForLDLimit(
152 "ppc-aix-shared-lib-tls-model-opt-limit", cl::init(Val: 1), cl::Hidden,
153 cl::desc("Set inclusive limit count of TLS local-dynamic access(es) in a "
154 "function to use initial-exec"));
155
156STATISTIC(NumTailCalls, "Number of tail calls");
157STATISTIC(NumSiblingCalls, "Number of sibling calls");
158STATISTIC(ShufflesHandledWithVPERM,
159 "Number of shuffles lowered to a VPERM or XXPERM");
160STATISTIC(NumDynamicAllocaProbed, "Number of dynamic stack allocation probed");
161
162static bool isNByteElemShuffleMask(ShuffleVectorSDNode *, unsigned, int);
163
164static SDValue widenVec(SelectionDAG &DAG, SDValue Vec, const SDLoc &dl);
165
166// A faster local-[exec|dynamic] TLS access sequence (enabled with the
167// -maix-small-local-[exec|dynamic]-tls option) can be produced for TLS
168// variables; consistent with the IBM XL compiler, we apply a max size of
169// slightly under 32KB.
170constexpr uint64_t AIXSmallTlsPolicySizeLimit = 32751;
171
172// FIXME: Remove this once the bug has been fixed!
173extern cl::opt<bool> ANDIGlueBug;
174
175PPCTargetLowering::PPCTargetLowering(const PPCTargetMachine &TM,
176 const PPCSubtarget &STI)
177 : TargetLowering(TM, STI), Subtarget(STI) {
178 // Initialize map that relates the PPC addressing modes to the computed flags
179 // of a load/store instruction. The map is used to determine the optimal
180 // addressing mode when selecting load and stores.
181 initializeAddrModeMap();
182 // On PPC32/64, arguments smaller than 4/8 bytes are extended, so all
183 // arguments are at least 4/8 bytes aligned.
184 bool isPPC64 = Subtarget.isPPC64();
185 setMinStackArgumentAlignment(isPPC64 ? Align(8) : Align(4));
186 const MVT RegVT = Subtarget.getScalarIntVT();
187
188 // Set up the register classes.
189 addRegisterClass(VT: MVT::i32, RC: &PPC::GPRCRegClass);
190 if (!useSoftFloat()) {
191 if (hasSPE()) {
192 addRegisterClass(VT: MVT::f32, RC: &PPC::GPRCRegClass);
193 // EFPU2 APU only supports f32
194 if (!Subtarget.hasEFPU2())
195 addRegisterClass(VT: MVT::f64, RC: &PPC::SPERCRegClass);
196 } else {
197 addRegisterClass(VT: MVT::f32, RC: &PPC::F4RCRegClass);
198 addRegisterClass(VT: MVT::f64, RC: &PPC::F8RCRegClass);
199 }
200 }
201
202 setOperationAction(Op: ISD::UADDO, VT: RegVT, Action: Custom);
203 setOperationAction(Op: ISD::USUBO, VT: RegVT, Action: Custom);
204
205 // PowerPC uses addo_carry,subo_carry to propagate carry.
206 setOperationAction(Op: ISD::UADDO_CARRY, VT: RegVT, Action: Custom);
207 setOperationAction(Op: ISD::USUBO_CARRY, VT: RegVT, Action: Custom);
208
209 // On P10, the default lowering generates better code using the
210 // setbc instruction.
211 if (!Subtarget.hasP10Vector()) {
212 setOperationAction(Op: ISD::SSUBO, VT: MVT::i32, Action: Custom);
213 setOperationAction(Op: ISD::SADDO, VT: MVT::i32, Action: Custom);
214 if (isPPC64) {
215 setOperationAction(Op: ISD::SSUBO, VT: MVT::i64, Action: Custom);
216 setOperationAction(Op: ISD::SADDO, VT: MVT::i64, Action: Custom);
217 }
218 }
219
220 // Match BITREVERSE to customized fast code sequence in the td file.
221 setOperationAction(Op: ISD::BITREVERSE, VT: MVT::i32, Action: Legal);
222 setOperationAction(Op: ISD::BITREVERSE, VT: MVT::i64, Action: Legal);
223
224 // Sub-word ATOMIC_CMP_SWAP need to ensure that the input is zero-extended.
225 setOperationAction(Op: ISD::ATOMIC_CMP_SWAP, VT: MVT::i32, Action: Custom);
226
227 // Custom lower inline assembly to check for special registers.
228 setOperationAction(Op: ISD::INLINEASM, VT: MVT::Other, Action: Custom);
229 setOperationAction(Op: ISD::INLINEASM_BR, VT: MVT::Other, Action: Custom);
230
231 // PowerPC has an i16 but no i8 (or i1) SEXTLOAD.
232 for (MVT VT : MVT::integer_valuetypes()) {
233 setLoadExtAction(ExtType: ISD::SEXTLOAD, ValVT: VT, MemVT: MVT::i1, Action: Promote);
234 setLoadExtAction(ExtType: ISD::SEXTLOAD, ValVT: VT, MemVT: MVT::i8, Action: Expand);
235 }
236
237 setTruncStoreAction(ValVT: MVT::f128, MemVT: MVT::f16, Action: Expand);
238 setOperationAction(Op: ISD::FP_TO_FP16, VT: MVT::f128, Action: Expand);
239
240 if (Subtarget.isISA3_0()) {
241 setLoadExtAction(ExtType: ISD::EXTLOAD, ValVT: MVT::f128, MemVT: MVT::f16, Action: Legal);
242 setLoadExtAction(ExtType: ISD::EXTLOAD, ValVT: MVT::f64, MemVT: MVT::f16, Action: Legal);
243 setLoadExtAction(ExtType: ISD::EXTLOAD, ValVT: MVT::f32, MemVT: MVT::f16, Action: Legal);
244 setTruncStoreAction(ValVT: MVT::f64, MemVT: MVT::f16, Action: Legal);
245 setTruncStoreAction(ValVT: MVT::f32, MemVT: MVT::f16, Action: Legal);
246 } else {
247 // No extending loads from f16 or HW conversions back and forth.
248 setLoadExtAction(ExtType: ISD::EXTLOAD, ValVT: MVT::f128, MemVT: MVT::f16, Action: Expand);
249 setOperationAction(Op: ISD::FP16_TO_FP, VT: MVT::f128, Action: Expand);
250 setLoadExtAction(ExtType: ISD::EXTLOAD, ValVT: MVT::f64, MemVT: MVT::f16, Action: Expand);
251 setOperationAction(Op: ISD::FP16_TO_FP, VT: MVT::f64, Action: Expand);
252 setOperationAction(Op: ISD::FP_TO_FP16, VT: MVT::f64, Action: Expand);
253 setLoadExtAction(ExtType: ISD::EXTLOAD, ValVT: MVT::f32, MemVT: MVT::f16, Action: Expand);
254 setOperationAction(Op: ISD::FP16_TO_FP, VT: MVT::f32, Action: Expand);
255 setOperationAction(Op: ISD::FP_TO_FP16, VT: MVT::f32, Action: Expand);
256 setTruncStoreAction(ValVT: MVT::f64, MemVT: MVT::f16, Action: Expand);
257 setTruncStoreAction(ValVT: MVT::f32, MemVT: MVT::f16, Action: Expand);
258 }
259
260 setTruncStoreAction(ValVT: MVT::f64, MemVT: MVT::f32, Action: Expand);
261
262 // PowerPC has pre-inc load and store's.
263 setIndexedLoadAction(IdxModes: ISD::PRE_INC, VT: MVT::i1, Action: Legal);
264 setIndexedLoadAction(IdxModes: ISD::PRE_INC, VT: MVT::i8, Action: Legal);
265 setIndexedLoadAction(IdxModes: ISD::PRE_INC, VT: MVT::i16, Action: Legal);
266 setIndexedLoadAction(IdxModes: ISD::PRE_INC, VT: MVT::i32, Action: Legal);
267 setIndexedLoadAction(IdxModes: ISD::PRE_INC, VT: MVT::i64, Action: Legal);
268 setIndexedStoreAction(IdxModes: ISD::PRE_INC, VT: MVT::i1, Action: Legal);
269 setIndexedStoreAction(IdxModes: ISD::PRE_INC, VT: MVT::i8, Action: Legal);
270 setIndexedStoreAction(IdxModes: ISD::PRE_INC, VT: MVT::i16, Action: Legal);
271 setIndexedStoreAction(IdxModes: ISD::PRE_INC, VT: MVT::i32, Action: Legal);
272 setIndexedStoreAction(IdxModes: ISD::PRE_INC, VT: MVT::i64, Action: Legal);
273 if (!Subtarget.hasSPE()) {
274 setIndexedLoadAction(IdxModes: ISD::PRE_INC, VT: MVT::f32, Action: Legal);
275 setIndexedLoadAction(IdxModes: ISD::PRE_INC, VT: MVT::f64, Action: Legal);
276 setIndexedStoreAction(IdxModes: ISD::PRE_INC, VT: MVT::f32, Action: Legal);
277 setIndexedStoreAction(IdxModes: ISD::PRE_INC, VT: MVT::f64, Action: Legal);
278 }
279
280 if (Subtarget.useCRBits()) {
281 setOperationAction(Op: ISD::SIGN_EXTEND_INREG, VT: MVT::i1, Action: Expand);
282
283 if (isPPC64 || Subtarget.hasFPCVT()) {
284 setOperationAction(Op: ISD::STRICT_SINT_TO_FP, VT: MVT::i1, Action: Promote);
285 AddPromotedToType(Opc: ISD::STRICT_SINT_TO_FP, OrigVT: MVT::i1, DestVT: RegVT);
286 setOperationAction(Op: ISD::STRICT_UINT_TO_FP, VT: MVT::i1, Action: Promote);
287 AddPromotedToType(Opc: ISD::STRICT_UINT_TO_FP, OrigVT: MVT::i1, DestVT: RegVT);
288
289 setOperationAction(Op: ISD::SINT_TO_FP, VT: MVT::i1, Action: Promote);
290 AddPromotedToType(Opc: ISD::SINT_TO_FP, OrigVT: MVT::i1, DestVT: RegVT);
291 setOperationAction(Op: ISD::UINT_TO_FP, VT: MVT::i1, Action: Promote);
292 AddPromotedToType(Opc: ISD::UINT_TO_FP, OrigVT: MVT::i1, DestVT: RegVT);
293
294 setOperationAction(Op: ISD::STRICT_FP_TO_SINT, VT: MVT::i1, Action: Promote);
295 AddPromotedToType(Opc: ISD::STRICT_FP_TO_SINT, OrigVT: MVT::i1, DestVT: RegVT);
296 setOperationAction(Op: ISD::STRICT_FP_TO_UINT, VT: MVT::i1, Action: Promote);
297 AddPromotedToType(Opc: ISD::STRICT_FP_TO_UINT, OrigVT: MVT::i1, DestVT: RegVT);
298
299 setOperationAction(Op: ISD::FP_TO_SINT, VT: MVT::i1, Action: Promote);
300 AddPromotedToType(Opc: ISD::FP_TO_SINT, OrigVT: MVT::i1, DestVT: RegVT);
301 setOperationAction(Op: ISD::FP_TO_UINT, VT: MVT::i1, Action: Promote);
302 AddPromotedToType(Opc: ISD::FP_TO_UINT, OrigVT: MVT::i1, DestVT: RegVT);
303 } else {
304 setOperationAction(Op: ISD::STRICT_SINT_TO_FP, VT: MVT::i1, Action: Custom);
305 setOperationAction(Op: ISD::STRICT_UINT_TO_FP, VT: MVT::i1, Action: Custom);
306 setOperationAction(Op: ISD::SINT_TO_FP, VT: MVT::i1, Action: Custom);
307 setOperationAction(Op: ISD::UINT_TO_FP, VT: MVT::i1, Action: Custom);
308 }
309
310 // PowerPC does not support direct load/store of condition registers.
311 setOperationAction(Op: ISD::LOAD, VT: MVT::i1, Action: Custom);
312 setOperationAction(Op: ISD::STORE, VT: MVT::i1, Action: Custom);
313
314 // FIXME: Remove this once the ANDI glue bug is fixed:
315 if (ANDIGlueBug)
316 setOperationAction(Op: ISD::TRUNCATE, VT: MVT::i1, Action: Custom);
317
318 for (MVT VT : MVT::integer_valuetypes()) {
319 setLoadExtAction(ExtType: ISD::SEXTLOAD, ValVT: VT, MemVT: MVT::i1, Action: Promote);
320 setLoadExtAction(ExtType: ISD::ZEXTLOAD, ValVT: VT, MemVT: MVT::i1, Action: Promote);
321 setTruncStoreAction(ValVT: VT, MemVT: MVT::i1, Action: Expand);
322 }
323
324 addRegisterClass(VT: MVT::i1, RC: &PPC::CRBITRCRegClass);
325 }
326
327 // Expand ppcf128 to i32 by hand for the benefit of llvm-gcc bootstrap on
328 // PPC (the libcall is not available).
329 setOperationAction(Op: ISD::FP_TO_SINT, VT: MVT::ppcf128, Action: Custom);
330 setOperationAction(Op: ISD::FP_TO_UINT, VT: MVT::ppcf128, Action: Custom);
331 setOperationAction(Op: ISD::STRICT_FP_TO_SINT, VT: MVT::ppcf128, Action: Custom);
332 setOperationAction(Op: ISD::STRICT_FP_TO_UINT, VT: MVT::ppcf128, Action: Custom);
333
334 // We do not currently implement these libm ops for PowerPC.
335 setOperationAction(Op: ISD::FFLOOR, VT: MVT::ppcf128, Action: Expand);
336 setOperationAction(Op: ISD::FCEIL, VT: MVT::ppcf128, Action: Expand);
337 setOperationAction(Op: ISD::FTRUNC, VT: MVT::ppcf128, Action: Expand);
338 setOperationAction(Op: ISD::FRINT, VT: MVT::ppcf128, Action: Expand);
339 setOperationAction(Op: ISD::FNEARBYINT, VT: MVT::ppcf128, Action: Expand);
340 setOperationAction(Op: ISD::FREM, VT: MVT::ppcf128, Action: LibCall);
341
342 // PowerPC has no SREM/UREM instructions unless we are on P9
343 // On P9 we may use a hardware instruction to compute the remainder.
344 // When the result of both the remainder and the division is required it is
345 // more efficient to compute the remainder from the result of the division
346 // rather than use the remainder instruction. The instructions are legalized
347 // directly because the DivRemPairsPass performs the transformation at the IR
348 // level.
349 if (Subtarget.isISA3_0()) {
350 setOperationAction(Op: ISD::SREM, VT: MVT::i32, Action: Legal);
351 setOperationAction(Op: ISD::UREM, VT: MVT::i32, Action: Legal);
352 setOperationAction(Op: ISD::SREM, VT: MVT::i64, Action: Legal);
353 setOperationAction(Op: ISD::UREM, VT: MVT::i64, Action: Legal);
354 } else {
355 setOperationAction(Op: ISD::SREM, VT: MVT::i32, Action: Expand);
356 setOperationAction(Op: ISD::UREM, VT: MVT::i32, Action: Expand);
357 setOperationAction(Op: ISD::SREM, VT: MVT::i64, Action: Expand);
358 setOperationAction(Op: ISD::UREM, VT: MVT::i64, Action: Expand);
359 }
360
361 // Don't use SMUL_LOHI/UMUL_LOHI or SDIVREM/UDIVREM to lower SREM/UREM.
362 setOperationAction(Op: ISD::UMUL_LOHI, VT: MVT::i32, Action: Expand);
363 setOperationAction(Op: ISD::SMUL_LOHI, VT: MVT::i32, Action: Expand);
364 setOperationAction(Op: ISD::UMUL_LOHI, VT: MVT::i64, Action: Expand);
365 setOperationAction(Op: ISD::SMUL_LOHI, VT: MVT::i64, Action: Expand);
366 setOperationAction(Op: ISD::UDIVREM, VT: MVT::i32, Action: Expand);
367 setOperationAction(Op: ISD::SDIVREM, VT: MVT::i32, Action: Expand);
368 setOperationAction(Op: ISD::UDIVREM, VT: MVT::i64, Action: Expand);
369 setOperationAction(Op: ISD::SDIVREM, VT: MVT::i64, Action: Expand);
370
371 // Handle constrained floating-point operations of scalar.
372 // TODO: Handle SPE specific operation.
373 setOperationAction(Op: ISD::STRICT_FADD, VT: MVT::f32, Action: Legal);
374 setOperationAction(Op: ISD::STRICT_FSUB, VT: MVT::f32, Action: Legal);
375 setOperationAction(Op: ISD::STRICT_FMUL, VT: MVT::f32, Action: Legal);
376 setOperationAction(Op: ISD::STRICT_FDIV, VT: MVT::f32, Action: Legal);
377 setOperationAction(Op: ISD::STRICT_FP_ROUND, VT: MVT::f32, Action: Legal);
378
379 setOperationAction(Op: ISD::STRICT_FADD, VT: MVT::f64, Action: Legal);
380 setOperationAction(Op: ISD::STRICT_FSUB, VT: MVT::f64, Action: Legal);
381 setOperationAction(Op: ISD::STRICT_FMUL, VT: MVT::f64, Action: Legal);
382 setOperationAction(Op: ISD::STRICT_FDIV, VT: MVT::f64, Action: Legal);
383
384 if (!Subtarget.hasSPE()) {
385 setOperationAction(Op: ISD::STRICT_FMA, VT: MVT::f32, Action: Legal);
386 setOperationAction(Op: ISD::STRICT_FMA, VT: MVT::f64, Action: Legal);
387 }
388
389 if (Subtarget.hasVSX()) {
390 setOperationAction(Op: ISD::STRICT_FRINT, VT: MVT::f32, Action: Legal);
391 setOperationAction(Op: ISD::STRICT_FRINT, VT: MVT::f64, Action: Legal);
392 }
393
394 if (Subtarget.hasFSQRT()) {
395 setOperationAction(Op: ISD::STRICT_FSQRT, VT: MVT::f32, Action: Legal);
396 setOperationAction(Op: ISD::STRICT_FSQRT, VT: MVT::f64, Action: Legal);
397 }
398
399 if (Subtarget.hasFPRND()) {
400 setOperationAction(Op: ISD::STRICT_FFLOOR, VT: MVT::f32, Action: Legal);
401 setOperationAction(Op: ISD::STRICT_FCEIL, VT: MVT::f32, Action: Legal);
402 setOperationAction(Op: ISD::STRICT_FTRUNC, VT: MVT::f32, Action: Legal);
403 setOperationAction(Op: ISD::STRICT_FROUND, VT: MVT::f32, Action: Legal);
404
405 setOperationAction(Op: ISD::STRICT_FFLOOR, VT: MVT::f64, Action: Legal);
406 setOperationAction(Op: ISD::STRICT_FCEIL, VT: MVT::f64, Action: Legal);
407 setOperationAction(Op: ISD::STRICT_FTRUNC, VT: MVT::f64, Action: Legal);
408 setOperationAction(Op: ISD::STRICT_FROUND, VT: MVT::f64, Action: Legal);
409 }
410
411 // We don't support sin/cos/sqrt/fmod/pow
412 setOperationAction(Op: ISD::FSIN , VT: MVT::f64, Action: Expand);
413 setOperationAction(Op: ISD::FCOS , VT: MVT::f64, Action: Expand);
414 setOperationAction(Op: ISD::FSINCOS, VT: MVT::f64, Action: Expand);
415 setOperationAction(Op: ISD::FREM, VT: MVT::f64, Action: LibCall);
416 setOperationAction(Op: ISD::FPOW , VT: MVT::f64, Action: Expand);
417 setOperationAction(Op: ISD::FSIN , VT: MVT::f32, Action: Expand);
418 setOperationAction(Op: ISD::FCOS , VT: MVT::f32, Action: Expand);
419 setOperationAction(Op: ISD::FSINCOS, VT: MVT::f32, Action: Expand);
420 setOperationAction(Op: ISD::FREM, VT: MVT::f32, Action: LibCall);
421 setOperationAction(Op: ISD::FPOW , VT: MVT::f32, Action: Expand);
422
423 // MASS transformation for LLVM intrinsics with replicating fast-math flag
424 // to be consistent to PPCGenScalarMASSEntries pass
425 if (TM.getOptLevel() == CodeGenOptLevel::Aggressive) {
426 setOperationAction(Op: ISD::FSIN , VT: MVT::f64, Action: Custom);
427 setOperationAction(Op: ISD::FCOS , VT: MVT::f64, Action: Custom);
428 setOperationAction(Op: ISD::FPOW , VT: MVT::f64, Action: Custom);
429 setOperationAction(Op: ISD::FLOG, VT: MVT::f64, Action: Custom);
430 setOperationAction(Op: ISD::FLOG10, VT: MVT::f64, Action: Custom);
431 setOperationAction(Op: ISD::FEXP, VT: MVT::f64, Action: Custom);
432 setOperationAction(Op: ISD::FSIN , VT: MVT::f32, Action: Custom);
433 setOperationAction(Op: ISD::FCOS , VT: MVT::f32, Action: Custom);
434 setOperationAction(Op: ISD::FPOW , VT: MVT::f32, Action: Custom);
435 setOperationAction(Op: ISD::FLOG, VT: MVT::f32, Action: Custom);
436 setOperationAction(Op: ISD::FLOG10, VT: MVT::f32, Action: Custom);
437 setOperationAction(Op: ISD::FEXP, VT: MVT::f32, Action: Custom);
438 }
439
440 if (Subtarget.hasSPE()) {
441 setOperationAction(Op: ISD::FMA , VT: MVT::f64, Action: Expand);
442 setOperationAction(Op: ISD::FMA , VT: MVT::f32, Action: Expand);
443 } else {
444 setOperationAction(Op: ISD::FMA , VT: MVT::f64, Action: Legal);
445 setOperationAction(Op: ISD::FMA , VT: MVT::f32, Action: Legal);
446 setOperationAction(Op: ISD::GET_ROUNDING, VT: MVT::i32, Action: Custom);
447 setOperationAction(Op: ISD::SET_ROUNDING, VT: MVT::Other, Action: Custom);
448 }
449
450 if (Subtarget.hasSPE())
451 setLoadExtAction(ExtType: ISD::EXTLOAD, ValVT: MVT::f64, MemVT: MVT::f32, Action: Expand);
452
453 // If we're enabling GP optimizations, use hardware square root
454 if (!Subtarget.hasFSQRT() && !(Subtarget.hasFRSQRTE() && Subtarget.hasFRE()))
455 setOperationAction(Op: ISD::FSQRT, VT: MVT::f64, Action: Expand);
456
457 if (!Subtarget.hasFSQRT() &&
458 !(Subtarget.hasFRSQRTES() && Subtarget.hasFRES()))
459 setOperationAction(Op: ISD::FSQRT, VT: MVT::f32, Action: Expand);
460
461 if (Subtarget.hasFCPSGN()) {
462 setOperationAction(Op: ISD::FCOPYSIGN, VT: MVT::f64, Action: Legal);
463 setOperationAction(Op: ISD::FCOPYSIGN, VT: MVT::f32, Action: Legal);
464 } else {
465 setOperationAction(Op: ISD::FCOPYSIGN, VT: MVT::f64, Action: Expand);
466 setOperationAction(Op: ISD::FCOPYSIGN, VT: MVT::f32, Action: Expand);
467 }
468
469 if (Subtarget.hasFPRND()) {
470 setOperationAction(Op: ISD::FFLOOR, VT: MVT::f64, Action: Legal);
471 setOperationAction(Op: ISD::FCEIL, VT: MVT::f64, Action: Legal);
472 setOperationAction(Op: ISD::FTRUNC, VT: MVT::f64, Action: Legal);
473 setOperationAction(Op: ISD::FROUND, VT: MVT::f64, Action: Legal);
474
475 setOperationAction(Op: ISD::FFLOOR, VT: MVT::f32, Action: Legal);
476 setOperationAction(Op: ISD::FCEIL, VT: MVT::f32, Action: Legal);
477 setOperationAction(Op: ISD::FTRUNC, VT: MVT::f32, Action: Legal);
478 setOperationAction(Op: ISD::FROUND, VT: MVT::f32, Action: Legal);
479 }
480
481 // Prior to P10, PowerPC does not have BSWAP, but we can use vector BSWAP
482 // instruction xxbrd to speed up scalar BSWAP64.
483 if (Subtarget.isISA3_1()) {
484 setOperationAction(Op: ISD::BSWAP, VT: MVT::i32, Action: Legal);
485 setOperationAction(Op: ISD::BSWAP, VT: MVT::i64, Action: Legal);
486 } else {
487 setOperationAction(Op: ISD::BSWAP, VT: MVT::i32, Action: Expand);
488 setOperationAction(Op: ISD::BSWAP, VT: MVT::i64,
489 Action: (Subtarget.hasP9Vector() && isPPC64) ? Custom : Expand);
490 }
491
492 // CTPOP or CTTZ were introduced in P8/P9 respectively
493 if (Subtarget.isISA3_0()) {
494 setOperationAction(Op: ISD::CTTZ , VT: MVT::i32 , Action: Legal);
495 setOperationAction(Op: ISD::CTTZ , VT: MVT::i64 , Action: Legal);
496 } else {
497 setOperationAction(Op: ISD::CTTZ , VT: MVT::i32 , Action: Expand);
498 setOperationAction(Op: ISD::CTTZ , VT: MVT::i64 , Action: Expand);
499 }
500
501 if (Subtarget.hasPOPCNTD() == PPCSubtarget::POPCNTD_Fast) {
502 setOperationAction(Op: ISD::CTPOP, VT: MVT::i32 , Action: Legal);
503 setOperationAction(Op: ISD::CTPOP, VT: MVT::i64 , Action: Legal);
504 } else {
505 setOperationAction(Op: ISD::CTPOP, VT: MVT::i32 , Action: Expand);
506 setOperationAction(Op: ISD::CTPOP, VT: MVT::i64 , Action: Expand);
507 }
508
509 // PowerPC does not have ROTR
510 setOperationAction(Op: ISD::ROTR, VT: MVT::i32 , Action: Expand);
511 setOperationAction(Op: ISD::ROTR, VT: MVT::i64 , Action: Expand);
512
513 if (!Subtarget.useCRBits()) {
514 // PowerPC does not have Select
515 setOperationAction(Op: ISD::SELECT, VT: MVT::i32, Action: Expand);
516 setOperationAction(Op: ISD::SELECT, VT: MVT::i64, Action: Expand);
517 setOperationAction(Op: ISD::SELECT, VT: MVT::f32, Action: Expand);
518 setOperationAction(Op: ISD::SELECT, VT: MVT::f64, Action: Expand);
519 }
520
521 // PowerPC wants to turn select_cc of FP into fsel when possible.
522 setOperationAction(Op: ISD::SELECT_CC, VT: MVT::f32, Action: Custom);
523 setOperationAction(Op: ISD::SELECT_CC, VT: MVT::f64, Action: Custom);
524
525 // PowerPC wants to optimize integer setcc a bit
526 if (!Subtarget.useCRBits())
527 setOperationAction(Op: ISD::SETCC, VT: MVT::i32, Action: Custom);
528
529 if (Subtarget.hasFPU()) {
530 setOperationAction(Op: ISD::STRICT_FSETCC, VT: MVT::f32, Action: Legal);
531 setOperationAction(Op: ISD::STRICT_FSETCC, VT: MVT::f64, Action: Legal);
532 setOperationAction(Op: ISD::STRICT_FSETCC, VT: MVT::f128, Action: Legal);
533
534 setOperationAction(Op: ISD::STRICT_FSETCCS, VT: MVT::f32, Action: Legal);
535 setOperationAction(Op: ISD::STRICT_FSETCCS, VT: MVT::f64, Action: Legal);
536 setOperationAction(Op: ISD::STRICT_FSETCCS, VT: MVT::f128, Action: Legal);
537 }
538
539 // PowerPC does not have BRCOND which requires SetCC
540 if (!Subtarget.useCRBits())
541 setOperationAction(Op: ISD::BRCOND, VT: MVT::Other, Action: Expand);
542
543 setOperationAction(Op: ISD::BR_JT, VT: MVT::Other, Action: Expand);
544
545 if (Subtarget.hasSPE()) {
546 // SPE has built-in conversions
547 setOperationAction(Op: ISD::STRICT_FP_TO_SINT, VT: MVT::i32, Action: Legal);
548 setOperationAction(Op: ISD::STRICT_SINT_TO_FP, VT: MVT::i32, Action: Legal);
549 setOperationAction(Op: ISD::STRICT_UINT_TO_FP, VT: MVT::i32, Action: Legal);
550 setOperationAction(Op: ISD::FP_TO_SINT, VT: MVT::i32, Action: Legal);
551 setOperationAction(Op: ISD::SINT_TO_FP, VT: MVT::i32, Action: Legal);
552 setOperationAction(Op: ISD::UINT_TO_FP, VT: MVT::i32, Action: Legal);
553
554 // SPE supports signaling compare of f32/f64.
555 setOperationAction(Op: ISD::STRICT_FSETCCS, VT: MVT::f32, Action: Legal);
556 setOperationAction(Op: ISD::STRICT_FSETCCS, VT: MVT::f64, Action: Legal);
557 } else {
558 // PowerPC turns FP_TO_SINT into FCTIWZ and some load/stores.
559 setOperationAction(Op: ISD::STRICT_FP_TO_SINT, VT: MVT::i32, Action: Custom);
560 setOperationAction(Op: ISD::FP_TO_SINT, VT: MVT::i32, Action: Custom);
561
562 // PowerPC does not have [U|S]INT_TO_FP
563 setOperationAction(Op: ISD::STRICT_SINT_TO_FP, VT: MVT::i32, Action: Expand);
564 setOperationAction(Op: ISD::STRICT_UINT_TO_FP, VT: MVT::i32, Action: Expand);
565 setOperationAction(Op: ISD::SINT_TO_FP, VT: MVT::i32, Action: Expand);
566 setOperationAction(Op: ISD::UINT_TO_FP, VT: MVT::i32, Action: Expand);
567 }
568
569 if (Subtarget.hasDirectMove() && isPPC64) {
570 setOperationAction(Op: ISD::BITCAST, VT: MVT::f32, Action: Legal);
571 setOperationAction(Op: ISD::BITCAST, VT: MVT::i32, Action: Legal);
572 setOperationAction(Op: ISD::BITCAST, VT: MVT::i64, Action: Legal);
573 setOperationAction(Op: ISD::BITCAST, VT: MVT::f64, Action: Legal);
574
575 setOperationAction(Op: ISD::STRICT_LRINT, VT: MVT::f64, Action: Custom);
576 setOperationAction(Op: ISD::STRICT_LRINT, VT: MVT::f32, Action: Custom);
577 setOperationAction(Op: ISD::STRICT_LLRINT, VT: MVT::f64, Action: Custom);
578 setOperationAction(Op: ISD::STRICT_LLRINT, VT: MVT::f32, Action: Custom);
579 setOperationAction(Op: ISD::STRICT_LROUND, VT: MVT::f64, Action: Custom);
580 setOperationAction(Op: ISD::STRICT_LROUND, VT: MVT::f32, Action: Custom);
581 setOperationAction(Op: ISD::STRICT_LLROUND, VT: MVT::f64, Action: Custom);
582 setOperationAction(Op: ISD::STRICT_LLROUND, VT: MVT::f32, Action: Custom);
583 } else {
584 setOperationAction(Op: ISD::BITCAST, VT: MVT::f32, Action: Expand);
585 setOperationAction(Op: ISD::BITCAST, VT: MVT::i32, Action: Expand);
586 setOperationAction(Op: ISD::BITCAST, VT: MVT::i64, Action: Expand);
587 setOperationAction(Op: ISD::BITCAST, VT: MVT::f64, Action: Expand);
588 }
589
590 // We cannot sextinreg(i1). Expand to shifts.
591 setOperationAction(Op: ISD::SIGN_EXTEND_INREG, VT: MVT::i1, Action: Expand);
592
593 // Custom handling for PowerPC ucmp instruction
594 setOperationAction(Op: ISD::UCMP, VT: MVT::i32, Action: Custom);
595 setOperationAction(Op: ISD::UCMP, VT: MVT::i64, Action: isPPC64 ? Custom : Expand);
596
597 // NOTE: EH_SJLJ_SETJMP/_LONGJMP supported here is NOT intended to support
598 // SjLj exception handling but a light-weight setjmp/longjmp replacement to
599 // support continuation, user-level threading, and etc.. As a result, no
600 // other SjLj exception interfaces are implemented and please don't build
601 // your own exception handling based on them.
602 // LLVM/Clang supports zero-cost DWARF exception handling.
603 setOperationAction(Op: ISD::EH_SJLJ_SETJMP, VT: MVT::i32, Action: Custom);
604 setOperationAction(Op: ISD::EH_SJLJ_LONGJMP, VT: MVT::Other, Action: Custom);
605
606 // We want to legalize GlobalAddress and ConstantPool nodes into the
607 // appropriate instructions to materialize the address.
608 setOperationAction(Op: ISD::GlobalAddress, VT: MVT::i32, Action: Custom);
609 setOperationAction(Op: ISD::GlobalTLSAddress, VT: MVT::i32, Action: Custom);
610 setOperationAction(Op: ISD::BlockAddress, VT: MVT::i32, Action: Custom);
611 setOperationAction(Op: ISD::ConstantPool, VT: MVT::i32, Action: Custom);
612 setOperationAction(Op: ISD::JumpTable, VT: MVT::i32, Action: Custom);
613 setOperationAction(Op: ISD::GlobalAddress, VT: MVT::i64, Action: Custom);
614 setOperationAction(Op: ISD::GlobalTLSAddress, VT: MVT::i64, Action: Custom);
615 setOperationAction(Op: ISD::BlockAddress, VT: MVT::i64, Action: Custom);
616 setOperationAction(Op: ISD::ConstantPool, VT: MVT::i64, Action: Custom);
617 setOperationAction(Op: ISD::JumpTable, VT: MVT::i64, Action: Custom);
618
619 // TRAP is legal.
620 setOperationAction(Op: ISD::TRAP, VT: MVT::Other, Action: Legal);
621
622 // TRAMPOLINE is custom lowered.
623 setOperationAction(Op: ISD::INIT_TRAMPOLINE, VT: MVT::Other, Action: Custom);
624 setOperationAction(Op: ISD::ADJUST_TRAMPOLINE, VT: MVT::Other, Action: Custom);
625
626 // VASTART needs to be custom lowered to use the VarArgsFrameIndex
627 setOperationAction(Op: ISD::VASTART , VT: MVT::Other, Action: Custom);
628
629 if (Subtarget.is64BitELFABI()) {
630 // VAARG always uses double-word chunks, so promote anything smaller.
631 setOperationAction(Op: ISD::VAARG, VT: MVT::i1, Action: Promote);
632 AddPromotedToType(Opc: ISD::VAARG, OrigVT: MVT::i1, DestVT: MVT::i64);
633 setOperationAction(Op: ISD::VAARG, VT: MVT::i8, Action: Promote);
634 AddPromotedToType(Opc: ISD::VAARG, OrigVT: MVT::i8, DestVT: MVT::i64);
635 setOperationAction(Op: ISD::VAARG, VT: MVT::i16, Action: Promote);
636 AddPromotedToType(Opc: ISD::VAARG, OrigVT: MVT::i16, DestVT: MVT::i64);
637 setOperationAction(Op: ISD::VAARG, VT: MVT::i32, Action: Promote);
638 AddPromotedToType(Opc: ISD::VAARG, OrigVT: MVT::i32, DestVT: MVT::i64);
639 setOperationAction(Op: ISD::VAARG, VT: MVT::Other, Action: Expand);
640 } else if (Subtarget.is32BitELFABI()) {
641 // VAARG is custom lowered with the 32-bit SVR4 ABI.
642 setOperationAction(Op: ISD::VAARG, VT: MVT::Other, Action: Custom);
643 setOperationAction(Op: ISD::VAARG, VT: MVT::i64, Action: Custom);
644 } else
645 setOperationAction(Op: ISD::VAARG, VT: MVT::Other, Action: Expand);
646
647 // VACOPY is custom lowered with the 32-bit SVR4 ABI.
648 if (Subtarget.is32BitELFABI())
649 setOperationAction(Op: ISD::VACOPY , VT: MVT::Other, Action: Custom);
650 else
651 setOperationAction(Op: ISD::VACOPY , VT: MVT::Other, Action: Expand);
652
653 // Use the default implementation.
654 setOperationAction(Op: ISD::VAEND , VT: MVT::Other, Action: Expand);
655 setOperationAction(Op: ISD::STACKSAVE , VT: MVT::Other, Action: Expand);
656 setOperationAction(Op: ISD::STACKRESTORE , VT: MVT::Other, Action: Custom);
657 setOperationAction(Op: ISD::DYNAMIC_STACKALLOC, VT: MVT::i32 , Action: Custom);
658 setOperationAction(Op: ISD::DYNAMIC_STACKALLOC, VT: MVT::i64 , Action: Custom);
659 setOperationAction(Op: ISD::GET_DYNAMIC_AREA_OFFSET, VT: MVT::i32, Action: Custom);
660 setOperationAction(Op: ISD::GET_DYNAMIC_AREA_OFFSET, VT: MVT::i64, Action: Custom);
661 setOperationAction(Op: ISD::EH_DWARF_CFA, VT: MVT::i32, Action: Custom);
662 setOperationAction(Op: ISD::EH_DWARF_CFA, VT: MVT::i64, Action: Custom);
663
664 if (Subtarget.isISA3_0() && isPPC64) {
665 setOperationAction(Op: ISD::VP_STORE, VT: MVT::v16i1, Action: Custom);
666 setOperationAction(Op: ISD::VP_STORE, VT: MVT::v8i1, Action: Custom);
667 setOperationAction(Op: ISD::VP_STORE, VT: MVT::v4i1, Action: Custom);
668 setOperationAction(Op: ISD::VP_STORE, VT: MVT::v2i1, Action: Custom);
669 setOperationAction(Op: ISD::VP_LOAD, VT: MVT::v16i1, Action: Custom);
670 setOperationAction(Op: ISD::VP_LOAD, VT: MVT::v8i1, Action: Custom);
671 setOperationAction(Op: ISD::VP_LOAD, VT: MVT::v4i1, Action: Custom);
672 setOperationAction(Op: ISD::VP_LOAD, VT: MVT::v2i1, Action: Custom);
673 }
674
675 // We want to custom lower some of our intrinsics.
676 setOperationAction(Op: ISD::INTRINSIC_WO_CHAIN, VT: MVT::Other, Action: Custom);
677 setOperationAction(Op: ISD::INTRINSIC_WO_CHAIN, VT: MVT::f64, Action: Custom);
678 setOperationAction(Op: ISD::INTRINSIC_WO_CHAIN, VT: MVT::ppcf128, Action: Custom);
679 setOperationAction(Op: ISD::INTRINSIC_WO_CHAIN, VT: MVT::v4f32, Action: Custom);
680 setOperationAction(Op: ISD::INTRINSIC_WO_CHAIN, VT: MVT::v2f64, Action: Custom);
681
682 // To handle counter-based loop conditions.
683 setOperationAction(Op: ISD::INTRINSIC_W_CHAIN, VT: MVT::i1, Action: Custom);
684 setOperationAction(Op: ISD::INTRINSIC_W_CHAIN, VT: MVT::Other, Action: Custom);
685
686 setOperationAction(Op: ISD::INTRINSIC_VOID, VT: MVT::i8, Action: Custom);
687 setOperationAction(Op: ISD::INTRINSIC_VOID, VT: MVT::i16, Action: Custom);
688 setOperationAction(Op: ISD::INTRINSIC_VOID, VT: MVT::i32, Action: Custom);
689 setOperationAction(Op: ISD::INTRINSIC_VOID, VT: MVT::Other, Action: Custom);
690
691 // Comparisons that require checking two conditions.
692 if (Subtarget.hasSPE()) {
693 setCondCodeAction(CCs: ISD::SETO, VT: MVT::f32, Action: Expand);
694 setCondCodeAction(CCs: ISD::SETO, VT: MVT::f64, Action: Expand);
695 setCondCodeAction(CCs: ISD::SETUO, VT: MVT::f32, Action: Expand);
696 setCondCodeAction(CCs: ISD::SETUO, VT: MVT::f64, Action: Expand);
697 }
698 setCondCodeAction(CCs: ISD::SETULT, VT: MVT::f32, Action: Expand);
699 setCondCodeAction(CCs: ISD::SETULT, VT: MVT::f64, Action: Expand);
700 setCondCodeAction(CCs: ISD::SETUGT, VT: MVT::f32, Action: Expand);
701 setCondCodeAction(CCs: ISD::SETUGT, VT: MVT::f64, Action: Expand);
702 setCondCodeAction(CCs: ISD::SETUEQ, VT: MVT::f32, Action: Expand);
703 setCondCodeAction(CCs: ISD::SETUEQ, VT: MVT::f64, Action: Expand);
704 setCondCodeAction(CCs: ISD::SETOGE, VT: MVT::f32, Action: Expand);
705 setCondCodeAction(CCs: ISD::SETOGE, VT: MVT::f64, Action: Expand);
706 setCondCodeAction(CCs: ISD::SETOLE, VT: MVT::f32, Action: Expand);
707 setCondCodeAction(CCs: ISD::SETOLE, VT: MVT::f64, Action: Expand);
708 setCondCodeAction(CCs: ISD::SETONE, VT: MVT::f32, Action: Expand);
709 setCondCodeAction(CCs: ISD::SETONE, VT: MVT::f64, Action: Expand);
710
711 setOperationAction(Op: ISD::STRICT_FP_EXTEND, VT: MVT::f32, Action: Legal);
712 setOperationAction(Op: ISD::STRICT_FP_EXTEND, VT: MVT::f64, Action: Legal);
713
714 if (Subtarget.has64BitSupport()) {
715 // They also have instructions for converting between i64 and fp.
716 setOperationAction(Op: ISD::STRICT_FP_TO_SINT, VT: MVT::i64, Action: Custom);
717 setOperationAction(Op: ISD::STRICT_FP_TO_UINT, VT: MVT::i64, Action: Expand);
718 setOperationAction(Op: ISD::STRICT_SINT_TO_FP, VT: MVT::i64, Action: Custom);
719 setOperationAction(Op: ISD::STRICT_UINT_TO_FP, VT: MVT::i64, Action: Expand);
720 setOperationAction(Op: ISD::FP_TO_SINT, VT: MVT::i64, Action: Custom);
721 setOperationAction(Op: ISD::FP_TO_UINT, VT: MVT::i64, Action: Expand);
722 setOperationAction(Op: ISD::SINT_TO_FP, VT: MVT::i64, Action: Custom);
723 setOperationAction(Op: ISD::UINT_TO_FP, VT: MVT::i64, Action: Expand);
724 // This is just the low 32 bits of a (signed) fp->i64 conversion.
725 // We cannot do this with Promote because i64 is not a legal type.
726 setOperationAction(Op: ISD::STRICT_FP_TO_UINT, VT: MVT::i32, Action: Custom);
727 setOperationAction(Op: ISD::FP_TO_UINT, VT: MVT::i32, Action: Custom);
728
729 if (Subtarget.hasLFIWAX() || isPPC64) {
730 setOperationAction(Op: ISD::SINT_TO_FP, VT: MVT::i32, Action: Custom);
731 setOperationAction(Op: ISD::STRICT_SINT_TO_FP, VT: MVT::i32, Action: Custom);
732 }
733 } else {
734 // PowerPC does not have FP_TO_UINT on 32-bit implementations.
735 if (Subtarget.hasSPE()) {
736 setOperationAction(Op: ISD::STRICT_FP_TO_UINT, VT: MVT::i32, Action: Legal);
737 setOperationAction(Op: ISD::FP_TO_UINT, VT: MVT::i32, Action: Legal);
738 } else {
739 setOperationAction(Op: ISD::STRICT_FP_TO_UINT, VT: MVT::i32, Action: Expand);
740 setOperationAction(Op: ISD::FP_TO_UINT, VT: MVT::i32, Action: Expand);
741 }
742 }
743
744 // With the instructions enabled under FPCVT, we can do everything.
745 if (Subtarget.hasFPCVT()) {
746 if (Subtarget.has64BitSupport()) {
747 setOperationAction(Op: ISD::STRICT_FP_TO_SINT, VT: MVT::i64, Action: Custom);
748 setOperationAction(Op: ISD::STRICT_FP_TO_UINT, VT: MVT::i64, Action: Custom);
749 setOperationAction(Op: ISD::STRICT_SINT_TO_FP, VT: MVT::i64, Action: Custom);
750 setOperationAction(Op: ISD::STRICT_UINT_TO_FP, VT: MVT::i64, Action: Custom);
751 setOperationAction(Op: ISD::FP_TO_SINT, VT: MVT::i64, Action: Custom);
752 setOperationAction(Op: ISD::FP_TO_UINT, VT: MVT::i64, Action: Custom);
753 setOperationAction(Op: ISD::SINT_TO_FP, VT: MVT::i64, Action: Custom);
754 setOperationAction(Op: ISD::UINT_TO_FP, VT: MVT::i64, Action: Custom);
755 }
756
757 setOperationAction(Op: ISD::STRICT_FP_TO_SINT, VT: MVT::i32, Action: Custom);
758 setOperationAction(Op: ISD::STRICT_FP_TO_UINT, VT: MVT::i32, Action: Custom);
759 setOperationAction(Op: ISD::STRICT_SINT_TO_FP, VT: MVT::i32, Action: Custom);
760 setOperationAction(Op: ISD::STRICT_UINT_TO_FP, VT: MVT::i32, Action: Custom);
761 setOperationAction(Op: ISD::FP_TO_SINT, VT: MVT::i32, Action: Custom);
762 setOperationAction(Op: ISD::FP_TO_UINT, VT: MVT::i32, Action: Custom);
763 setOperationAction(Op: ISD::SINT_TO_FP, VT: MVT::i32, Action: Custom);
764 setOperationAction(Op: ISD::UINT_TO_FP, VT: MVT::i32, Action: Custom);
765 }
766
767 if (Subtarget.use64BitRegs()) {
768 // 64-bit PowerPC implementations can support i64 types directly
769 addRegisterClass(VT: MVT::i64, RC: &PPC::G8RCRegClass);
770 // BUILD_PAIR can't be handled natively, and should be expanded to shl/or
771 setOperationAction(Op: ISD::BUILD_PAIR, VT: MVT::i64, Action: Expand);
772 // 64-bit PowerPC wants to expand i128 shifts itself.
773 setOperationAction(Op: ISD::SHL_PARTS, VT: MVT::i64, Action: Custom);
774 setOperationAction(Op: ISD::SRA_PARTS, VT: MVT::i64, Action: Custom);
775 setOperationAction(Op: ISD::SRL_PARTS, VT: MVT::i64, Action: Custom);
776 } else {
777 // 32-bit PowerPC wants to expand i64 shifts itself.
778 setOperationAction(Op: ISD::SHL_PARTS, VT: MVT::i32, Action: Custom);
779 setOperationAction(Op: ISD::SRA_PARTS, VT: MVT::i32, Action: Custom);
780 setOperationAction(Op: ISD::SRL_PARTS, VT: MVT::i32, Action: Custom);
781 }
782
783 // PowerPC has better expansions for funnel shifts than the generic
784 // TargetLowering::expandFunnelShift.
785 if (Subtarget.has64BitSupport()) {
786 setOperationAction(Op: ISD::FSHL, VT: MVT::i64, Action: Custom);
787 setOperationAction(Op: ISD::FSHR, VT: MVT::i64, Action: Custom);
788 }
789 setOperationAction(Op: ISD::FSHL, VT: MVT::i32, Action: Custom);
790 setOperationAction(Op: ISD::FSHR, VT: MVT::i32, Action: Custom);
791
792 if (Subtarget.hasVSX()) {
793 setOperationAction(Op: ISD::FMAXNUM_IEEE, VT: MVT::f64, Action: Legal);
794 setOperationAction(Op: ISD::FMAXNUM_IEEE, VT: MVT::f32, Action: Legal);
795 setOperationAction(Op: ISD::FMINNUM_IEEE, VT: MVT::f64, Action: Legal);
796 setOperationAction(Op: ISD::FMINNUM_IEEE, VT: MVT::f32, Action: Legal);
797 setOperationAction(Op: ISD::FMAXNUM, VT: MVT::f64, Action: Legal);
798 setOperationAction(Op: ISD::FMAXNUM, VT: MVT::f32, Action: Legal);
799 setOperationAction(Op: ISD::FMINNUM, VT: MVT::f64, Action: Legal);
800 setOperationAction(Op: ISD::FMINNUM, VT: MVT::f32, Action: Legal);
801 setOperationAction(Op: ISD::FCANONICALIZE, VT: MVT::f64, Action: Legal);
802 setOperationAction(Op: ISD::FCANONICALIZE, VT: MVT::f32, Action: Legal);
803 }
804
805 if (Subtarget.hasAltivec()) {
806 for (MVT VT : { MVT::v16i8, MVT::v8i16, MVT::v4i32 }) {
807 setOperationAction(Op: ISD::AVGCEILS, VT, Action: Legal);
808 setOperationAction(Op: ISD::AVGCEILU, VT, Action: Legal);
809 setOperationAction(Op: ISD::SADDSAT, VT, Action: Legal);
810 setOperationAction(Op: ISD::SSUBSAT, VT, Action: Legal);
811 setOperationAction(Op: ISD::UADDSAT, VT, Action: Legal);
812 setOperationAction(Op: ISD::USUBSAT, VT, Action: Legal);
813 }
814 // First set operation action for all vector types to expand. Then we
815 // will selectively turn on ones that can be effectively codegen'd.
816 for (MVT VT : MVT::fixedlen_vector_valuetypes()) {
817 // add/sub are legal for all supported vector VT's.
818 setOperationAction(Op: ISD::ADD, VT, Action: Legal);
819 setOperationAction(Op: ISD::SUB, VT, Action: Legal);
820
821 // For v2i64, these are only valid with P8Vector. This is corrected after
822 // the loop.
823 if (VT.getSizeInBits() <= 128 && VT.getScalarSizeInBits() <= 64) {
824 setOperationAction(Op: ISD::SMAX, VT, Action: Legal);
825 setOperationAction(Op: ISD::SMIN, VT, Action: Legal);
826 setOperationAction(Op: ISD::UMAX, VT, Action: Legal);
827 setOperationAction(Op: ISD::UMIN, VT, Action: Legal);
828 }
829 else {
830 setOperationAction(Op: ISD::SMAX, VT, Action: Expand);
831 setOperationAction(Op: ISD::SMIN, VT, Action: Expand);
832 setOperationAction(Op: ISD::UMAX, VT, Action: Expand);
833 setOperationAction(Op: ISD::UMIN, VT, Action: Expand);
834 }
835
836 if (Subtarget.hasVSX()) {
837 setOperationAction(Op: ISD::FMAXNUM_IEEE, VT, Action: Legal);
838 setOperationAction(Op: ISD::FMINNUM_IEEE, VT, Action: Legal);
839 setOperationAction(Op: ISD::FMAXNUM, VT, Action: Legal);
840 setOperationAction(Op: ISD::FMINNUM, VT, Action: Legal);
841 setOperationAction(Op: ISD::FCANONICALIZE, VT, Action: Legal);
842 }
843
844 // Vector instructions introduced in P8
845 if (Subtarget.hasP8Altivec() && (VT.SimpleTy != MVT::v1i128)) {
846 setOperationAction(Op: ISD::CTPOP, VT, Action: Legal);
847 setOperationAction(Op: ISD::CTLZ, VT, Action: Legal);
848 }
849 else {
850 setOperationAction(Op: ISD::CTPOP, VT, Action: Expand);
851 setOperationAction(Op: ISD::CTLZ, VT, Action: Expand);
852 }
853
854 // Vector instructions introduced in P9
855 if (Subtarget.hasP9Altivec() && (VT.SimpleTy != MVT::v1i128))
856 setOperationAction(Op: ISD::CTTZ, VT, Action: Legal);
857 else
858 setOperationAction(Op: ISD::CTTZ, VT, Action: Expand);
859
860 // We promote all shuffles to v16i8.
861 setOperationAction(Op: ISD::VECTOR_SHUFFLE, VT, Action: Promote);
862 AddPromotedToType (Opc: ISD::VECTOR_SHUFFLE, OrigVT: VT, DestVT: MVT::v16i8);
863
864 // We promote all non-typed operations to v4i32.
865 setOperationAction(Op: ISD::AND , VT, Action: Promote);
866 AddPromotedToType (Opc: ISD::AND , OrigVT: VT, DestVT: MVT::v4i32);
867 setOperationAction(Op: ISD::OR , VT, Action: Promote);
868 AddPromotedToType (Opc: ISD::OR , OrigVT: VT, DestVT: MVT::v4i32);
869 setOperationAction(Op: ISD::XOR , VT, Action: Promote);
870 AddPromotedToType (Opc: ISD::XOR , OrigVT: VT, DestVT: MVT::v4i32);
871 setOperationAction(Op: ISD::LOAD , VT, Action: Promote);
872 AddPromotedToType (Opc: ISD::LOAD , OrigVT: VT, DestVT: MVT::v4i32);
873 setOperationAction(Op: ISD::SELECT, VT, Action: Promote);
874 AddPromotedToType (Opc: ISD::SELECT, OrigVT: VT, DestVT: MVT::v4i32);
875 setOperationAction(Op: ISD::VSELECT, VT, Action: Legal);
876 setOperationAction(Op: ISD::SELECT_CC, VT, Action: Promote);
877 AddPromotedToType (Opc: ISD::SELECT_CC, OrigVT: VT, DestVT: MVT::v4i32);
878 setOperationAction(Op: ISD::STORE, VT, Action: Promote);
879 AddPromotedToType (Opc: ISD::STORE, OrigVT: VT, DestVT: MVT::v4i32);
880
881 // No other operations are legal.
882 setOperationAction(Op: ISD::MUL , VT, Action: Expand);
883 setOperationAction(Op: ISD::SDIV, VT, Action: Expand);
884 setOperationAction(Op: ISD::SREM, VT, Action: Expand);
885 setOperationAction(Op: ISD::UDIV, VT, Action: Expand);
886 setOperationAction(Op: ISD::UREM, VT, Action: Expand);
887 setOperationAction(Op: ISD::FDIV, VT, Action: Expand);
888 setOperationAction(Op: ISD::FREM, VT, Action: Expand);
889 setOperationAction(Op: ISD::FNEG, VT, Action: Expand);
890 setOperationAction(Op: ISD::FSQRT, VT, Action: Expand);
891 setOperationAction(Op: ISD::FLOG, VT, Action: Expand);
892 setOperationAction(Op: ISD::FLOG10, VT, Action: Expand);
893 setOperationAction(Op: ISD::FLOG2, VT, Action: Expand);
894 setOperationAction(Op: ISD::FEXP, VT, Action: Expand);
895 setOperationAction(Op: ISD::FEXP2, VT, Action: Expand);
896 setOperationAction(Op: ISD::FSIN, VT, Action: Expand);
897 setOperationAction(Op: ISD::FCOS, VT, Action: Expand);
898 setOperationAction(Op: ISD::FABS, VT, Action: Expand);
899 setOperationAction(Op: ISD::FFLOOR, VT, Action: Expand);
900 setOperationAction(Op: ISD::FCEIL, VT, Action: Expand);
901 setOperationAction(Op: ISD::FTRUNC, VT, Action: Expand);
902 setOperationAction(Op: ISD::FRINT, VT, Action: Expand);
903 setOperationAction(Op: ISD::FLDEXP, VT, Action: Expand);
904 setOperationAction(Op: ISD::FNEARBYINT, VT, Action: Expand);
905 setOperationAction(Op: ISD::EXTRACT_VECTOR_ELT, VT, Action: Expand);
906 setOperationAction(Op: ISD::INSERT_VECTOR_ELT, VT, Action: Expand);
907 setOperationAction(Op: ISD::BUILD_VECTOR, VT, Action: Expand);
908 setOperationAction(Op: ISD::MULHU, VT, Action: Expand);
909 setOperationAction(Op: ISD::MULHS, VT, Action: Expand);
910 setOperationAction(Op: ISD::UMUL_LOHI, VT, Action: Expand);
911 setOperationAction(Op: ISD::SMUL_LOHI, VT, Action: Expand);
912 setOperationAction(Op: ISD::UDIVREM, VT, Action: Expand);
913 setOperationAction(Op: ISD::SDIVREM, VT, Action: Expand);
914 setOperationAction(Op: ISD::SCALAR_TO_VECTOR, VT, Action: Expand);
915 setOperationAction(Op: ISD::FPOW, VT, Action: Expand);
916 setOperationAction(Op: ISD::BSWAP, VT, Action: Expand);
917 setOperationAction(Op: ISD::SIGN_EXTEND_INREG, VT, Action: Expand);
918 setOperationAction(Op: ISD::ROTL, VT, Action: Expand);
919 setOperationAction(Op: ISD::ROTR, VT, Action: Expand);
920
921 for (MVT InnerVT : MVT::fixedlen_vector_valuetypes()) {
922 setTruncStoreAction(ValVT: VT, MemVT: InnerVT, Action: Expand);
923 setLoadExtAction(ExtType: ISD::SEXTLOAD, ValVT: VT, MemVT: InnerVT, Action: Expand);
924 setLoadExtAction(ExtType: ISD::ZEXTLOAD, ValVT: VT, MemVT: InnerVT, Action: Expand);
925 setLoadExtAction(ExtType: ISD::EXTLOAD, ValVT: VT, MemVT: InnerVT, Action: Expand);
926 }
927 }
928 setOperationAction(Op: ISD::SELECT_CC, VT: MVT::v4i32, Action: Expand);
929 if (!Subtarget.hasP8Vector()) {
930 setOperationAction(Op: ISD::SMAX, VT: MVT::v2i64, Action: Expand);
931 setOperationAction(Op: ISD::SMIN, VT: MVT::v2i64, Action: Expand);
932 setOperationAction(Op: ISD::UMAX, VT: MVT::v2i64, Action: Expand);
933 setOperationAction(Op: ISD::UMIN, VT: MVT::v2i64, Action: Expand);
934 }
935
936 // We can custom expand all VECTOR_SHUFFLEs to VPERM, others we can handle
937 // with merges, splats, etc.
938 setOperationAction(Op: ISD::VECTOR_SHUFFLE, VT: MVT::v16i8, Action: Custom);
939
940 // Vector truncates to sub-word integer that fit in an Altivec/VSX register
941 // are cheap, so handle them before they get expanded to scalar.
942 setOperationAction(Op: ISD::TRUNCATE, VT: MVT::v8i8, Action: Custom);
943 setOperationAction(Op: ISD::TRUNCATE, VT: MVT::v4i8, Action: Custom);
944 setOperationAction(Op: ISD::TRUNCATE, VT: MVT::v2i8, Action: Custom);
945 setOperationAction(Op: ISD::TRUNCATE, VT: MVT::v4i16, Action: Custom);
946 setOperationAction(Op: ISD::TRUNCATE, VT: MVT::v2i16, Action: Custom);
947
948 setOperationAction(Op: ISD::AND , VT: MVT::v4i32, Action: Legal);
949 setOperationAction(Op: ISD::OR , VT: MVT::v4i32, Action: Legal);
950 setOperationAction(Op: ISD::XOR , VT: MVT::v4i32, Action: Legal);
951 setOperationAction(Op: ISD::LOAD , VT: MVT::v4i32, Action: Legal);
952 setOperationAction(Op: ISD::SELECT, VT: MVT::v4i32,
953 Action: Subtarget.useCRBits() ? Legal : Expand);
954 setOperationAction(Op: ISD::STORE , VT: MVT::v4i32, Action: Legal);
955 setOperationAction(Op: ISD::STRICT_FP_TO_SINT, VT: MVT::v4i32, Action: Legal);
956 setOperationAction(Op: ISD::STRICT_FP_TO_UINT, VT: MVT::v4i32, Action: Legal);
957 setOperationAction(Op: ISD::STRICT_SINT_TO_FP, VT: MVT::v4i32, Action: Legal);
958 setOperationAction(Op: ISD::STRICT_UINT_TO_FP, VT: MVT::v4i32, Action: Legal);
959 setOperationAction(Op: ISD::FP_TO_SINT, VT: MVT::v4i32, Action: Legal);
960 setOperationAction(Op: ISD::FP_TO_UINT, VT: MVT::v4i32, Action: Legal);
961 setOperationAction(Op: ISD::SINT_TO_FP, VT: MVT::v4i32, Action: Legal);
962 setOperationAction(Op: ISD::UINT_TO_FP, VT: MVT::v4i32, Action: Legal);
963 setOperationAction(Op: ISD::FFLOOR, VT: MVT::v4f32, Action: Legal);
964 setOperationAction(Op: ISD::FCEIL, VT: MVT::v4f32, Action: Legal);
965 setOperationAction(Op: ISD::FTRUNC, VT: MVT::v4f32, Action: Legal);
966 setOperationAction(Op: ISD::FNEARBYINT, VT: MVT::v4f32, Action: Legal);
967
968 // Custom lowering ROTL v1i128 to VECTOR_SHUFFLE v16i8.
969 setOperationAction(Op: ISD::ROTL, VT: MVT::v1i128, Action: Custom);
970 // With hasAltivec set, we can lower ISD::ROTL to vrl(b|h|w).
971 if (Subtarget.hasAltivec())
972 for (auto VT : {MVT::v4i32, MVT::v8i16, MVT::v16i8})
973 setOperationAction(Op: ISD::ROTL, VT, Action: Legal);
974 // With hasP8Altivec set, we can lower ISD::ROTL to vrld.
975 if (Subtarget.hasP8Altivec())
976 setOperationAction(Op: ISD::ROTL, VT: MVT::v2i64, Action: Legal);
977
978 addRegisterClass(VT: MVT::v4f32, RC: &PPC::VRRCRegClass);
979 addRegisterClass(VT: MVT::v4i32, RC: &PPC::VRRCRegClass);
980 addRegisterClass(VT: MVT::v8i16, RC: &PPC::VRRCRegClass);
981 addRegisterClass(VT: MVT::v16i8, RC: &PPC::VRRCRegClass);
982
983 setOperationAction(Op: ISD::MUL, VT: MVT::v4f32, Action: Legal);
984 setOperationAction(Op: ISD::FMA, VT: MVT::v4f32, Action: Legal);
985
986 if (Subtarget.hasVSX()) {
987 setOperationAction(Op: ISD::FDIV, VT: MVT::v4f32, Action: Legal);
988 setOperationAction(Op: ISD::FSQRT, VT: MVT::v4f32, Action: Legal);
989 setOperationAction(Op: ISD::INSERT_VECTOR_ELT, VT: MVT::v2f64, Action: Custom);
990 }
991
992 if (Subtarget.hasP8Altivec())
993 setOperationAction(Op: ISD::MUL, VT: MVT::v4i32, Action: Legal);
994 else
995 setOperationAction(Op: ISD::MUL, VT: MVT::v4i32, Action: Custom);
996
997 if (Subtarget.isISA3_1()) {
998 setOperationAction(Op: ISD::MUL, VT: MVT::v2i64, Action: Legal);
999 setOperationAction(Op: ISD::MULHS, VT: MVT::v2i64, Action: Legal);
1000 setOperationAction(Op: ISD::MULHU, VT: MVT::v2i64, Action: Legal);
1001 setOperationAction(Op: ISD::MULHS, VT: MVT::v4i32, Action: Legal);
1002 setOperationAction(Op: ISD::MULHU, VT: MVT::v4i32, Action: Legal);
1003 setOperationAction(Op: ISD::UDIV, VT: MVT::v2i64, Action: Legal);
1004 setOperationAction(Op: ISD::SDIV, VT: MVT::v2i64, Action: Legal);
1005 setOperationAction(Op: ISD::UDIV, VT: MVT::v4i32, Action: Legal);
1006 setOperationAction(Op: ISD::SDIV, VT: MVT::v4i32, Action: Legal);
1007 setOperationAction(Op: ISD::UREM, VT: MVT::v2i64, Action: Legal);
1008 setOperationAction(Op: ISD::SREM, VT: MVT::v2i64, Action: Legal);
1009 setOperationAction(Op: ISD::UREM, VT: MVT::v4i32, Action: Legal);
1010 setOperationAction(Op: ISD::SREM, VT: MVT::v4i32, Action: Legal);
1011 setOperationAction(Op: ISD::UREM, VT: MVT::v1i128, Action: Legal);
1012 setOperationAction(Op: ISD::SREM, VT: MVT::v1i128, Action: Legal);
1013 setOperationAction(Op: ISD::UDIV, VT: MVT::v1i128, Action: Legal);
1014 setOperationAction(Op: ISD::SDIV, VT: MVT::v1i128, Action: Legal);
1015 setOperationAction(Op: ISD::ROTL, VT: MVT::v1i128, Action: Legal);
1016 }
1017
1018 setOperationAction(Op: ISD::MUL, VT: MVT::v8i16, Action: Legal);
1019 setOperationAction(Op: ISD::MUL, VT: MVT::v16i8, Action: Custom);
1020
1021 setOperationAction(Op: ISD::SCALAR_TO_VECTOR, VT: MVT::v4f32, Action: Custom);
1022 setOperationAction(Op: ISD::SCALAR_TO_VECTOR, VT: MVT::v4i32, Action: Custom);
1023 // LE is P8+/64-bit so direct moves are supported and these operations
1024 // are legal. The custom transformation requires 64-bit since we need a
1025 // pair of stores that will cover a 128-bit load for P10.
1026 if (!DisableP10StoreForward && isPPC64 && !Subtarget.isLittleEndian()) {
1027 setOperationAction(Op: ISD::SCALAR_TO_VECTOR, VT: MVT::v2i64, Action: Custom);
1028 setOperationAction(Op: ISD::SCALAR_TO_VECTOR, VT: MVT::v8i16, Action: Custom);
1029 setOperationAction(Op: ISD::SCALAR_TO_VECTOR, VT: MVT::v16i8, Action: Custom);
1030 }
1031
1032 setOperationAction(Op: ISD::BUILD_VECTOR, VT: MVT::v16i8, Action: Custom);
1033 setOperationAction(Op: ISD::BUILD_VECTOR, VT: MVT::v8i16, Action: Custom);
1034 setOperationAction(Op: ISD::BUILD_VECTOR, VT: MVT::v4i32, Action: Custom);
1035 setOperationAction(Op: ISD::BUILD_VECTOR, VT: MVT::v4f32, Action: Custom);
1036
1037 // Altivec does not contain unordered floating-point compare instructions
1038 setCondCodeAction(CCs: ISD::SETUO, VT: MVT::v4f32, Action: Expand);
1039 setCondCodeAction(CCs: ISD::SETUEQ, VT: MVT::v4f32, Action: Expand);
1040 setCondCodeAction(CCs: ISD::SETO, VT: MVT::v4f32, Action: Expand);
1041 setCondCodeAction(CCs: ISD::SETONE, VT: MVT::v4f32, Action: Expand);
1042
1043 if (Subtarget.hasVSX()) {
1044 setOperationAction(Op: ISD::SCALAR_TO_VECTOR, VT: MVT::v2f64, Action: Legal);
1045 setOperationAction(Op: ISD::EXTRACT_VECTOR_ELT, VT: MVT::v2f64, Action: Legal);
1046 if (Subtarget.hasP8Vector()) {
1047 setOperationAction(Op: ISD::SCALAR_TO_VECTOR, VT: MVT::v4f32, Action: Legal);
1048 setOperationAction(Op: ISD::EXTRACT_VECTOR_ELT, VT: MVT::v4f32, Action: Legal);
1049 }
1050 if (Subtarget.hasDirectMove() && isPPC64) {
1051 setOperationAction(Op: ISD::SCALAR_TO_VECTOR, VT: MVT::v16i8, Action: Legal);
1052 setOperationAction(Op: ISD::SCALAR_TO_VECTOR, VT: MVT::v8i16, Action: Legal);
1053 setOperationAction(Op: ISD::SCALAR_TO_VECTOR, VT: MVT::v4i32, Action: Legal);
1054 setOperationAction(Op: ISD::SCALAR_TO_VECTOR, VT: MVT::v2i64, Action: Legal);
1055 setOperationAction(Op: ISD::EXTRACT_VECTOR_ELT, VT: MVT::v16i8, Action: Legal);
1056 setOperationAction(Op: ISD::EXTRACT_VECTOR_ELT, VT: MVT::v8i16, Action: Legal);
1057 setOperationAction(Op: ISD::EXTRACT_VECTOR_ELT, VT: MVT::v4i32, Action: Legal);
1058 setOperationAction(Op: ISD::EXTRACT_VECTOR_ELT, VT: MVT::v2i64, Action: Legal);
1059 }
1060 setOperationAction(Op: ISD::EXTRACT_VECTOR_ELT, VT: MVT::v2f64, Action: Legal);
1061
1062 // The nearbyint variants are not allowed to raise the inexact exception
1063 // so we can only code-gen them with fpexcept.ignore.
1064 setOperationAction(Op: ISD::STRICT_FNEARBYINT, VT: MVT::f64, Action: Custom);
1065 setOperationAction(Op: ISD::STRICT_FNEARBYINT, VT: MVT::f32, Action: Custom);
1066 setOperationAction(Op: ISD::STRICT_FNEARBYINT, VT: MVT::v2f64, Action: Custom);
1067 setOperationAction(Op: ISD::STRICT_FNEARBYINT, VT: MVT::v4f32, Action: Custom);
1068
1069 setOperationAction(Op: ISD::FFLOOR, VT: MVT::v2f64, Action: Legal);
1070 setOperationAction(Op: ISD::FCEIL, VT: MVT::v2f64, Action: Legal);
1071 setOperationAction(Op: ISD::FTRUNC, VT: MVT::v2f64, Action: Legal);
1072 setOperationAction(Op: ISD::FRINT, VT: MVT::v2f64, Action: Legal);
1073 setOperationAction(Op: ISD::FROUND, VT: MVT::v2f64, Action: Legal);
1074 setOperationAction(Op: ISD::FROUND, VT: MVT::f64, Action: Legal);
1075 setOperationAction(Op: ISD::FRINT, VT: MVT::f64, Action: Legal);
1076
1077 setOperationAction(Op: ISD::FRINT, VT: MVT::v4f32, Action: Legal);
1078 setOperationAction(Op: ISD::FROUND, VT: MVT::v4f32, Action: Legal);
1079 setOperationAction(Op: ISD::FROUND, VT: MVT::f32, Action: Legal);
1080 setOperationAction(Op: ISD::FRINT, VT: MVT::f32, Action: Legal);
1081
1082 setOperationAction(Op: ISD::MUL, VT: MVT::v2f64, Action: Legal);
1083 setOperationAction(Op: ISD::FMA, VT: MVT::v2f64, Action: Legal);
1084
1085 setOperationAction(Op: ISD::FDIV, VT: MVT::v2f64, Action: Legal);
1086 setOperationAction(Op: ISD::FSQRT, VT: MVT::v2f64, Action: Legal);
1087
1088 // Share the Altivec comparison restrictions.
1089 setCondCodeAction(CCs: ISD::SETUO, VT: MVT::v2f64, Action: Expand);
1090 setCondCodeAction(CCs: ISD::SETUEQ, VT: MVT::v2f64, Action: Expand);
1091 setCondCodeAction(CCs: ISD::SETO, VT: MVT::v2f64, Action: Expand);
1092 setCondCodeAction(CCs: ISD::SETONE, VT: MVT::v2f64, Action: Expand);
1093
1094 setOperationAction(Op: ISD::LOAD, VT: MVT::v2f64, Action: Legal);
1095 setOperationAction(Op: ISD::STORE, VT: MVT::v2f64, Action: Legal);
1096
1097 setOperationAction(Op: ISD::VECTOR_SHUFFLE, VT: MVT::v2f64, Action: Custom);
1098
1099 if (Subtarget.hasP8Vector())
1100 addRegisterClass(VT: MVT::f32, RC: &PPC::VSSRCRegClass);
1101
1102 addRegisterClass(VT: MVT::f64, RC: &PPC::VSFRCRegClass);
1103
1104 addRegisterClass(VT: MVT::v4i32, RC: &PPC::VSRCRegClass);
1105 addRegisterClass(VT: MVT::v4f32, RC: &PPC::VSRCRegClass);
1106 addRegisterClass(VT: MVT::v2f64, RC: &PPC::VSRCRegClass);
1107
1108 if (Subtarget.hasP8Altivec()) {
1109 setOperationAction(Op: ISD::SHL, VT: MVT::v2i64, Action: Legal);
1110 setOperationAction(Op: ISD::SRA, VT: MVT::v2i64, Action: Legal);
1111 setOperationAction(Op: ISD::SRL, VT: MVT::v2i64, Action: Legal);
1112
1113 // 128-bit shifts can be accomplished via 3 instructions for SHL and
1114 // SRL, but not for SRA, because of the instructions available:
1115 // VS{RL} and VS{RL}O. However, due to direct-move costs, it is not
1116 // worth doing.
1117 setOperationAction(Op: ISD::SHL, VT: MVT::v1i128, Action: Expand);
1118 setOperationAction(Op: ISD::SRL, VT: MVT::v1i128, Action: Expand);
1119 setOperationAction(Op: ISD::SRA, VT: MVT::v1i128, Action: Expand);
1120
1121 setOperationAction(Op: ISD::SETCC, VT: MVT::v2i64, Action: Legal);
1122 }
1123 else {
1124 setOperationAction(Op: ISD::SHL, VT: MVT::v2i64, Action: Expand);
1125 setOperationAction(Op: ISD::SRA, VT: MVT::v2i64, Action: Expand);
1126 setOperationAction(Op: ISD::SRL, VT: MVT::v2i64, Action: Expand);
1127
1128 setOperationAction(Op: ISD::SETCC, VT: MVT::v2i64, Action: Custom);
1129
1130 // VSX v2i64 only supports non-arithmetic operations.
1131 setOperationAction(Op: ISD::ADD, VT: MVT::v2i64, Action: Expand);
1132 setOperationAction(Op: ISD::SUB, VT: MVT::v2i64, Action: Expand);
1133 }
1134
1135 if (Subtarget.isISA3_1())
1136 setOperationAction(Op: ISD::SETCC, VT: MVT::v1i128, Action: Legal);
1137 else
1138 setOperationAction(Op: ISD::SETCC, VT: MVT::v1i128, Action: Expand);
1139
1140 setOperationAction(Op: ISD::LOAD, VT: MVT::v2i64, Action: Promote);
1141 AddPromotedToType (Opc: ISD::LOAD, OrigVT: MVT::v2i64, DestVT: MVT::v2f64);
1142 setOperationAction(Op: ISD::STORE, VT: MVT::v2i64, Action: Promote);
1143 AddPromotedToType (Opc: ISD::STORE, OrigVT: MVT::v2i64, DestVT: MVT::v2f64);
1144
1145 setOperationAction(Op: ISD::VECTOR_SHUFFLE, VT: MVT::v2i64, Action: Custom);
1146
1147 setOperationAction(Op: ISD::STRICT_SINT_TO_FP, VT: MVT::v2i64, Action: Legal);
1148 setOperationAction(Op: ISD::STRICT_UINT_TO_FP, VT: MVT::v2i64, Action: Legal);
1149 setOperationAction(Op: ISD::STRICT_FP_TO_SINT, VT: MVT::v2i64, Action: Legal);
1150 setOperationAction(Op: ISD::STRICT_FP_TO_UINT, VT: MVT::v2i64, Action: Legal);
1151 setOperationAction(Op: ISD::SINT_TO_FP, VT: MVT::v2i64, Action: Legal);
1152 setOperationAction(Op: ISD::UINT_TO_FP, VT: MVT::v2i64, Action: Legal);
1153 setOperationAction(Op: ISD::FP_TO_SINT, VT: MVT::v2i64, Action: Legal);
1154 setOperationAction(Op: ISD::FP_TO_UINT, VT: MVT::v2i64, Action: Legal);
1155
1156 // Custom handling for partial vectors of integers converted to
1157 // floating point. We already have optimal handling for v2i32 through
1158 // the DAG combine, so those aren't necessary.
1159 setOperationAction(Op: ISD::STRICT_UINT_TO_FP, VT: MVT::v2i8, Action: Custom);
1160 setOperationAction(Op: ISD::STRICT_UINT_TO_FP, VT: MVT::v4i8, Action: Custom);
1161 setOperationAction(Op: ISD::STRICT_UINT_TO_FP, VT: MVT::v2i16, Action: Custom);
1162 setOperationAction(Op: ISD::STRICT_UINT_TO_FP, VT: MVT::v4i16, Action: Custom);
1163 setOperationAction(Op: ISD::STRICT_SINT_TO_FP, VT: MVT::v2i8, Action: Custom);
1164 setOperationAction(Op: ISD::STRICT_SINT_TO_FP, VT: MVT::v4i8, Action: Custom);
1165 setOperationAction(Op: ISD::STRICT_SINT_TO_FP, VT: MVT::v2i16, Action: Custom);
1166 setOperationAction(Op: ISD::STRICT_SINT_TO_FP, VT: MVT::v4i16, Action: Custom);
1167 setOperationAction(Op: ISD::UINT_TO_FP, VT: MVT::v2i8, Action: Custom);
1168 setOperationAction(Op: ISD::UINT_TO_FP, VT: MVT::v4i8, Action: Custom);
1169 setOperationAction(Op: ISD::UINT_TO_FP, VT: MVT::v2i16, Action: Custom);
1170 setOperationAction(Op: ISD::UINT_TO_FP, VT: MVT::v4i16, Action: Custom);
1171 setOperationAction(Op: ISD::SINT_TO_FP, VT: MVT::v2i8, Action: Custom);
1172 setOperationAction(Op: ISD::SINT_TO_FP, VT: MVT::v4i8, Action: Custom);
1173 setOperationAction(Op: ISD::SINT_TO_FP, VT: MVT::v2i16, Action: Custom);
1174 setOperationAction(Op: ISD::SINT_TO_FP, VT: MVT::v4i16, Action: Custom);
1175
1176 setOperationAction(Op: ISD::FNEG, VT: MVT::v4f32, Action: Legal);
1177 setOperationAction(Op: ISD::FNEG, VT: MVT::v2f64, Action: Legal);
1178 setOperationAction(Op: ISD::FABS, VT: MVT::v4f32, Action: Legal);
1179 setOperationAction(Op: ISD::FABS, VT: MVT::v2f64, Action: Legal);
1180 setOperationAction(Op: ISD::FCOPYSIGN, VT: MVT::v4f32, Action: Legal);
1181 setOperationAction(Op: ISD::FCOPYSIGN, VT: MVT::v2f64, Action: Legal);
1182
1183 setOperationAction(Op: ISD::BUILD_VECTOR, VT: MVT::v2i64, Action: Custom);
1184 setOperationAction(Op: ISD::BUILD_VECTOR, VT: MVT::v2f64, Action: Custom);
1185
1186 // Handle constrained floating-point operations of vector.
1187 // The predicate is `hasVSX` because Altivec instructions do not raise
1188 // floating-point exceptions, but VSX vector instructions do.
1189 setOperationAction(Op: ISD::STRICT_FADD, VT: MVT::v4f32, Action: Legal);
1190 setOperationAction(Op: ISD::STRICT_FSUB, VT: MVT::v4f32, Action: Legal);
1191 setOperationAction(Op: ISD::STRICT_FMUL, VT: MVT::v4f32, Action: Legal);
1192 setOperationAction(Op: ISD::STRICT_FDIV, VT: MVT::v4f32, Action: Legal);
1193 setOperationAction(Op: ISD::STRICT_FMA, VT: MVT::v4f32, Action: Legal);
1194 setOperationAction(Op: ISD::STRICT_FSQRT, VT: MVT::v4f32, Action: Legal);
1195 setOperationAction(Op: ISD::STRICT_FMAXNUM, VT: MVT::v4f32, Action: Legal);
1196 setOperationAction(Op: ISD::STRICT_FMINNUM, VT: MVT::v4f32, Action: Legal);
1197 setOperationAction(Op: ISD::STRICT_FRINT, VT: MVT::v4f32, Action: Legal);
1198 setOperationAction(Op: ISD::STRICT_FFLOOR, VT: MVT::v4f32, Action: Legal);
1199 setOperationAction(Op: ISD::STRICT_FCEIL, VT: MVT::v4f32, Action: Legal);
1200 setOperationAction(Op: ISD::STRICT_FTRUNC, VT: MVT::v4f32, Action: Legal);
1201 setOperationAction(Op: ISD::STRICT_FROUND, VT: MVT::v4f32, Action: Legal);
1202
1203 setOperationAction(Op: ISD::STRICT_FADD, VT: MVT::v2f64, Action: Legal);
1204 setOperationAction(Op: ISD::STRICT_FSUB, VT: MVT::v2f64, Action: Legal);
1205 setOperationAction(Op: ISD::STRICT_FMUL, VT: MVT::v2f64, Action: Legal);
1206 setOperationAction(Op: ISD::STRICT_FDIV, VT: MVT::v2f64, Action: Legal);
1207 setOperationAction(Op: ISD::STRICT_FMA, VT: MVT::v2f64, Action: Legal);
1208 setOperationAction(Op: ISD::STRICT_FSQRT, VT: MVT::v2f64, Action: Legal);
1209 setOperationAction(Op: ISD::STRICT_FMAXNUM, VT: MVT::v2f64, Action: Legal);
1210 setOperationAction(Op: ISD::STRICT_FMINNUM, VT: MVT::v2f64, Action: Legal);
1211 setOperationAction(Op: ISD::STRICT_FRINT, VT: MVT::v2f64, Action: Legal);
1212 setOperationAction(Op: ISD::STRICT_FFLOOR, VT: MVT::v2f64, Action: Legal);
1213 setOperationAction(Op: ISD::STRICT_FCEIL, VT: MVT::v2f64, Action: Legal);
1214 setOperationAction(Op: ISD::STRICT_FTRUNC, VT: MVT::v2f64, Action: Legal);
1215 setOperationAction(Op: ISD::STRICT_FROUND, VT: MVT::v2f64, Action: Legal);
1216
1217 addRegisterClass(VT: MVT::v2i64, RC: &PPC::VSRCRegClass);
1218 addRegisterClass(VT: MVT::f128, RC: &PPC::VRRCRegClass);
1219
1220 for (MVT FPT : MVT::fp_valuetypes())
1221 setLoadExtAction(ExtType: ISD::EXTLOAD, ValVT: MVT::f128, MemVT: FPT, Action: Expand);
1222
1223 // Expand the SELECT to SELECT_CC
1224 setOperationAction(Op: ISD::SELECT, VT: MVT::f128, Action: Expand);
1225
1226 setTruncStoreAction(ValVT: MVT::f128, MemVT: MVT::f64, Action: Expand);
1227 setTruncStoreAction(ValVT: MVT::f128, MemVT: MVT::f32, Action: Expand);
1228
1229 // No implementation for these ops for PowerPC.
1230 setOperationAction(Op: ISD::FSINCOS, VT: MVT::f128, Action: Expand);
1231 setOperationAction(Op: ISD::FSIN, VT: MVT::f128, Action: Expand);
1232 setOperationAction(Op: ISD::FCOS, VT: MVT::f128, Action: Expand);
1233 setOperationAction(Op: ISD::FPOW, VT: MVT::f128, Action: Expand);
1234 setOperationAction(Op: ISD::FPOWI, VT: MVT::f128, Action: Expand);
1235 setOperationAction(Op: ISD::FREM, VT: MVT::f128, Action: LibCall);
1236 }
1237
1238 if (Subtarget.hasP8Altivec()) {
1239 addRegisterClass(VT: MVT::v2i64, RC: &PPC::VRRCRegClass);
1240 addRegisterClass(VT: MVT::v1i128, RC: &PPC::VRRCRegClass);
1241 }
1242
1243 if (Subtarget.hasP9Vector()) {
1244 setOperationAction(Op: ISD::INSERT_VECTOR_ELT, VT: MVT::v4i32, Action: Custom);
1245 setOperationAction(Op: ISD::INSERT_VECTOR_ELT, VT: MVT::v4f32, Action: Custom);
1246
1247 // Test data class instructions store results in CR bits.
1248 if (Subtarget.useCRBits()) {
1249 setOperationAction(Op: ISD::IS_FPCLASS, VT: MVT::f32, Action: Custom);
1250 setOperationAction(Op: ISD::IS_FPCLASS, VT: MVT::f64, Action: Custom);
1251 setOperationAction(Op: ISD::IS_FPCLASS, VT: MVT::f128, Action: Custom);
1252 setOperationAction(Op: ISD::IS_FPCLASS, VT: MVT::ppcf128, Action: Custom);
1253 }
1254
1255 // 128 bit shifts can be accomplished via 3 instructions for SHL and
1256 // SRL, but not for SRA because of the instructions available:
1257 // VS{RL} and VS{RL}O.
1258 setOperationAction(Op: ISD::SHL, VT: MVT::v1i128, Action: Legal);
1259 setOperationAction(Op: ISD::SRL, VT: MVT::v1i128, Action: Legal);
1260 setOperationAction(Op: ISD::SRA, VT: MVT::v1i128, Action: Expand);
1261
1262 setOperationAction(Op: ISD::FADD, VT: MVT::f128, Action: Legal);
1263 setOperationAction(Op: ISD::FSUB, VT: MVT::f128, Action: Legal);
1264 setOperationAction(Op: ISD::FDIV, VT: MVT::f128, Action: Legal);
1265 setOperationAction(Op: ISD::FMUL, VT: MVT::f128, Action: Legal);
1266 setOperationAction(Op: ISD::FP_EXTEND, VT: MVT::f128, Action: Legal);
1267
1268 setOperationAction(Op: ISD::FMA, VT: MVT::f128, Action: Legal);
1269 setCondCodeAction(CCs: ISD::SETULT, VT: MVT::f128, Action: Expand);
1270 setCondCodeAction(CCs: ISD::SETUGT, VT: MVT::f128, Action: Expand);
1271 setCondCodeAction(CCs: ISD::SETUEQ, VT: MVT::f128, Action: Expand);
1272 setCondCodeAction(CCs: ISD::SETOGE, VT: MVT::f128, Action: Expand);
1273 setCondCodeAction(CCs: ISD::SETOLE, VT: MVT::f128, Action: Expand);
1274 setCondCodeAction(CCs: ISD::SETONE, VT: MVT::f128, Action: Expand);
1275
1276 setOperationAction(Op: ISD::FTRUNC, VT: MVT::f128, Action: Legal);
1277 setOperationAction(Op: ISD::FRINT, VT: MVT::f128, Action: Legal);
1278 setOperationAction(Op: ISD::FFLOOR, VT: MVT::f128, Action: Legal);
1279 setOperationAction(Op: ISD::FCEIL, VT: MVT::f128, Action: Legal);
1280 setOperationAction(Op: ISD::FNEARBYINT, VT: MVT::f128, Action: Legal);
1281 setOperationAction(Op: ISD::FROUND, VT: MVT::f128, Action: Legal);
1282
1283 setOperationAction(Op: ISD::FP_ROUND, VT: MVT::f64, Action: Legal);
1284 setOperationAction(Op: ISD::FP_ROUND, VT: MVT::f32, Action: Legal);
1285 setOperationAction(Op: ISD::BITCAST, VT: MVT::i128, Action: Custom);
1286
1287 // Handle constrained floating-point operations of fp128
1288 setOperationAction(Op: ISD::STRICT_FADD, VT: MVT::f128, Action: Legal);
1289 setOperationAction(Op: ISD::STRICT_FSUB, VT: MVT::f128, Action: Legal);
1290 setOperationAction(Op: ISD::STRICT_FMUL, VT: MVT::f128, Action: Legal);
1291 setOperationAction(Op: ISD::STRICT_FDIV, VT: MVT::f128, Action: Legal);
1292 setOperationAction(Op: ISD::STRICT_FMA, VT: MVT::f128, Action: Legal);
1293 setOperationAction(Op: ISD::STRICT_FSQRT, VT: MVT::f128, Action: Legal);
1294 setOperationAction(Op: ISD::STRICT_FP_EXTEND, VT: MVT::f128, Action: Legal);
1295 setOperationAction(Op: ISD::STRICT_FP_ROUND, VT: MVT::f64, Action: Legal);
1296 setOperationAction(Op: ISD::STRICT_FP_ROUND, VT: MVT::f32, Action: Legal);
1297 setOperationAction(Op: ISD::STRICT_FRINT, VT: MVT::f128, Action: Legal);
1298 setOperationAction(Op: ISD::STRICT_FNEARBYINT, VT: MVT::f128, Action: Legal);
1299 setOperationAction(Op: ISD::STRICT_FFLOOR, VT: MVT::f128, Action: Legal);
1300 setOperationAction(Op: ISD::STRICT_FCEIL, VT: MVT::f128, Action: Legal);
1301 setOperationAction(Op: ISD::STRICT_FTRUNC, VT: MVT::f128, Action: Legal);
1302 setOperationAction(Op: ISD::STRICT_FROUND, VT: MVT::f128, Action: Legal);
1303 setOperationAction(Op: ISD::FP_EXTEND, VT: MVT::v2f32, Action: Custom);
1304 setOperationAction(Op: ISD::BSWAP, VT: MVT::v8i16, Action: Legal);
1305 setOperationAction(Op: ISD::BSWAP, VT: MVT::v4i32, Action: Legal);
1306 setOperationAction(Op: ISD::BSWAP, VT: MVT::v2i64, Action: Legal);
1307 setOperationAction(Op: ISD::BSWAP, VT: MVT::v1i128, Action: Legal);
1308 } else if (Subtarget.hasVSX()) {
1309 setOperationAction(Op: ISD::LOAD, VT: MVT::f128, Action: Promote);
1310 setOperationAction(Op: ISD::STORE, VT: MVT::f128, Action: Promote);
1311
1312 AddPromotedToType(Opc: ISD::LOAD, OrigVT: MVT::f128, DestVT: MVT::v4i32);
1313 AddPromotedToType(Opc: ISD::STORE, OrigVT: MVT::f128, DestVT: MVT::v4i32);
1314
1315 // Set FADD/FSUB as libcall to avoid the legalizer to expand the
1316 // fp_to_uint and int_to_fp.
1317 setOperationAction(Op: ISD::FADD, VT: MVT::f128, Action: LibCall);
1318 setOperationAction(Op: ISD::FSUB, VT: MVT::f128, Action: LibCall);
1319
1320 setOperationAction(Op: ISD::FMUL, VT: MVT::f128, Action: Expand);
1321 setOperationAction(Op: ISD::FDIV, VT: MVT::f128, Action: Expand);
1322 setOperationAction(Op: ISD::FNEG, VT: MVT::f128, Action: Expand);
1323 setOperationAction(Op: ISD::FABS, VT: MVT::f128, Action: Expand);
1324 setOperationAction(Op: ISD::FSQRT, VT: MVT::f128, Action: Expand);
1325 setOperationAction(Op: ISD::FMA, VT: MVT::f128, Action: Expand);
1326 setOperationAction(Op: ISD::FCOPYSIGN, VT: MVT::f128, Action: Expand);
1327
1328 // Expand the fp_extend if the target type is fp128.
1329 setOperationAction(Op: ISD::FP_EXTEND, VT: MVT::f128, Action: Expand);
1330 setOperationAction(Op: ISD::STRICT_FP_EXTEND, VT: MVT::f128, Action: Expand);
1331
1332 // Expand the fp_round if the source type is fp128.
1333 for (MVT VT : {MVT::f32, MVT::f64}) {
1334 setOperationAction(Op: ISD::FP_ROUND, VT, Action: Custom);
1335 setOperationAction(Op: ISD::STRICT_FP_ROUND, VT, Action: Custom);
1336 }
1337
1338 setOperationAction(Op: ISD::SETCC, VT: MVT::f128, Action: Custom);
1339 setOperationAction(Op: ISD::STRICT_FSETCC, VT: MVT::f128, Action: Custom);
1340 setOperationAction(Op: ISD::STRICT_FSETCCS, VT: MVT::f128, Action: Custom);
1341 setOperationAction(Op: ISD::BR_CC, VT: MVT::f128, Action: Expand);
1342
1343 // Lower following f128 select_cc pattern:
1344 // select_cc x, y, tv, fv, cc -> select_cc (setcc x, y, cc), 0, tv, fv, NE
1345 setOperationAction(Op: ISD::SELECT_CC, VT: MVT::f128, Action: Custom);
1346
1347 // We need to handle f128 SELECT_CC with integer result type.
1348 setOperationAction(Op: ISD::SELECT_CC, VT: MVT::i32, Action: Custom);
1349 setOperationAction(Op: ISD::SELECT_CC, VT: MVT::i64, Action: isPPC64 ? Custom : Expand);
1350 }
1351
1352 if (Subtarget.hasP9Altivec()) {
1353 if (Subtarget.isISA3_1()) {
1354 setOperationAction(Op: ISD::INSERT_VECTOR_ELT, VT: MVT::v2i64, Action: Legal);
1355 setOperationAction(Op: ISD::INSERT_VECTOR_ELT, VT: MVT::v8i16, Action: Legal);
1356 setOperationAction(Op: ISD::INSERT_VECTOR_ELT, VT: MVT::v16i8, Action: Legal);
1357 setOperationAction(Op: ISD::INSERT_VECTOR_ELT, VT: MVT::v4i32, Action: Legal);
1358 } else {
1359 setOperationAction(Op: ISD::INSERT_VECTOR_ELT, VT: MVT::v8i16, Action: Custom);
1360 setOperationAction(Op: ISD::INSERT_VECTOR_ELT, VT: MVT::v16i8, Action: Custom);
1361 }
1362 setOperationAction(Op: ISD::SIGN_EXTEND_INREG, VT: MVT::v4i8, Action: Legal);
1363 setOperationAction(Op: ISD::SIGN_EXTEND_INREG, VT: MVT::v4i16, Action: Legal);
1364 setOperationAction(Op: ISD::SIGN_EXTEND_INREG, VT: MVT::v4i32, Action: Legal);
1365 setOperationAction(Op: ISD::SIGN_EXTEND_INREG, VT: MVT::v2i8, Action: Legal);
1366 setOperationAction(Op: ISD::SIGN_EXTEND_INREG, VT: MVT::v2i16, Action: Legal);
1367 setOperationAction(Op: ISD::SIGN_EXTEND_INREG, VT: MVT::v2i32, Action: Legal);
1368 setOperationAction(Op: ISD::SIGN_EXTEND_INREG, VT: MVT::v2i64, Action: Legal);
1369
1370 setOperationAction(Op: ISD::ABDU, VT: MVT::v16i8, Action: Legal);
1371 setOperationAction(Op: ISD::ABDU, VT: MVT::v8i16, Action: Legal);
1372 setOperationAction(Op: ISD::ABDU, VT: MVT::v4i32, Action: Legal);
1373 setOperationAction(Op: ISD::ABDS, VT: MVT::v4i32, Action: Legal);
1374 }
1375
1376 if (Subtarget.hasP10Vector()) {
1377 setOperationAction(Op: ISD::SELECT_CC, VT: MVT::f128, Action: Custom);
1378 }
1379 }
1380
1381 if (Subtarget.pairedVectorMemops()) {
1382 addRegisterClass(VT: MVT::v256i1, RC: &PPC::VSRpRCRegClass);
1383 setOperationAction(Op: ISD::LOAD, VT: MVT::v256i1, Action: Custom);
1384 setOperationAction(Op: ISD::STORE, VT: MVT::v256i1, Action: Custom);
1385 }
1386 if (Subtarget.hasMMA()) {
1387 if (Subtarget.isISAFuture()) {
1388 addRegisterClass(VT: MVT::v512i1, RC: &PPC::WACCRCRegClass);
1389 addRegisterClass(VT: MVT::v1024i1, RC: &PPC::DMRRCRegClass);
1390 addRegisterClass(VT: MVT::v2048i1, RC: &PPC::DMRpRCRegClass);
1391 setOperationAction(Op: ISD::LOAD, VT: MVT::v1024i1, Action: Custom);
1392 setOperationAction(Op: ISD::STORE, VT: MVT::v1024i1, Action: Custom);
1393 setOperationAction(Op: ISD::LOAD, VT: MVT::v2048i1, Action: Custom);
1394 setOperationAction(Op: ISD::STORE, VT: MVT::v2048i1, Action: Custom);
1395 } else {
1396 addRegisterClass(VT: MVT::v512i1, RC: &PPC::UACCRCRegClass);
1397 }
1398 setOperationAction(Op: ISD::LOAD, VT: MVT::v512i1, Action: Custom);
1399 setOperationAction(Op: ISD::STORE, VT: MVT::v512i1, Action: Custom);
1400 setOperationAction(Op: ISD::BUILD_VECTOR, VT: MVT::v512i1, Action: Custom);
1401 }
1402
1403 if (Subtarget.has64BitSupport())
1404 setOperationAction(Op: ISD::PREFETCH, VT: MVT::Other, Action: Legal);
1405
1406 if (Subtarget.isISA3_1())
1407 setOperationAction(Op: ISD::SRA, VT: MVT::v1i128, Action: Legal);
1408
1409 setOperationAction(Op: ISD::READCYCLECOUNTER, VT: MVT::i64, Action: isPPC64 ? Legal : Custom);
1410
1411 if (!isPPC64) {
1412 setOperationAction(Op: ISD::ATOMIC_LOAD, VT: MVT::i64, Action: Expand);
1413 setOperationAction(Op: ISD::ATOMIC_STORE, VT: MVT::i64, Action: Expand);
1414 }
1415
1416 if (shouldInlineQuadwordAtomics()) {
1417 setOperationAction(Op: ISD::ATOMIC_LOAD, VT: MVT::i128, Action: Custom);
1418 setOperationAction(Op: ISD::ATOMIC_STORE, VT: MVT::i128, Action: Custom);
1419 setOperationAction(Op: ISD::INTRINSIC_VOID, VT: MVT::i128, Action: Custom);
1420 }
1421
1422 setBooleanContents(ZeroOrOneBooleanContent);
1423
1424 if (Subtarget.hasAltivec()) {
1425 // Altivec instructions set fields to all zeros or all ones.
1426 setBooleanVectorContents(ZeroOrNegativeOneBooleanContent);
1427 }
1428
1429 if (shouldInlineQuadwordAtomics())
1430 setMaxAtomicSizeInBitsSupported(128);
1431 else if (isPPC64)
1432 setMaxAtomicSizeInBitsSupported(64);
1433 else
1434 setMaxAtomicSizeInBitsSupported(32);
1435
1436 setStackPointerRegisterToSaveRestore(isPPC64 ? PPC::X1 : PPC::R1);
1437
1438 // We have target-specific dag combine patterns for the following nodes:
1439 setTargetDAGCombine({ISD::AND, ISD::ADD, ISD::XOR, ISD::SHL, ISD::SRA,
1440 ISD::SRL, ISD::MUL, ISD::FMA, ISD::SINT_TO_FP,
1441 ISD::BUILD_VECTOR});
1442 if (Subtarget.hasFPCVT())
1443 setTargetDAGCombine(ISD::UINT_TO_FP);
1444 setTargetDAGCombine({ISD::LOAD, ISD::STORE, ISD::BR_CC});
1445 if (Subtarget.useCRBits())
1446 setTargetDAGCombine(ISD::BRCOND);
1447 setTargetDAGCombine({ISD::BSWAP, ISD::INTRINSIC_WO_CHAIN,
1448 ISD::INTRINSIC_W_CHAIN, ISD::INTRINSIC_VOID});
1449
1450 setTargetDAGCombine({ISD::SIGN_EXTEND, ISD::ZERO_EXTEND, ISD::ANY_EXTEND});
1451
1452 setTargetDAGCombine({ISD::TRUNCATE, ISD::VECTOR_SHUFFLE});
1453
1454 if (Subtarget.useCRBits()) {
1455 setTargetDAGCombine({ISD::TRUNCATE, ISD::SETCC, ISD::SELECT_CC});
1456 }
1457
1458 // With 32 condition bits, we don't need to sink (and duplicate) compares
1459 // aggressively in CodeGenPrep.
1460 if (Subtarget.useCRBits()) {
1461 setJumpIsExpensive();
1462 }
1463
1464 // TODO: The default entry number is set to 64. This stops most jump table
1465 // generation on PPC. But it is good for current PPC HWs because the indirect
1466 // branch instruction mtctr to the jump table may lead to bad branch predict.
1467 // Re-evaluate this value on future HWs that can do better with mtctr.
1468 setMinimumJumpTableEntries(PPCMinimumJumpTableEntries);
1469
1470 // The default minimum of largest number in a BitTest cluster is 3.
1471 setMinimumBitTestCmps(PPCMinimumBitTestCmps);
1472
1473 setMinFunctionAlignment(Align(4));
1474 setMinCmpXchgSizeInBits(Subtarget.hasPartwordAtomics() ? 8 : 32);
1475
1476 auto CPUDirective = Subtarget.getCPUDirective();
1477 switch (CPUDirective) {
1478 default: break;
1479 case PPC::DIR_970:
1480 case PPC::DIR_A2:
1481 case PPC::DIR_E500:
1482 case PPC::DIR_E500mc:
1483 case PPC::DIR_E5500:
1484 case PPC::DIR_PWR4:
1485 case PPC::DIR_PWR5:
1486 case PPC::DIR_PWR5X:
1487 case PPC::DIR_PWR6:
1488 case PPC::DIR_PWR6X:
1489 case PPC::DIR_PWR7:
1490 case PPC::DIR_PWR8:
1491 case PPC::DIR_PWR9:
1492 case PPC::DIR_PWR10:
1493 case PPC::DIR_PWR11:
1494 case PPC::DIR_PWR_FUTURE:
1495 setPrefLoopAlignment(Align(16));
1496 setPrefFunctionAlignment(Align(16));
1497 break;
1498 }
1499
1500 if (Subtarget.enableMachineScheduler())
1501 setSchedulingPreference(Sched::Source);
1502 else
1503 setSchedulingPreference(Sched::Hybrid);
1504
1505 computeRegisterProperties(TRI: STI.getRegisterInfo());
1506
1507 // The Freescale cores do better with aggressive inlining of memcpy and
1508 // friends. GCC uses same threshold of 128 bytes (= 32 word stores).
1509 if (CPUDirective == PPC::DIR_E500mc || CPUDirective == PPC::DIR_E5500) {
1510 MaxStoresPerMemset = 32;
1511 MaxStoresPerMemsetOptSize = 16;
1512 MaxStoresPerMemcpy = 32;
1513 MaxStoresPerMemcpyOptSize = 8;
1514 MaxStoresPerMemmove = 32;
1515 MaxStoresPerMemmoveOptSize = 8;
1516 } else if (CPUDirective == PPC::DIR_A2) {
1517 // The A2 also benefits from (very) aggressive inlining of memcpy and
    // friends. The overhead of the function call, even when warm, can be
1519 // over one hundred cycles.
1520 MaxStoresPerMemset = 128;
1521 MaxStoresPerMemcpy = 128;
1522 MaxStoresPerMemmove = 128;
1523 MaxLoadsPerMemcmp = 128;
1524 } else {
1525 MaxLoadsPerMemcmp = 8;
1526 MaxLoadsPerMemcmpOptSize = 4;
1527 }
1528
1529 // Enable generation of STXVP instructions by default for mcpu=future.
1530 if (CPUDirective == PPC::DIR_PWR_FUTURE &&
1531 DisableAutoPairedVecSt.getNumOccurrences() == 0)
1532 DisableAutoPairedVecSt = false;
1533
1534 IsStrictFPEnabled = true;
1535
1536 // Let the subtarget (CPU) decide if a predictable select is more expensive
1537 // than the corresponding branch. This information is used in CGP to decide
1538 // when to convert selects into branches.
1539 PredictableSelectIsExpensive = Subtarget.isPredictableSelectIsExpensive();
1540
1541 GatherAllAliasesMaxDepth = PPCGatherAllAliasesMaxDepth;
1542}
1543
1544// *********************************** NOTE ************************************
1545// For selecting load and store instructions, the addressing modes are defined
1546// as ComplexPatterns in PPCInstrInfo.td, which are then utilized in the TD
// patterns to match the load and store instructions.
1548//
1549// The TD definitions for the addressing modes correspond to their respective
1550// Select<AddrMode>Form() function in PPCISelDAGToDAG.cpp. These functions rely
1551// on SelectOptimalAddrMode(), which calls computeMOFlags() to compute the
1552// address mode flags of a particular node. Afterwards, the computed address
1553// flags are passed into getAddrModeForFlags() in order to retrieve the optimal
1554// addressing mode. SelectOptimalAddrMode() then sets the Base and Displacement
1555// accordingly, based on the preferred addressing mode.
1556//
1557// Within PPCISelLowering.h, there are two enums: MemOpFlags and AddrMode.
1558// MemOpFlags contains all the possible flags that can be used to compute the
1559// optimal addressing mode for load and store instructions.
1560// AddrMode contains all the possible load and store addressing modes available
1561// on Power (such as DForm, DSForm, DQForm, XForm, etc.)
1562//
1563// When adding new load and store instructions, it is possible that new address
1564// flags may need to be added into MemOpFlags, and a new addressing mode will
1565// need to be added to AddrMode. An entry of the new addressing mode (consisting
1566// of the minimal and main distinguishing address flags for the new load/store
1567// instructions) will need to be added into initializeAddrModeMap() below.
1568// Finally, when adding new addressing modes, the getAddrModeForFlags() will
1569// need to be updated to account for selecting the optimal addressing mode.
1570// *****************************************************************************
/// Initialize the map that relates the different addressing modes of the load
/// and store instructions to a set of flags. This ensures the load/store
/// instruction is correctly matched during instruction selection.
void PPCTargetLowering::initializeAddrModeMap() {
  // Each entry below is a complete MemOpFlags mask that selects the keyed
  // addressing mode: computeMOFlags() produces one such mask per memory
  // operation, and getAddrModeForFlags() looks it up here.
  AddrModesMap[PPC::AM_DForm] = {
      // LWZ, STW
      PPC::MOF_ZExt | PPC::MOF_RPlusSImm16 | PPC::MOF_WordInt,
      PPC::MOF_ZExt | PPC::MOF_RPlusLo | PPC::MOF_WordInt,
      PPC::MOF_ZExt | PPC::MOF_NotAddNorCst | PPC::MOF_WordInt,
      PPC::MOF_ZExt | PPC::MOF_AddrIsSImm32 | PPC::MOF_WordInt,
      // LBZ, LHZ, STB, STH
      PPC::MOF_ZExt | PPC::MOF_RPlusSImm16 | PPC::MOF_SubWordInt,
      PPC::MOF_ZExt | PPC::MOF_RPlusLo | PPC::MOF_SubWordInt,
      PPC::MOF_ZExt | PPC::MOF_NotAddNorCst | PPC::MOF_SubWordInt,
      PPC::MOF_ZExt | PPC::MOF_AddrIsSImm32 | PPC::MOF_SubWordInt,
      // LHA
      PPC::MOF_SExt | PPC::MOF_RPlusSImm16 | PPC::MOF_SubWordInt,
      PPC::MOF_SExt | PPC::MOF_RPlusLo | PPC::MOF_SubWordInt,
      PPC::MOF_SExt | PPC::MOF_NotAddNorCst | PPC::MOF_SubWordInt,
      PPC::MOF_SExt | PPC::MOF_AddrIsSImm32 | PPC::MOF_SubWordInt,
      // LFS, LFD, STFS, STFD
      // Scalar FP uses D-Form only on pre-P9 subtargets; P9 moves these to
      // the DS-Form entries below.
      PPC::MOF_RPlusSImm16 | PPC::MOF_ScalarFloat | PPC::MOF_SubtargetBeforeP9,
      PPC::MOF_RPlusLo | PPC::MOF_ScalarFloat | PPC::MOF_SubtargetBeforeP9,
      PPC::MOF_NotAddNorCst | PPC::MOF_ScalarFloat | PPC::MOF_SubtargetBeforeP9,
      PPC::MOF_AddrIsSImm32 | PPC::MOF_ScalarFloat | PPC::MOF_SubtargetBeforeP9,
  };
  // DS-Form requires the displacement to be a multiple of 4.
  AddrModesMap[PPC::AM_DSForm] = {
      // LWA
      PPC::MOF_SExt | PPC::MOF_RPlusSImm16Mult4 | PPC::MOF_WordInt,
      PPC::MOF_SExt | PPC::MOF_NotAddNorCst | PPC::MOF_WordInt,
      PPC::MOF_SExt | PPC::MOF_AddrIsSImm32 | PPC::MOF_WordInt,
      // LD, STD
      PPC::MOF_RPlusSImm16Mult4 | PPC::MOF_DoubleWordInt,
      PPC::MOF_NotAddNorCst | PPC::MOF_DoubleWordInt,
      PPC::MOF_AddrIsSImm32 | PPC::MOF_DoubleWordInt,
      // DFLOADf32, DFLOADf64, DSTOREf32, DSTOREf64
      PPC::MOF_RPlusSImm16Mult4 | PPC::MOF_ScalarFloat | PPC::MOF_SubtargetP9,
      PPC::MOF_NotAddNorCst | PPC::MOF_ScalarFloat | PPC::MOF_SubtargetP9,
      PPC::MOF_AddrIsSImm32 | PPC::MOF_ScalarFloat | PPC::MOF_SubtargetP9,
  };
  // DQ-Form requires the displacement to be a multiple of 16.
  AddrModesMap[PPC::AM_DQForm] = {
      // LXV, STXV
      PPC::MOF_RPlusSImm16Mult16 | PPC::MOF_Vector | PPC::MOF_SubtargetP9,
      PPC::MOF_NotAddNorCst | PPC::MOF_Vector | PPC::MOF_SubtargetP9,
      PPC::MOF_AddrIsSImm32 | PPC::MOF_Vector | PPC::MOF_SubtargetP9,
  };
  // Prefixed instructions (P10) take a signed 34-bit displacement.
  AddrModesMap[PPC::AM_PrefixDForm] = {PPC::MOF_RPlusSImm34 |
                                       PPC::MOF_SubtargetP10};
  // TODO: Add mapping for quadword load/store.
}
1621
1622/// getMaxByValAlign - Helper for getByValTypeAlignment to determine
1623/// the desired ByVal argument alignment.
1624static void getMaxByValAlign(Type *Ty, Align &MaxAlign, Align MaxMaxAlign) {
1625 if (MaxAlign == MaxMaxAlign)
1626 return;
1627 if (VectorType *VTy = dyn_cast<VectorType>(Val: Ty)) {
1628 if (MaxMaxAlign >= 32 &&
1629 VTy->getPrimitiveSizeInBits().getFixedValue() >= 256)
1630 MaxAlign = Align(32);
1631 else if (VTy->getPrimitiveSizeInBits().getFixedValue() >= 128 &&
1632 MaxAlign < 16)
1633 MaxAlign = Align(16);
1634 } else if (ArrayType *ATy = dyn_cast<ArrayType>(Val: Ty)) {
1635 Align EltAlign;
1636 getMaxByValAlign(Ty: ATy->getElementType(), MaxAlign&: EltAlign, MaxMaxAlign);
1637 if (EltAlign > MaxAlign)
1638 MaxAlign = EltAlign;
1639 } else if (StructType *STy = dyn_cast<StructType>(Val: Ty)) {
1640 for (auto *EltTy : STy->elements()) {
1641 Align EltAlign;
1642 getMaxByValAlign(Ty: EltTy, MaxAlign&: EltAlign, MaxMaxAlign);
1643 if (EltAlign > MaxAlign)
1644 MaxAlign = EltAlign;
1645 if (MaxAlign == MaxMaxAlign)
1646 break;
1647 }
1648 }
1649}
1650
1651/// getByValTypeAlignment - Return the desired alignment for ByVal aggregate
1652/// function arguments in the caller parameter area.
1653Align PPCTargetLowering::getByValTypeAlignment(Type *Ty,
1654 const DataLayout &DL) const {
1655 // 16byte and wider vectors are passed on 16byte boundary.
1656 // The rest is 8 on PPC64 and 4 on PPC32 boundary.
1657 Align Alignment = Subtarget.isPPC64() ? Align(8) : Align(4);
1658 if (Subtarget.hasAltivec())
1659 getMaxByValAlign(Ty, MaxAlign&: Alignment, MaxMaxAlign: Align(16));
1660 return Alignment;
1661}
1662
// Forward the soft-float query to the subtarget.
bool PPCTargetLowering::useSoftFloat() const {
  return Subtarget.useSoftFloat();
}
1666
// Forward the SPE (Signal Processing Engine) query to the subtarget.
bool PPCTargetLowering::hasSPE() const {
  return Subtarget.hasSPE();
}
1670
// Prefer the (add (add x, y), 1) form over (sub (add x, y'), (xor ...)) only
// for scalar integer types; vector types keep the sub-of-not form.
bool PPCTargetLowering::preferIncOfAddToSubOfNot(EVT VT) const {
  return VT.isScalarInteger();
}
1674
1675bool PPCTargetLowering::shallExtractConstSplatVectorElementToStore(
1676 Type *VectorTy, unsigned ElemSizeInBits, unsigned &Index) const {
1677 if (!Subtarget.isPPC64() || !Subtarget.hasVSX())
1678 return false;
1679
1680 if (auto *VTy = dyn_cast<VectorType>(Val: VectorTy)) {
1681 if (VTy->getScalarType()->isIntegerTy()) {
1682 // ElemSizeInBits 8/16 can fit in immediate field, not needed here.
1683 if (ElemSizeInBits == 32) {
1684 Index = Subtarget.isLittleEndian() ? 2 : 1;
1685 return true;
1686 }
1687 if (ElemSizeInBits == 64) {
1688 Index = Subtarget.isLittleEndian() ? 1 : 0;
1689 return true;
1690 }
1691 }
1692 }
1693 return false;
1694}
1695
1696EVT PPCTargetLowering::getSetCCResultType(const DataLayout &DL, LLVMContext &C,
1697 EVT VT) const {
1698 if (!VT.isVector())
1699 return Subtarget.useCRBits() ? MVT::i1 : MVT::i32;
1700
1701 return VT.changeVectorElementTypeToInteger();
1702}
1703
// Aggressive fusing of multiply-add chains is always enabled on PPC,
// regardless of the floating-point type.
bool PPCTargetLowering::enableAggressiveFMAFusion(EVT VT) const {
  assert(VT.isFloatingPoint() && "Non-floating-point FMA?");
  return true;
}
1708
1709//===----------------------------------------------------------------------===//
1710// Node matching predicates, for use by the tblgen matching code.
1711//===----------------------------------------------------------------------===//
1712
1713/// isFloatingPointZero - Return true if this is 0.0 or -0.0.
1714static bool isFloatingPointZero(SDValue Op) {
1715 if (ConstantFPSDNode *CFP = dyn_cast<ConstantFPSDNode>(Val&: Op))
1716 return CFP->getValueAPF().isZero();
1717 else if (ISD::isEXTLoad(N: Op.getNode()) || ISD::isNON_EXTLoad(N: Op.getNode())) {
1718 // Maybe this has already been legalized into the constant pool?
1719 if (ConstantPoolSDNode *CP = dyn_cast<ConstantPoolSDNode>(Val: Op.getOperand(i: 1)))
1720 if (const ConstantFP *CFP = dyn_cast<ConstantFP>(Val: CP->getConstVal()))
1721 return CFP->getValueAPF().isZero();
1722 }
1723 return false;
1724}
1725
/// isConstantOrUndef - Op is either an undef node (a negative mask index) or
/// a ConstantSDNode. Return true if Op is undef or if it matches Val.
static bool isConstantOrUndef(int Op, int Val) {
  if (Op < 0) // Undef mask elements match anything.
    return true;
  return Op == Val;
}
1731
1732/// isVPKUHUMShuffleMask - Return true if this is the shuffle mask for a
1733/// VPKUHUM instruction.
1734/// The ShuffleKind distinguishes between big-endian operations with
1735/// two different inputs (0), either-endian operations with two identical
1736/// inputs (1), and little-endian operations with two different inputs (2).
1737/// For the latter, the input operands are swapped (see PPCInstrAltivec.td).
1738bool PPC::isVPKUHUMShuffleMask(ShuffleVectorSDNode *N, unsigned ShuffleKind,
1739 SelectionDAG &DAG) {
1740 bool IsLE = DAG.getDataLayout().isLittleEndian();
1741 if (ShuffleKind == 0) {
1742 if (IsLE)
1743 return false;
1744 for (unsigned i = 0; i != 16; ++i)
1745 if (!isConstantOrUndef(Op: N->getMaskElt(Idx: i), Val: i*2+1))
1746 return false;
1747 } else if (ShuffleKind == 2) {
1748 if (!IsLE)
1749 return false;
1750 for (unsigned i = 0; i != 16; ++i)
1751 if (!isConstantOrUndef(Op: N->getMaskElt(Idx: i), Val: i*2))
1752 return false;
1753 } else if (ShuffleKind == 1) {
1754 unsigned j = IsLE ? 0 : 1;
1755 for (unsigned i = 0; i != 8; ++i)
1756 if (!isConstantOrUndef(Op: N->getMaskElt(Idx: i), Val: i*2+j) ||
1757 !isConstantOrUndef(Op: N->getMaskElt(Idx: i+8), Val: i*2+j))
1758 return false;
1759 }
1760 return true;
1761}
1762
1763/// isVPKUWUMShuffleMask - Return true if this is the shuffle mask for a
1764/// VPKUWUM instruction.
1765/// The ShuffleKind distinguishes between big-endian operations with
1766/// two different inputs (0), either-endian operations with two identical
1767/// inputs (1), and little-endian operations with two different inputs (2).
1768/// For the latter, the input operands are swapped (see PPCInstrAltivec.td).
1769bool PPC::isVPKUWUMShuffleMask(ShuffleVectorSDNode *N, unsigned ShuffleKind,
1770 SelectionDAG &DAG) {
1771 bool IsLE = DAG.getDataLayout().isLittleEndian();
1772 if (ShuffleKind == 0) {
1773 if (IsLE)
1774 return false;
1775 for (unsigned i = 0; i != 16; i += 2)
1776 if (!isConstantOrUndef(Op: N->getMaskElt(Idx: i ), Val: i*2+2) ||
1777 !isConstantOrUndef(Op: N->getMaskElt(Idx: i+1), Val: i*2+3))
1778 return false;
1779 } else if (ShuffleKind == 2) {
1780 if (!IsLE)
1781 return false;
1782 for (unsigned i = 0; i != 16; i += 2)
1783 if (!isConstantOrUndef(Op: N->getMaskElt(Idx: i ), Val: i*2) ||
1784 !isConstantOrUndef(Op: N->getMaskElt(Idx: i+1), Val: i*2+1))
1785 return false;
1786 } else if (ShuffleKind == 1) {
1787 unsigned j = IsLE ? 0 : 2;
1788 for (unsigned i = 0; i != 8; i += 2)
1789 if (!isConstantOrUndef(Op: N->getMaskElt(Idx: i ), Val: i*2+j) ||
1790 !isConstantOrUndef(Op: N->getMaskElt(Idx: i+1), Val: i*2+j+1) ||
1791 !isConstantOrUndef(Op: N->getMaskElt(Idx: i+8), Val: i*2+j) ||
1792 !isConstantOrUndef(Op: N->getMaskElt(Idx: i+9), Val: i*2+j+1))
1793 return false;
1794 }
1795 return true;
1796}
1797
1798/// isVPKUDUMShuffleMask - Return true if this is the shuffle mask for a
1799/// VPKUDUM instruction, AND the VPKUDUM instruction exists for the
1800/// current subtarget.
1801///
1802/// The ShuffleKind distinguishes between big-endian operations with
1803/// two different inputs (0), either-endian operations with two identical
1804/// inputs (1), and little-endian operations with two different inputs (2).
1805/// For the latter, the input operands are swapped (see PPCInstrAltivec.td).
1806bool PPC::isVPKUDUMShuffleMask(ShuffleVectorSDNode *N, unsigned ShuffleKind,
1807 SelectionDAG &DAG) {
1808 const PPCSubtarget &Subtarget = DAG.getSubtarget<PPCSubtarget>();
1809 if (!Subtarget.hasP8Vector())
1810 return false;
1811
1812 bool IsLE = DAG.getDataLayout().isLittleEndian();
1813 if (ShuffleKind == 0) {
1814 if (IsLE)
1815 return false;
1816 for (unsigned i = 0; i != 16; i += 4)
1817 if (!isConstantOrUndef(Op: N->getMaskElt(Idx: i ), Val: i*2+4) ||
1818 !isConstantOrUndef(Op: N->getMaskElt(Idx: i+1), Val: i*2+5) ||
1819 !isConstantOrUndef(Op: N->getMaskElt(Idx: i+2), Val: i*2+6) ||
1820 !isConstantOrUndef(Op: N->getMaskElt(Idx: i+3), Val: i*2+7))
1821 return false;
1822 } else if (ShuffleKind == 2) {
1823 if (!IsLE)
1824 return false;
1825 for (unsigned i = 0; i != 16; i += 4)
1826 if (!isConstantOrUndef(Op: N->getMaskElt(Idx: i ), Val: i*2) ||
1827 !isConstantOrUndef(Op: N->getMaskElt(Idx: i+1), Val: i*2+1) ||
1828 !isConstantOrUndef(Op: N->getMaskElt(Idx: i+2), Val: i*2+2) ||
1829 !isConstantOrUndef(Op: N->getMaskElt(Idx: i+3), Val: i*2+3))
1830 return false;
1831 } else if (ShuffleKind == 1) {
1832 unsigned j = IsLE ? 0 : 4;
1833 for (unsigned i = 0; i != 8; i += 4)
1834 if (!isConstantOrUndef(Op: N->getMaskElt(Idx: i ), Val: i*2+j) ||
1835 !isConstantOrUndef(Op: N->getMaskElt(Idx: i+1), Val: i*2+j+1) ||
1836 !isConstantOrUndef(Op: N->getMaskElt(Idx: i+2), Val: i*2+j+2) ||
1837 !isConstantOrUndef(Op: N->getMaskElt(Idx: i+3), Val: i*2+j+3) ||
1838 !isConstantOrUndef(Op: N->getMaskElt(Idx: i+8), Val: i*2+j) ||
1839 !isConstantOrUndef(Op: N->getMaskElt(Idx: i+9), Val: i*2+j+1) ||
1840 !isConstantOrUndef(Op: N->getMaskElt(Idx: i+10), Val: i*2+j+2) ||
1841 !isConstantOrUndef(Op: N->getMaskElt(Idx: i+11), Val: i*2+j+3))
1842 return false;
1843 }
1844 return true;
1845}
1846
1847/// isVMerge - Common function, used to match vmrg* shuffles.
1848///
static bool isVMerge(ShuffleVectorSDNode *N, unsigned UnitSize,
                     unsigned LHSStart, unsigned RHSStart) {
  // vmrg* instructions operate only on full 16-byte Altivec vectors.
  if (N->getValueType(ResNo: 0) != MVT::v16i8)
    return false;
  assert((UnitSize == 1 || UnitSize == 2 || UnitSize == 4) &&
         "Unsupported merge size!");

  // A merge interleaves 8/UnitSize units taken from byte LHSStart onward
  // with 8/UnitSize units taken from byte RHSStart onward: result unit 2*i
  // must come from LHSStart + i*UnitSize and result unit 2*i+1 from
  // RHSStart + i*UnitSize. Undef mask elements match anything.
  for (unsigned i = 0; i != 8/UnitSize; ++i)     // Step over units
    for (unsigned j = 0; j != UnitSize; ++j) {   // Step over bytes within unit
      if (!isConstantOrUndef(Op: N->getMaskElt(Idx: i*UnitSize*2+j),
                             Val: LHSStart+j+i*UnitSize) ||
          !isConstantOrUndef(Op: N->getMaskElt(Idx: i*UnitSize*2+UnitSize+j),
                             Val: RHSStart+j+i*UnitSize))
        return false;
    }
  return true;
}
1866
1867/// isVMRGLShuffleMask - Return true if this is a shuffle mask suitable for
1868/// a VMRGL* instruction with the specified unit size (1,2 or 4 bytes).
1869/// The ShuffleKind distinguishes between big-endian merges with two
1870/// different inputs (0), either-endian merges with two identical inputs (1),
1871/// and little-endian merges with two different inputs (2). For the latter,
1872/// the input operands are swapped (see PPCInstrAltivec.td).
1873bool PPC::isVMRGLShuffleMask(ShuffleVectorSDNode *N, unsigned UnitSize,
1874 unsigned ShuffleKind, SelectionDAG &DAG) {
1875 if (DAG.getDataLayout().isLittleEndian()) {
1876 if (ShuffleKind == 1) // unary
1877 return isVMerge(N, UnitSize, LHSStart: 0, RHSStart: 0);
1878 else if (ShuffleKind == 2) // swapped
1879 return isVMerge(N, UnitSize, LHSStart: 0, RHSStart: 16);
1880 else
1881 return false;
1882 } else {
1883 if (ShuffleKind == 1) // unary
1884 return isVMerge(N, UnitSize, LHSStart: 8, RHSStart: 8);
1885 else if (ShuffleKind == 0) // normal
1886 return isVMerge(N, UnitSize, LHSStart: 8, RHSStart: 24);
1887 else
1888 return false;
1889 }
1890}
1891
1892/// isVMRGHShuffleMask - Return true if this is a shuffle mask suitable for
1893/// a VMRGH* instruction with the specified unit size (1,2 or 4 bytes).
1894/// The ShuffleKind distinguishes between big-endian merges with two
1895/// different inputs (0), either-endian merges with two identical inputs (1),
1896/// and little-endian merges with two different inputs (2). For the latter,
1897/// the input operands are swapped (see PPCInstrAltivec.td).
1898bool PPC::isVMRGHShuffleMask(ShuffleVectorSDNode *N, unsigned UnitSize,
1899 unsigned ShuffleKind, SelectionDAG &DAG) {
1900 if (DAG.getDataLayout().isLittleEndian()) {
1901 if (ShuffleKind == 1) // unary
1902 return isVMerge(N, UnitSize, LHSStart: 8, RHSStart: 8);
1903 else if (ShuffleKind == 2) // swapped
1904 return isVMerge(N, UnitSize, LHSStart: 8, RHSStart: 24);
1905 else
1906 return false;
1907 } else {
1908 if (ShuffleKind == 1) // unary
1909 return isVMerge(N, UnitSize, LHSStart: 0, RHSStart: 0);
1910 else if (ShuffleKind == 0) // normal
1911 return isVMerge(N, UnitSize, LHSStart: 0, RHSStart: 16);
1912 else
1913 return false;
1914 }
1915}
1916
1917/**
1918 * Common function used to match vmrgew and vmrgow shuffles
1919 *
 * The indexOffset determines whether to look for even or odd words in
 * the shuffle mask. This is based on the endianness of the target
 * machine.
1923 * - Little Endian:
1924 * - Use offset of 0 to check for odd elements
1925 * - Use offset of 4 to check for even elements
1926 * - Big Endian:
1927 * - Use offset of 0 to check for even elements
1928 * - Use offset of 4 to check for odd elements
1929 * A detailed description of the vector element ordering for little endian and
1930 * big endian can be found at
1931 * http://www.ibm.com/developerworks/library/l-ibm-xl-c-cpp-compiler/index.html
1932 * Targeting your applications - what little endian and big endian IBM XL C/C++
1933 * compiler differences mean to you
1934 *
1935 * The mask to the shuffle vector instruction specifies the indices of the
1936 * elements from the two input vectors to place in the result. The elements are
1937 * numbered in array-access order, starting with the first vector. These vectors
 * are always of type v16i8, thus each vector will contain 16 elements, each
 * 8 bits in size. More info on the shuffle vector can be found in the
1940 * http://llvm.org/docs/LangRef.html#shufflevector-instruction
1941 * Language Reference.
1942 *
1943 * The RHSStartValue indicates whether the same input vectors are used (unary)
1944 * or two different input vectors are used, based on the following:
1945 * - If the instruction uses the same vector for both inputs, the range of the
1946 * indices will be 0 to 15. In this case, the RHSStart value passed should
1947 * be 0.
1948 * - If the instruction has two different vectors then the range of the
1949 * indices will be 0 to 31. In this case, the RHSStart value passed should
1950 * be 16 (indices 0-15 specify elements in the first vector while indices 16
1951 * to 31 specify elements in the second vector).
1952 *
1953 * \param[in] N The shuffle vector SD Node to analyze
1954 * \param[in] IndexOffset Specifies whether to look for even or odd elements
1955 * \param[in] RHSStartValue Specifies the starting index for the righthand input
1956 * vector to the shuffle_vector instruction
1957 * \return true iff this shuffle vector represents an even or odd word merge
1958 */
1959static bool isVMerge(ShuffleVectorSDNode *N, unsigned IndexOffset,
1960 unsigned RHSStartValue) {
1961 if (N->getValueType(ResNo: 0) != MVT::v16i8)
1962 return false;
1963
1964 for (unsigned i = 0; i < 2; ++i)
1965 for (unsigned j = 0; j < 4; ++j)
1966 if (!isConstantOrUndef(Op: N->getMaskElt(Idx: i*4+j),
1967 Val: i*RHSStartValue+j+IndexOffset) ||
1968 !isConstantOrUndef(Op: N->getMaskElt(Idx: i*4+j+8),
1969 Val: i*RHSStartValue+j+IndexOffset+8))
1970 return false;
1971 return true;
1972}
1973
1974/**
1975 * Determine if the specified shuffle mask is suitable for the vmrgew or
1976 * vmrgow instructions.
1977 *
1978 * \param[in] N The shuffle vector SD Node to analyze
1979 * \param[in] CheckEven Check for an even merge (true) or an odd merge (false)
1980 * \param[in] ShuffleKind Identify the type of merge:
1981 * - 0 = big-endian merge with two different inputs;
1982 * - 1 = either-endian merge with two identical inputs;
1983 * - 2 = little-endian merge with two different inputs (inputs are swapped for
1984 * little-endian merges).
1985 * \param[in] DAG The current SelectionDAG
 * \return true iff this shuffle mask is suitable for the requested
 * even/odd word merge under the given ShuffleKind
1987 */
1988bool PPC::isVMRGEOShuffleMask(ShuffleVectorSDNode *N, bool CheckEven,
1989 unsigned ShuffleKind, SelectionDAG &DAG) {
1990 if (DAG.getDataLayout().isLittleEndian()) {
1991 unsigned indexOffset = CheckEven ? 4 : 0;
1992 if (ShuffleKind == 1) // Unary
1993 return isVMerge(N, IndexOffset: indexOffset, RHSStartValue: 0);
1994 else if (ShuffleKind == 2) // swapped
1995 return isVMerge(N, IndexOffset: indexOffset, RHSStartValue: 16);
1996 else
1997 return false;
1998 }
1999 else {
2000 unsigned indexOffset = CheckEven ? 0 : 4;
2001 if (ShuffleKind == 1) // Unary
2002 return isVMerge(N, IndexOffset: indexOffset, RHSStartValue: 0);
2003 else if (ShuffleKind == 0) // Normal
2004 return isVMerge(N, IndexOffset: indexOffset, RHSStartValue: 16);
2005 else
2006 return false;
2007 }
2008 return false;
2009}
2010
2011/// isVSLDOIShuffleMask - If this is a vsldoi shuffle mask, return the shift
2012/// amount, otherwise return -1.
2013/// The ShuffleKind distinguishes between big-endian operations with two
2014/// different inputs (0), either-endian operations with two identical inputs
2015/// (1), and little-endian operations with two different inputs (2). For the
2016/// latter, the input operands are swapped (see PPCInstrAltivec.td).
int PPC::isVSLDOIShuffleMask(SDNode *N, unsigned ShuffleKind,
                             SelectionDAG &DAG) {
  if (N->getValueType(ResNo: 0) != MVT::v16i8)
    return -1;

  ShuffleVectorSDNode *SVOp = cast<ShuffleVectorSDNode>(Val: N);

  // Find the first non-undef value in the shuffle mask.
  unsigned i;
  for (i = 0; i != 16 && SVOp->getMaskElt(Idx: i) < 0; ++i)
    /*search*/;

  if (i == 16) return -1;  // all undef.

  // Otherwise, check to see if the rest of the elements are consecutively
  // numbered from this value. For position i the mask value must be
  // ShiftAmt + i, so the first defined element cannot be smaller than its
  // position.
  unsigned ShiftAmt = SVOp->getMaskElt(Idx: i);
  if (ShiftAmt < i) return -1;

  // Recover the candidate shift amount from the first defined element.
  ShiftAmt -= i;
  bool isLE = DAG.getDataLayout().isLittleEndian();

  if ((ShuffleKind == 0 && !isLE) || (ShuffleKind == 2 && isLE)) {
    // Two-input form: elements must be strictly consecutive; indices may
    // run past 15 into the second input vector.
    for (++i; i != 16; ++i)
      if (!isConstantOrUndef(Op: SVOp->getMaskElt(Idx: i), Val: ShiftAmt+i))
        return -1;
  } else if (ShuffleKind == 1) {
    // Unary form: both inputs are the same vector, so indices wrap mod 16.
    for (++i; i != 16; ++i)
      if (!isConstantOrUndef(Op: SVOp->getMaskElt(Idx: i), Val: (ShiftAmt+i) & 15))
        return -1;
  } else
    return -1;

  // On little-endian the shift is counted from the other end of the register
  // because the operands are swapped (see PPCInstrAltivec.td).
  if (isLE)
    ShiftAmt = 16 - ShiftAmt;

  return ShiftAmt;
}
2057
2058/// isSplatShuffleMask - Return true if the specified VECTOR_SHUFFLE operand
2059/// specifies a splat of a single element that is suitable for input to
2060/// one of the splat operations (VSPLTB/VSPLTH/VSPLTW/XXSPLTW/LXVDSX/etc.).
bool PPC::isSplatShuffleMask(ShuffleVectorSDNode *N, unsigned EltSize) {
  EVT VT = N->getValueType(ResNo: 0);
  // 64-bit element vectors have a two-entry mask; a splat simply means both
  // entries select the same element.
  if (VT == MVT::v2i64 || VT == MVT::v2f64)
    return EltSize == 8 && N->getMaskElt(Idx: 0) == N->getMaskElt(Idx: 1);

  assert(VT == MVT::v16i8 && isPowerOf2_32(EltSize) &&
         EltSize <= 8 && "Can only handle 1,2,4,8 byte element sizes");

  // The consecutive indices need to specify an element, not part of two
  // different elements. So abandon ship early if this isn't the case.
  // NOTE: an undef leading byte (-1) fails this test via unsigned
  // wrap-around, so masks whose first element is undef are rejected
  // (see the FIXME below).
  if (N->getMaskElt(Idx: 0) % EltSize != 0)
    return false;

  // This is a splat operation if each element of the permute is the same, and
  // if the value doesn't reference the second vector.
  unsigned ElementBase = N->getMaskElt(Idx: 0);

  // FIXME: Handle UNDEF elements too!
  if (ElementBase >= 16)
    return false;

  // Check that the indices are consecutive, in the case of a multi-byte element
  // splatted with a v16i8 mask.
  for (unsigned i = 1; i != EltSize; ++i)
    if (N->getMaskElt(Idx: i) < 0 || N->getMaskElt(Idx: i) != (int)(i+ElementBase))
      return false;

  // Every subsequent EltSize-byte group must match the first group exactly,
  // except that a group may be undef throughout.
  for (unsigned i = EltSize, e = 16; i != e; i += EltSize) {
    // An UNDEF element is a sequence of UNDEF bytes.
    if (N->getMaskElt(Idx: i) < 0) {
      // A group that starts undef must stay undef; a partially-undef group
      // would not be a clean replica of the splatted element.
      for (unsigned j = 1; j != EltSize; ++j)
        if (N->getMaskElt(Idx: i + j) >= 0)
          return false;
    } else
      for (unsigned j = 0; j != EltSize; ++j)
        if (N->getMaskElt(Idx: i + j) != N->getMaskElt(Idx: j))
          return false;
  }
  return true;
}
2101
2102/// Check that the mask is shuffling N byte elements. Within each N byte
2103/// element of the mask, the indices could be either in increasing or
2104/// decreasing order as long as they are consecutive.
2105/// \param[in] N the shuffle vector SD Node to analyze
2106/// \param[in] Width the element width in bytes, could be 2/4/8/16 (HalfWord/
2107/// Word/DoubleWord/QuadWord).
2108/// \param[in] StepLen the delta indices number among the N byte element, if
2109/// the mask is in increasing/decreasing order then it is 1/-1.
2110/// \return true iff the mask is shuffling N byte elements.
2111static bool isNByteElemShuffleMask(ShuffleVectorSDNode *N, unsigned Width,
2112 int StepLen) {
2113 assert((Width == 2 || Width == 4 || Width == 8 || Width == 16) &&
2114 "Unexpected element width.");
2115 assert((StepLen == 1 || StepLen == -1) && "Unexpected element width.");
2116
2117 unsigned NumOfElem = 16 / Width;
2118 unsigned MaskVal[16]; // Width is never greater than 16
2119 for (unsigned i = 0; i < NumOfElem; ++i) {
2120 MaskVal[0] = N->getMaskElt(Idx: i * Width);
2121 if ((StepLen == 1) && (MaskVal[0] % Width)) {
2122 return false;
2123 } else if ((StepLen == -1) && ((MaskVal[0] + 1) % Width)) {
2124 return false;
2125 }
2126
2127 for (unsigned int j = 1; j < Width; ++j) {
2128 MaskVal[j] = N->getMaskElt(Idx: i * Width + j);
2129 if (MaskVal[j] != MaskVal[j-1] + StepLen) {
2130 return false;
2131 }
2132 }
2133 }
2134
2135 return true;
2136}
2137
/// Check whether shuffle \p N can be lowered to an XXINSERTW instruction:
/// the result is one input vector with a single word replaced by a word
/// from the other (or the same) input.
/// \param[out] ShiftElts word rotation to apply to the source before insert
/// \param[out] InsertAtByte byte offset of the word being replaced
/// \param[out] Swap true iff the shuffle inputs must be swapped
/// \param[in] IsLE true iff the target is little-endian
/// \return true iff the mask matches an XXINSERTW pattern
bool PPC::isXXINSERTWMask(ShuffleVectorSDNode *N, unsigned &ShiftElts,
                          unsigned &InsertAtByte, bool &Swap, bool IsLE) {
  // Each word of the result must be a whole word of some input.
  if (!isNByteElemShuffleMask(N, Width: 4, StepLen: 1))
    return false;

  // Now we look at mask elements 0,4,8,12
  unsigned M0 = N->getMaskElt(Idx: 0) / 4;
  unsigned M1 = N->getMaskElt(Idx: 4) / 4;
  unsigned M2 = N->getMaskElt(Idx: 8) / 4;
  unsigned M3 = N->getMaskElt(Idx: 12) / 4;
  // Rotation needed to bring source word (index & 3) into insert position,
  // indexed by that word; differs between endiannesses.
  unsigned LittleEndianShifts[] = { 2, 1, 0, 3 };
  unsigned BigEndianShifts[] = { 3, 0, 1, 2 };

  // Below, let H and L be arbitrary elements of the shuffle mask
  // where H is in the range [4,7] and L is in the range [0,3].
  // H, 1, 2, 3 or L, 5, 6, 7
  if ((M0 > 3 && M1 == 1 && M2 == 2 && M3 == 3) ||
      (M0 < 4 && M1 == 5 && M2 == 6 && M3 == 7)) {
    ShiftElts = IsLE ? LittleEndianShifts[M0 & 0x3] : BigEndianShifts[M0 & 0x3];
    InsertAtByte = IsLE ? 12 : 0;
    Swap = M0 < 4;
    return true;
  }
  // 0, H, 2, 3 or 4, L, 6, 7
  if ((M1 > 3 && M0 == 0 && M2 == 2 && M3 == 3) ||
      (M1 < 4 && M0 == 4 && M2 == 6 && M3 == 7)) {
    ShiftElts = IsLE ? LittleEndianShifts[M1 & 0x3] : BigEndianShifts[M1 & 0x3];
    InsertAtByte = IsLE ? 8 : 4;
    Swap = M1 < 4;
    return true;
  }
  // 0, 1, H, 3 or 4, 5, L, 7
  if ((M2 > 3 && M0 == 0 && M1 == 1 && M3 == 3) ||
      (M2 < 4 && M0 == 4 && M1 == 5 && M3 == 7)) {
    ShiftElts = IsLE ? LittleEndianShifts[M2 & 0x3] : BigEndianShifts[M2 & 0x3];
    InsertAtByte = IsLE ? 4 : 8;
    Swap = M2 < 4;
    return true;
  }
  // 0, 1, 2, H or 4, 5, 6, L
  if ((M3 > 3 && M0 == 0 && M1 == 1 && M2 == 2) ||
      (M3 < 4 && M0 == 4 && M1 == 5 && M2 == 6)) {
    ShiftElts = IsLE ? LittleEndianShifts[M3 & 0x3] : BigEndianShifts[M3 & 0x3];
    InsertAtByte = IsLE ? 0 : 12;
    Swap = M3 < 4;
    return true;
  }

  // If both vector operands for the shuffle are the same vector, the mask will
  // contain only elements from the first one and the second one will be undef.
  if (N->getOperand(Num: 1).isUndef()) {
    ShiftElts = 0;
    Swap = true;
    // In the unary case only one source word position needs no rotation.
    unsigned XXINSERTWSrcElem = IsLE ? 2 : 1;
    if (M0 == XXINSERTWSrcElem && M1 == 1 && M2 == 2 && M3 == 3) {
      InsertAtByte = IsLE ? 12 : 0;
      return true;
    }
    if (M0 == 0 && M1 == XXINSERTWSrcElem && M2 == 2 && M3 == 3) {
      InsertAtByte = IsLE ? 8 : 4;
      return true;
    }
    if (M0 == 0 && M1 == 1 && M2 == XXINSERTWSrcElem && M3 == 3) {
      InsertAtByte = IsLE ? 4 : 8;
      return true;
    }
    if (M0 == 0 && M1 == 1 && M2 == 2 && M3 == XXINSERTWSrcElem) {
      InsertAtByte = IsLE ? 0 : 12;
      return true;
    }
  }

  return false;
}
2212
/// Check whether shuffle \p N can be lowered to an XXSLDWI (shift left
/// double by word immediate) instruction.
/// \param[out] ShiftElts the word shift count for the instruction
/// \param[out] Swap true iff the two shuffle inputs must be swapped
/// \param[in] IsLE true iff the target is little-endian
/// \return true iff the mask matches an XXSLDWI pattern
bool PPC::isXXSLDWIShuffleMask(ShuffleVectorSDNode *N, unsigned &ShiftElts,
                               bool &Swap, bool IsLE) {
  assert(N->getValueType(0) == MVT::v16i8 && "Shuffle vector expects v16i8");
  // Ensure each byte index of the word is consecutive.
  if (!isNByteElemShuffleMask(N, Width: 4, StepLen: 1))
    return false;

  // Now we look at mask elements 0,4,8,12, which are the beginning of words.
  unsigned M0 = N->getMaskElt(Idx: 0) / 4;
  unsigned M1 = N->getMaskElt(Idx: 4) / 4;
  unsigned M2 = N->getMaskElt(Idx: 8) / 4;
  unsigned M3 = N->getMaskElt(Idx: 12) / 4;

  // If both vector operands for the shuffle are the same vector, the mask will
  // contain only elements from the first one and the second one will be undef.
  if (N->getOperand(Num: 1).isUndef()) {
    assert(M0 < 4 && "Indexing into an undef vector?");
    // Unary case: words must rotate within the single input (mod 4).
    if (M1 != (M0 + 1) % 4 || M2 != (M1 + 1) % 4 || M3 != (M2 + 1) % 4)
      return false;

    ShiftElts = IsLE ? (4 - M0) % 4 : M0;
    Swap = false;
    return true;
  }

  // Ensure each word index of the ShuffleVector Mask is consecutive.
  // Two-input case: words rotate through the 8-word concatenation (mod 8).
  if (M1 != (M0 + 1) % 8 || M2 != (M1 + 1) % 8 || M3 != (M2 + 1) % 8)
    return false;

  if (IsLE) {
    if (M0 == 0 || M0 == 7 || M0 == 6 || M0 == 5) {
      // Input vectors don't need to be swapped if the leading element
      // of the result is one of the 3 left elements of the second vector
      // (or if there is no shift to be done at all).
      Swap = false;
      ShiftElts = (8 - M0) % 8;
    } else if (M0 == 4 || M0 == 3 || M0 == 2 || M0 == 1) {
      // Input vectors need to be swapped if the leading element
      // of the result is one of the 3 left elements of the first vector
      // (or if we're shifting by 4 - thereby simply swapping the vectors).
      Swap = true;
      ShiftElts = (4 - M0) % 4;
    }

    return true;
  } else {                                          // BE
    if (M0 == 0 || M0 == 1 || M0 == 2 || M0 == 3) {
      // Input vectors don't need to be swapped if the leading element
      // of the result is one of the 4 elements of the first vector.
      Swap = false;
      ShiftElts = M0;
    } else if (M0 == 4 || M0 == 5 || M0 == 6 || M0 == 7) {
      // Input vectors need to be swapped if the leading element
      // of the result is one of the 4 elements of the right vector.
      Swap = true;
      ShiftElts = M0 - 4;
    }

    return true;
  }
}
2274
2275bool static isXXBRShuffleMaskHelper(ShuffleVectorSDNode *N, int Width) {
2276 assert(N->getValueType(0) == MVT::v16i8 && "Shuffle vector expects v16i8");
2277
2278 if (!isNByteElemShuffleMask(N, Width, StepLen: -1))
2279 return false;
2280
2281 for (int i = 0; i < 16; i += Width)
2282 if (N->getMaskElt(Idx: i) != i + Width - 1)
2283 return false;
2284
2285 return true;
2286}
2287
/// Return true iff \p N is a byte-reverse-within-each-halfword shuffle
/// (the XXBRH pattern, 2-byte elements).
bool PPC::isXXBRHShuffleMask(ShuffleVectorSDNode *N) {
  return isXXBRShuffleMaskHelper(N, Width: 2);
}
2291
/// Return true iff \p N is a byte-reverse-within-each-word shuffle
/// (the XXBRW pattern, 4-byte elements).
bool PPC::isXXBRWShuffleMask(ShuffleVectorSDNode *N) {
  return isXXBRShuffleMaskHelper(N, Width: 4);
}
2295
/// Return true iff \p N is a byte-reverse-within-each-doubleword shuffle
/// (the XXBRD pattern, 8-byte elements).
bool PPC::isXXBRDShuffleMask(ShuffleVectorSDNode *N) {
  return isXXBRShuffleMaskHelper(N, Width: 8);
}
2299
/// Return true iff \p N is a byte-reverse of the whole quadword
/// (the XXBRQ pattern, one 16-byte element).
bool PPC::isXXBRQShuffleMask(ShuffleVectorSDNode *N) {
  return isXXBRShuffleMaskHelper(N, Width: 16);
}
2303
2304/// Can node \p N be lowered to an XXPERMDI instruction? If so, set \p Swap
2305/// if the inputs to the instruction should be swapped and set \p DM to the
2306/// value for the immediate.
2307/// Specifically, set \p Swap to true only if \p N can be lowered to XXPERMDI
2308/// AND element 0 of the result comes from the first input (LE) or second input
2309/// (BE). Set \p DM to the calculated result (0-3) only if \p N can be lowered.
2310/// \return true iff the given mask of shuffle node \p N is a XXPERMDI shuffle
2311/// mask.
bool PPC::isXXPERMDIShuffleMask(ShuffleVectorSDNode *N, unsigned &DM,
                                bool &Swap, bool IsLE) {
  assert(N->getValueType(0) == MVT::v16i8 && "Shuffle vector expects v16i8");

  // Ensure each byte index of the double word is consecutive.
  if (!isNByteElemShuffleMask(N, Width: 8, StepLen: 1))
    return false;

  // The two selected doublewords, numbered 0-3 across the concatenated
  // inputs (0-1 first input, 2-3 second input).
  unsigned M0 = N->getMaskElt(Idx: 0) / 8;
  unsigned M1 = N->getMaskElt(Idx: 8) / 8;
  assert(((M0 | M1) < 4) && "A mask element out of bounds?");

  // If both vector operands for the shuffle are the same vector, the mask will
  // contain only elements from the first one and the second one will be undef.
  if (N->getOperand(Num: 1).isUndef()) {
    if ((M0 | M1) < 2) {
      // On LE the XXPERMDI immediate counts doublewords from the other end
      // of the register, hence the bit complements.
      DM = IsLE ? (((~M1) & 1) << 1) + ((~M0) & 1) : (M0 << 1) + (M1 & 1);
      Swap = false;
      return true;
    } else
      return false;
  }

  if (IsLE) {
    // Element 0 of the result must come from the first input (doublewords
    // 2-3 in LE numbering correspond to the first operand of XXPERMDI after
    // the swap); otherwise swap the inputs and renumber.
    if (M0 > 1 && M1 < 2) {
      Swap = false;
    } else if (M0 < 2 && M1 > 1) {
      M0 = (M0 + 2) % 4;
      M1 = (M1 + 2) % 4;
      Swap = true;
    } else
      return false;

    // Note: if control flow comes here that means Swap is already set above
    DM = (((~M1) & 1) << 1) + ((~M0) & 1);
    return true;
  } else { // BE
    // Element 0 of the result must come from the first input; otherwise swap
    // the inputs and renumber.
    if (M0 < 2 && M1 > 1) {
      Swap = false;
    } else if (M0 > 1 && M1 < 2) {
      M0 = (M0 + 2) % 4;
      M1 = (M1 + 2) % 4;
      Swap = true;
    } else
      return false;

    // Note: if control flow comes here that means Swap is already set above
    DM = (M0 << 1) + (M1 & 1);
    return true;
  }
}
2363
2364
2365/// getSplatIdxForPPCMnemonics - Return the splat index as a value that is
2366/// appropriate for PPC mnemonics (which have a big endian bias - namely
2367/// elements are counted from the left of the vector register).
2368unsigned PPC::getSplatIdxForPPCMnemonics(SDNode *N, unsigned EltSize,
2369 SelectionDAG &DAG) {
2370 ShuffleVectorSDNode *SVOp = cast<ShuffleVectorSDNode>(Val: N);
2371 assert(isSplatShuffleMask(SVOp, EltSize));
2372 EVT VT = SVOp->getValueType(ResNo: 0);
2373
2374 if (VT == MVT::v2i64 || VT == MVT::v2f64)
2375 return DAG.getDataLayout().isLittleEndian() ? 1 - SVOp->getMaskElt(Idx: 0)
2376 : SVOp->getMaskElt(Idx: 0);
2377
2378 if (DAG.getDataLayout().isLittleEndian())
2379 return (16 / EltSize) - 1 - (SVOp->getMaskElt(Idx: 0) / EltSize);
2380 else
2381 return SVOp->getMaskElt(Idx: 0) / EltSize;
2382}
2383
2384/// get_VSPLTI_elt - If this is a build_vector of constants which can be formed
2385/// by using a vspltis[bhw] instruction of the specified element size, return
2386/// the constant being splatted. The ByteSize field indicates the number of
2387/// bytes of each element [124] -> [bhw].
SDValue PPC::get_VSPLTI_elt(SDNode *N, unsigned ByteSize, SelectionDAG &DAG) {
  SDValue OpVal;

  // If ByteSize of the splat is bigger than the element size of the
  // build_vector, then we have a case where we are checking for a splat where
  // multiple elements of the buildvector are folded together into a single
  // logical element of the splat (e.g. "vsplish 1" to splat {0,1}*8).
  unsigned EltSize = 16/N->getNumOperands();
  if (EltSize < ByteSize) {
    unsigned Multiple = ByteSize/EltSize;   // Number of BV entries per spltval.
    SDValue UniquedVals[4];
    assert(Multiple > 1 && Multiple <= 4 && "How can this happen?");

    // See if all of the elements in the buildvector agree across.
    // Operand i contributes to logical chunk position i % Multiple.
    for (unsigned i = 0, e = N->getNumOperands(); i != e; ++i) {
      if (N->getOperand(Num: i).isUndef()) continue;
      // If the element isn't a constant, bail fully out.
      if (!isa<ConstantSDNode>(Val: N->getOperand(Num: i))) return SDValue();

      if (!UniquedVals[i&(Multiple-1)].getNode())
        UniquedVals[i&(Multiple-1)] = N->getOperand(Num: i);
      else if (UniquedVals[i&(Multiple-1)] != N->getOperand(Num: i))
        return SDValue();  // no match.
    }

    // Okay, if we reached this point, UniquedVals[0..Multiple-1] contains
    // either constant or undef values that are identical for each chunk.  See
    // if these chunks can form into a larger vspltis*.

    // Check to see if all of the leading entries are either 0 or -1.  If
    // neither, then this won't fit into the immediate field.
    bool LeadingZero = true;
    bool LeadingOnes = true;
    for (unsigned i = 0; i != Multiple-1; ++i) {
      if (!UniquedVals[i].getNode()) continue;  // Must have been undefs.

      LeadingZero &= isNullConstant(V: UniquedVals[i]);
      LeadingOnes &= isAllOnesConstant(V: UniquedVals[i]);
    }
    // Finally, check the least significant entry.
    if (LeadingZero) {
      if (!UniquedVals[Multiple-1].getNode())
        return DAG.getTargetConstant(Val: 0, DL: SDLoc(N), VT: MVT::i32); // 0,0,0,undef
      int Val = UniquedVals[Multiple - 1]->getAsZExtVal();
      if (Val < 16)                                   // 0,0,0,4 -> vspltisw(4)
        return DAG.getTargetConstant(Val, DL: SDLoc(N), VT: MVT::i32);
    }
    if (LeadingOnes) {
      if (!UniquedVals[Multiple-1].getNode())
        return DAG.getTargetConstant(Val: ~0U, DL: SDLoc(N), VT: MVT::i32); // -1,-1,-1,undef
      int Val =cast<ConstantSDNode>(Val&: UniquedVals[Multiple-1])->getSExtValue();
      if (Val >= -16)                            // -1,-1,-1,-2 -> vspltisw(-2)
        return DAG.getTargetConstant(Val, DL: SDLoc(N), VT: MVT::i32);
    }

    return SDValue();
  }

  // Check to see if this buildvec has a single non-undef value in its elements.
  for (unsigned i = 0, e = N->getNumOperands(); i != e; ++i) {
    if (N->getOperand(Num: i).isUndef()) continue;
    if (!OpVal.getNode())
      OpVal = N->getOperand(Num: i);
    else if (OpVal != N->getOperand(Num: i))
      return SDValue();
  }

  if (!OpVal.getNode()) return SDValue();  // All UNDEF: use implicit def.

  // Extract the splatted bits as an integer, bitcasting f32 if necessary.
  unsigned ValSizeInBytes = EltSize;
  uint64_t Value = 0;
  if (ConstantSDNode *CN = dyn_cast<ConstantSDNode>(Val&: OpVal)) {
    Value = CN->getZExtValue();
  } else if (ConstantFPSDNode *CN = dyn_cast<ConstantFPSDNode>(Val&: OpVal)) {
    assert(CN->getValueType(0) == MVT::f32 && "Only one legal FP vector type!");
    Value = llvm::bit_cast<uint32_t>(from: CN->getValueAPF().convertToFloat());
  }

  // If the splat value is larger than the element value, then we can never do
  // this splat.  The only case that we could fit the replicated bits into our
  // immediate field for would be zero, and we prefer to use vxor for it.
  if (ValSizeInBytes < ByteSize) return SDValue();

  // If the element value is larger than the splat value, check if it consists
  // of a repeated bit pattern of size ByteSize.
  if (!APInt(ValSizeInBytes * 8, Value).isSplat(SplatSizeInBits: ByteSize * 8))
    return SDValue();

  // Properly sign extend the value.
  int MaskVal = SignExtend32(X: Value, B: ByteSize * 8);

  // If this is zero, don't match, zero matches ISD::isBuildVectorAllZeros.
  if (MaskVal == 0) return SDValue();

  // Finally, if this value fits in a 5 bit sext field, return it
  if (SignExtend32<5>(X: MaskVal) == MaskVal)
    return DAG.getSignedTargetConstant(Val: MaskVal, DL: SDLoc(N), VT: MVT::i32);
  return SDValue();
}
2487
2488//===----------------------------------------------------------------------===//
2489// Addressing Mode Selection
2490//===----------------------------------------------------------------------===//
2491
2492/// isIntS16Immediate - This method tests to see if the node is either a 32-bit
2493/// or 64-bit immediate, and if the value can be accurately represented as a
2494/// sign extension from a 16-bit value. If so, this returns true and the
2495/// immediate.
2496bool llvm::isIntS16Immediate(SDNode *N, int16_t &Imm) {
2497 if (!isa<ConstantSDNode>(Val: N))
2498 return false;
2499
2500 Imm = (int16_t)N->getAsZExtVal();
2501 if (N->getValueType(ResNo: 0) == MVT::i32)
2502 return Imm == (int32_t)N->getAsZExtVal();
2503 else
2504 return Imm == (int64_t)N->getAsZExtVal();
2505}
/// Convenience overload of isIntS16Immediate taking an SDValue.
bool llvm::isIntS16Immediate(SDValue Op, int16_t &Imm) {
  return isIntS16Immediate(N: Op.getNode(), Imm);
}
2509
2510/// Used when computing address flags for selecting loads and stores.
2511/// If we have an OR, check if the LHS and RHS are provably disjoint.
2512/// An OR of two provably disjoint values is equivalent to an ADD.
2513/// Most PPC load/store instructions compute the effective address as a sum,
2514/// so doing this conversion is useful.
2515static bool provablyDisjointOr(SelectionDAG &DAG, const SDValue &N) {
2516 if (N.getOpcode() != ISD::OR)
2517 return false;
2518 KnownBits LHSKnown = DAG.computeKnownBits(Op: N.getOperand(i: 0));
2519 if (!LHSKnown.Zero.getBoolValue())
2520 return false;
2521 KnownBits RHSKnown = DAG.computeKnownBits(Op: N.getOperand(i: 1));
2522 return (~(LHSKnown.Zero | RHSKnown.Zero) == 0);
2523}
2524
2525/// SelectAddressEVXRegReg - Given the specified address, check to see if it can
2526/// be represented as an indexed [r+r] operation.
bool PPCTargetLowering::SelectAddressEVXRegReg(SDValue N, SDValue &Base,
                                               SDValue &Index,
                                               SelectionDAG &DAG) const {
  // SPE f64 load/store can only encode small offsets (the caller in
  // SelectAddressRegReg notes an 8-bit limit), so if any user of this
  // address is an f64 memory operation, force the indexed [r+r] form.
  // NOTE(review): assumes N has at least two operands; callers pass an
  // ISD::ADD (see SelectAddressRegReg) — confirm before reusing elsewhere.
  for (SDNode *U : N->users()) {
    if (MemSDNode *Memop = dyn_cast<MemSDNode>(Val: U)) {
      if (Memop->getMemoryVT() == MVT::f64) {
        Base = N.getOperand(i: 0);
        Index = N.getOperand(i: 1);
        return true;
      }
    }
  }
  return false;
}
2541
2542/// isIntS34Immediate - This method tests if value of node given can be
2543/// accurately represented as a sign extension from a 34-bit value. If so,
2544/// this returns true and the immediate.
2545bool llvm::isIntS34Immediate(SDNode *N, int64_t &Imm) {
2546 if (!isa<ConstantSDNode>(Val: N))
2547 return false;
2548
2549 Imm = cast<ConstantSDNode>(Val: N)->getSExtValue();
2550 return isInt<34>(x: Imm);
2551}
/// Convenience overload of isIntS34Immediate taking an SDValue.
bool llvm::isIntS34Immediate(SDValue Op, int64_t &Imm) {
  return isIntS34Immediate(N: Op.getNode(), Imm);
}
2555
/// SelectAddressRegReg - Given the specified address, check to see if it
2557/// can be represented as an indexed [r+r] operation. Returns false if it
2558/// can be more efficiently represented as [r+imm]. If \p EncodingAlignment is
2559/// non-zero and N can be represented by a base register plus a signed 16-bit
2560/// displacement, make a more precise judgement by checking (displacement % \p
2561/// EncodingAlignment).
2562bool PPCTargetLowering::SelectAddressRegReg(
2563 SDValue N, SDValue &Base, SDValue &Index, SelectionDAG &DAG,
2564 MaybeAlign EncodingAlignment) const {
2565 // If we have a PC Relative target flag don't select as [reg+reg]. It will be
2566 // a [pc+imm].
2567 if (SelectAddressPCRel(N, Base))
2568 return false;
2569
2570 int16_t Imm = 0;
2571 if (N.getOpcode() == ISD::ADD) {
2572 // Is there any SPE load/store (f64), which can't handle 16bit offset?
2573 // SPE load/store can only handle 8-bit offsets.
2574 if (hasSPE() && SelectAddressEVXRegReg(N, Base, Index, DAG))
2575 return true;
2576 if (isIntS16Immediate(Op: N.getOperand(i: 1), Imm) &&
2577 (!EncodingAlignment || isAligned(Lhs: *EncodingAlignment, SizeInBytes: Imm)))
2578 return false; // r+i
2579 if (N.getOperand(i: 1).getOpcode() == PPCISD::Lo)
2580 return false; // r+i
2581
2582 Base = N.getOperand(i: 0);
2583 Index = N.getOperand(i: 1);
2584 return true;
2585 } else if (N.getOpcode() == ISD::OR) {
2586 if (isIntS16Immediate(Op: N.getOperand(i: 1), Imm) &&
2587 (!EncodingAlignment || isAligned(Lhs: *EncodingAlignment, SizeInBytes: Imm)))
2588 return false; // r+i can fold it if we can.
2589
2590 // If this is an or of disjoint bitfields, we can codegen this as an add
2591 // (for better address arithmetic) if the LHS and RHS of the OR are provably
2592 // disjoint.
2593 KnownBits LHSKnown = DAG.computeKnownBits(Op: N.getOperand(i: 0));
2594
2595 if (LHSKnown.Zero.getBoolValue()) {
2596 KnownBits RHSKnown = DAG.computeKnownBits(Op: N.getOperand(i: 1));
2597 // If all of the bits are known zero on the LHS or RHS, the add won't
2598 // carry.
2599 if (~(LHSKnown.Zero | RHSKnown.Zero) == 0) {
2600 Base = N.getOperand(i: 0);
2601 Index = N.getOperand(i: 1);
2602 return true;
2603 }
2604 }
2605 }
2606
2607 return false;
2608}
2609
2610// If we happen to be doing an i64 load or store into a stack slot that has
2611// less than a 4-byte alignment, then the frame-index elimination may need to
2612// use an indexed load or store instruction (because the offset may not be a
2613// multiple of 4). The extra register needed to hold the offset comes from the
2614// register scavenger, and it is possible that the scavenger will need to use
2615// an emergency spill slot. As a result, we need to make sure that a spill slot
2616// is allocated when doing an i64 load/store into a less-than-4-byte-aligned
2617// stack slot.
2618static void fixupFuncForFI(SelectionDAG &DAG, int FrameIdx, EVT VT) {
2619 // FIXME: This does not handle the LWA case.
2620 if (VT != MVT::i64)
2621 return;
2622
2623 // NOTE: We'll exclude negative FIs here, which come from argument
2624 // lowering, because there are no known test cases triggering this problem
2625 // using packed structures (or similar). We can remove this exclusion if
2626 // we find such a test case. The reason why this is so test-case driven is
2627 // because this entire 'fixup' is only to prevent crashes (from the
2628 // register scavenger) on not-really-valid inputs. For example, if we have:
2629 // %a = alloca i1
2630 // %b = bitcast i1* %a to i64*
2631 // store i64* a, i64 b
2632 // then the store should really be marked as 'align 1', but is not. If it
2633 // were marked as 'align 1' then the indexed form would have been
2634 // instruction-selected initially, and the problem this 'fixup' is preventing
2635 // won't happen regardless.
2636 if (FrameIdx < 0)
2637 return;
2638
2639 MachineFunction &MF = DAG.getMachineFunction();
2640 MachineFrameInfo &MFI = MF.getFrameInfo();
2641
2642 if (MFI.getObjectAlign(ObjectIdx: FrameIdx) >= Align(4))
2643 return;
2644
2645 PPCFunctionInfo *FuncInfo = MF.getInfo<PPCFunctionInfo>();
2646 FuncInfo->setHasNonRISpills();
2647}
2648
/// Returns true if the address N can be represented by a base register plus
/// a signed 16-bit displacement [r+imm], and if it is not better
/// represented as reg+reg. If \p EncodingAlignment is non-zero, only accept
/// displacements that are multiples of that value.
///
/// \param N        the address to match.
/// \param Disp     [out] the displacement operand (target constant or Lo part).
/// \param Base     [out] the base-register operand.
/// \param EncodingAlignment  required multiple for the displacement (e.g. 4
///                 for DS-form instructions), or empty for no restriction.
bool PPCTargetLowering::SelectAddressRegImm(
    SDValue N, SDValue &Disp, SDValue &Base, SelectionDAG &DAG,
    MaybeAlign EncodingAlignment) const {
  // FIXME dl should come from parent load or store, not from address
  SDLoc dl(N);

  // If we have a PC Relative target flag don't select as [reg+imm]. It will be
  // a [pc+imm].
  if (SelectAddressPCRel(N, Base))
    return false;

  // If this can be more profitably realized as r+r, fail.
  if (SelectAddressRegReg(N, Base&: Disp, Index&: Base, DAG, EncodingAlignment))
    return false;

  if (N.getOpcode() == ISD::ADD) {
    int16_t imm = 0;
    // ADD of a base and an in-range (and suitably aligned) immediate: fold
    // the immediate into the displacement field.
    if (isIntS16Immediate(Op: N.getOperand(i: 1), Imm&: imm) &&
        (!EncodingAlignment || isAligned(Lhs: *EncodingAlignment, SizeInBytes: imm))) {
      Disp = DAG.getSignedTargetConstant(Val: imm, DL: dl, VT: N.getValueType());
      if (FrameIndexSDNode *FI = dyn_cast<FrameIndexSDNode>(Val: N.getOperand(i: 0))) {
        Base = DAG.getTargetFrameIndex(FI: FI->getIndex(), VT: N.getValueType());
        // i64 access to an under-aligned frame slot may need the register
        // scavenger later; reserve the emergency spill slot now.
        fixupFuncForFI(DAG, FrameIdx: FI->getIndex(), VT: N.getValueType());
      } else {
        Base = N.getOperand(i: 0);
      }
      return true; // [r+i]
    } else if (N.getOperand(i: 1).getOpcode() == PPCISD::Lo) {
      // Match LOAD (ADD (X, Lo(G))).
      assert(!N.getOperand(1).getConstantOperandVal(1) &&
             "Cannot handle constant offsets yet!");
      Disp = N.getOperand(i: 1).getOperand(i: 0); // The global address.
      assert(Disp.getOpcode() == ISD::TargetGlobalAddress ||
             Disp.getOpcode() == ISD::TargetGlobalTLSAddress ||
             Disp.getOpcode() == ISD::TargetConstantPool ||
             Disp.getOpcode() == ISD::TargetJumpTable);
      Base = N.getOperand(i: 0);
      return true; // [&g+r]
    }
  } else if (N.getOpcode() == ISD::OR) {
    int16_t imm = 0;
    if (isIntS16Immediate(Op: N.getOperand(i: 1), Imm&: imm) &&
        (!EncodingAlignment || isAligned(Lhs: *EncodingAlignment, SizeInBytes: imm))) {
      // If this is an or of disjoint bitfields, we can codegen this as an add
      // (for better address arithmetic) if the LHS and RHS of the OR are
      // provably disjoint.
      KnownBits LHSKnown = DAG.computeKnownBits(Op: N.getOperand(i: 0));

      // Every bit set in imm must be known-zero in the LHS; then OR == ADD.
      if ((LHSKnown.Zero.getZExtValue()|~(uint64_t)imm) == ~0ULL) {
        // If all of the bits are known zero on the LHS or RHS, the add won't
        // carry.
        if (FrameIndexSDNode *FI =
              dyn_cast<FrameIndexSDNode>(Val: N.getOperand(i: 0))) {
          Base = DAG.getTargetFrameIndex(FI: FI->getIndex(), VT: N.getValueType());
          fixupFuncForFI(DAG, FrameIdx: FI->getIndex(), VT: N.getValueType());
        } else {
          Base = N.getOperand(i: 0);
        }
        Disp = DAG.getTargetConstant(Val: imm, DL: dl, VT: N.getValueType());
        return true;
      }
    }
  } else if (ConstantSDNode *CN = dyn_cast<ConstantSDNode>(Val&: N)) {
    // Loading from a constant address.

    // If this address fits entirely in a 16-bit sext immediate field, codegen
    // this as "d, 0"
    int16_t Imm;
    if (isIntS16Immediate(N: CN, Imm) &&
        (!EncodingAlignment || isAligned(Lhs: *EncodingAlignment, SizeInBytes: Imm))) {
      Disp = DAG.getTargetConstant(Val: Imm, DL: dl, VT: CN->getValueType(ResNo: 0));
      // R0/X0 in the base slot reads as literal zero in D-form addressing.
      Base = DAG.getRegister(Reg: Subtarget.isPPC64() ? PPC::ZERO8 : PPC::ZERO,
                             VT: CN->getValueType(ResNo: 0));
      return true;
    }

    // Handle 32-bit sext immediates with LIS + addr mode.
    if ((CN->getValueType(ResNo: 0) == MVT::i32 ||
         (int64_t)CN->getZExtValue() == (int)CN->getZExtValue()) &&
        (!EncodingAlignment ||
         isAligned(Lhs: *EncodingAlignment, SizeInBytes: CN->getZExtValue()))) {
      int Addr = (int)CN->getZExtValue();

      // Otherwise, break this down into an LIS + disp.
      Disp = DAG.getTargetConstant(Val: (short)Addr, DL: dl, VT: MVT::i32);

      // High half compensates for the sign-extension of the low 16 bits.
      Base = DAG.getTargetConstant(Val: (Addr - (signed short)Addr) >> 16, DL: dl,
                                   VT: MVT::i32);
      unsigned Opc = CN->getValueType(ResNo: 0) == MVT::i32 ? PPC::LIS : PPC::LIS8;
      Base = SDValue(DAG.getMachineNode(Opcode: Opc, dl, VT: CN->getValueType(ResNo: 0), Op1: Base), 0);
      return true;
    }
  }

  // Fallback: treat the whole value as the base with a zero displacement.
  Disp = DAG.getTargetConstant(Val: 0, DL: dl, VT: getPointerTy(DL: DAG.getDataLayout()));
  if (FrameIndexSDNode *FI = dyn_cast<FrameIndexSDNode>(Val&: N)) {
    Base = DAG.getTargetFrameIndex(FI: FI->getIndex(), VT: N.getValueType());
    fixupFuncForFI(DAG, FrameIdx: FI->getIndex(), VT: N.getValueType());
  } else
    Base = N;
  return true; // [r+0]
}
2755
2756/// Similar to the 16-bit case but for instructions that take a 34-bit
2757/// displacement field (prefixed loads/stores).
2758bool PPCTargetLowering::SelectAddressRegImm34(SDValue N, SDValue &Disp,
2759 SDValue &Base,
2760 SelectionDAG &DAG) const {
2761 // Only on 64-bit targets.
2762 if (N.getValueType() != MVT::i64)
2763 return false;
2764
2765 SDLoc dl(N);
2766 int64_t Imm = 0;
2767
2768 if (N.getOpcode() == ISD::ADD) {
2769 if (!isIntS34Immediate(Op: N.getOperand(i: 1), Imm))
2770 return false;
2771 Disp = DAG.getSignedTargetConstant(Val: Imm, DL: dl, VT: N.getValueType());
2772 if (FrameIndexSDNode *FI = dyn_cast<FrameIndexSDNode>(Val: N.getOperand(i: 0)))
2773 Base = DAG.getTargetFrameIndex(FI: FI->getIndex(), VT: N.getValueType());
2774 else
2775 Base = N.getOperand(i: 0);
2776 return true;
2777 }
2778
2779 if (N.getOpcode() == ISD::OR) {
2780 if (!isIntS34Immediate(Op: N.getOperand(i: 1), Imm))
2781 return false;
2782 // If this is an or of disjoint bitfields, we can codegen this as an add
2783 // (for better address arithmetic) if the LHS and RHS of the OR are
2784 // provably disjoint.
2785 KnownBits LHSKnown = DAG.computeKnownBits(Op: N.getOperand(i: 0));
2786 if ((LHSKnown.Zero.getZExtValue() | ~(uint64_t)Imm) != ~0ULL)
2787 return false;
2788 if (FrameIndexSDNode *FI = dyn_cast<FrameIndexSDNode>(Val: N.getOperand(i: 0)))
2789 Base = DAG.getTargetFrameIndex(FI: FI->getIndex(), VT: N.getValueType());
2790 else
2791 Base = N.getOperand(i: 0);
2792 Disp = DAG.getSignedTargetConstant(Val: Imm, DL: dl, VT: N.getValueType());
2793 return true;
2794 }
2795
2796 if (isIntS34Immediate(Op: N, Imm)) { // If the address is a 34-bit const.
2797 Disp = DAG.getSignedTargetConstant(Val: Imm, DL: dl, VT: N.getValueType());
2798 Base = DAG.getRegister(Reg: PPC::ZERO8, VT: N.getValueType());
2799 return true;
2800 }
2801
2802 return false;
2803}
2804
2805/// SelectAddressRegRegOnly - Given the specified addressed, force it to be
2806/// represented as an indexed [r+r] operation.
2807bool PPCTargetLowering::SelectAddressRegRegOnly(SDValue N, SDValue &Base,
2808 SDValue &Index,
2809 SelectionDAG &DAG) const {
2810 // Check to see if we can easily represent this as an [r+r] address. This
2811 // will fail if it thinks that the address is more profitably represented as
2812 // reg+imm, e.g. where imm = 0.
2813 if (SelectAddressRegReg(N, Base, Index, DAG))
2814 return true;
2815
2816 // If the address is the result of an add, we will utilize the fact that the
2817 // address calculation includes an implicit add. However, we can reduce
2818 // register pressure if we do not materialize a constant just for use as the
2819 // index register. We only get rid of the add if it is not an add of a
2820 // value and a 16-bit signed constant and both have a single use.
2821 int16_t imm = 0;
2822 if (N.getOpcode() == ISD::ADD &&
2823 (!isIntS16Immediate(Op: N.getOperand(i: 1), Imm&: imm) ||
2824 !N.getOperand(i: 1).hasOneUse() || !N.getOperand(i: 0).hasOneUse())) {
2825 Base = N.getOperand(i: 0);
2826 Index = N.getOperand(i: 1);
2827 return true;
2828 }
2829
2830 // Otherwise, do it the hard way, using R0 as the base register.
2831 Base = DAG.getRegister(Reg: Subtarget.isPPC64() ? PPC::ZERO8 : PPC::ZERO,
2832 VT: N.getValueType());
2833 Index = N;
2834 return true;
2835}
2836
2837template <typename Ty> static bool isValidPCRelNode(SDValue N) {
2838 Ty *PCRelCand = dyn_cast<Ty>(N);
2839 return PCRelCand && (PPCInstrInfo::hasPCRelFlag(TF: PCRelCand->getTargetFlags()));
2840}
2841
2842/// Returns true if this address is a PC Relative address.
2843/// PC Relative addresses are marked with the flag PPCII::MO_PCREL_FLAG
2844/// or if the node opcode is PPCISD::MAT_PCREL_ADDR.
2845bool PPCTargetLowering::SelectAddressPCRel(SDValue N, SDValue &Base) const {
2846 // This is a materialize PC Relative node. Always select this as PC Relative.
2847 Base = N;
2848 if (N.getOpcode() == PPCISD::MAT_PCREL_ADDR)
2849 return true;
2850 if (isValidPCRelNode<ConstantPoolSDNode>(N) ||
2851 isValidPCRelNode<GlobalAddressSDNode>(N) ||
2852 isValidPCRelNode<JumpTableSDNode>(N) ||
2853 isValidPCRelNode<BlockAddressSDNode>(N))
2854 return true;
2855 return false;
2856}
2857
2858/// Returns true if we should use a direct load into vector instruction
2859/// (such as lxsd or lfd), instead of a load into gpr + direct move sequence.
2860static bool usePartialVectorLoads(SDNode *N, const PPCSubtarget& ST) {
2861
2862 // If there are any other uses other than scalar to vector, then we should
2863 // keep it as a scalar load -> direct move pattern to prevent multiple
2864 // loads.
2865 LoadSDNode *LD = dyn_cast<LoadSDNode>(Val: N);
2866 if (!LD)
2867 return false;
2868
2869 EVT MemVT = LD->getMemoryVT();
2870 if (!MemVT.isSimple())
2871 return false;
2872 switch(MemVT.getSimpleVT().SimpleTy) {
2873 case MVT::i64:
2874 break;
2875 case MVT::i32:
2876 if (!ST.hasP8Vector())
2877 return false;
2878 break;
2879 case MVT::i16:
2880 case MVT::i8:
2881 if (!ST.hasP9Vector())
2882 return false;
2883 break;
2884 default:
2885 return false;
2886 }
2887
2888 SDValue LoadedVal(N, 0);
2889 if (!LoadedVal.hasOneUse())
2890 return false;
2891
2892 for (SDUse &Use : LD->uses())
2893 if (Use.getResNo() == 0 &&
2894 Use.getUser()->getOpcode() != ISD::SCALAR_TO_VECTOR &&
2895 Use.getUser()->getOpcode() != PPCISD::SCALAR_TO_VECTOR_PERMUTED)
2896 return false;
2897
2898 return true;
2899}
2900
/// getPreIndexedAddressParts - returns true by value, base pointer and
/// offset pointer and addressing mode by reference if the node's address
/// can be legally represented as pre-indexed load / store address.
///
/// \param N   the memory node (load or store) being considered.
/// \param Base [out] base-pointer operand of the pre-inc form.
/// \param Offset [out] offset operand (register or immediate).
/// \param AM [out] set to ISD::PRE_INC on success.
bool PPCTargetLowering::getPreIndexedAddressParts(SDNode *N, SDValue &Base,
                                                  SDValue &Offset,
                                                  ISD::MemIndexedMode &AM,
                                                  SelectionDAG &DAG) const {
  // Command-line escape hatch to disable pre-increment forms entirely.
  if (DisablePPCPreinc) return false;

  bool isLoad = true;
  SDValue Ptr;
  EVT VT;
  Align Alignment;
  // Extract pointer, memory type and alignment from either memory-node kind.
  if (LoadSDNode *LD = dyn_cast<LoadSDNode>(Val: N)) {
    Ptr = LD->getBasePtr();
    VT = LD->getMemoryVT();
    Alignment = LD->getAlign();
  } else if (StoreSDNode *ST = dyn_cast<StoreSDNode>(Val: N)) {
    Ptr = ST->getBasePtr();
    VT = ST->getMemoryVT();
    Alignment = ST->getAlign();
    isLoad = false;
  } else
    return false;

  // Do not generate pre-inc forms for specific loads that feed scalar_to_vector
  // instructions because we can fold these into a more efficient instruction
  // instead, (such as LXSD).
  if (isLoad && usePartialVectorLoads(N, ST: Subtarget)) {
    return false;
  }

  // PowerPC doesn't have preinc load/store instructions for vectors
  if (VT.isVector())
    return false;

  if (SelectAddressRegReg(N: Ptr, Base, Index&: Offset, DAG)) {
    // Common code will reject creating a pre-inc form if the base pointer
    // is a frame index, or if N is a store and the base pointer is either
    // the same as or a predecessor of the value being stored. Check for
    // those situations here, and try with swapped Base/Offset instead.
    bool Swap = false;

    if (isa<FrameIndexSDNode>(Val: Base) || isa<RegisterSDNode>(Val: Base))
      Swap = true;
    else if (!isLoad) {
      SDValue Val = cast<StoreSDNode>(Val: N)->getValue();
      if (Val == Base || Base.getNode()->isPredecessorOf(N: Val.getNode()))
        Swap = true;
    }

    if (Swap)
      std::swap(a&: Base, b&: Offset);

    AM = ISD::PRE_INC;
    return true;
  }

  // LDU/STU can only handle immediates that are a multiple of 4.
  if (VT != MVT::i64) {
    if (!SelectAddressRegImm(N: Ptr, Disp&: Offset, Base, DAG, EncodingAlignment: std::nullopt))
      return false;
  } else {
    // LDU/STU need an address with at least 4-byte alignment.
    if (Alignment < Align(4))
      return false;

    if (!SelectAddressRegImm(N: Ptr, Disp&: Offset, Base, DAG, EncodingAlignment: Align(4)))
      return false;
  }

  if (LoadSDNode *LD = dyn_cast<LoadSDNode>(Val: N)) {
    // PPC64 doesn't have lwau, but it does have lwaux. Reject preinc load of
    // sext i32 to i64 when addr mode is r+i.
    if (LD->getValueType(ResNo: 0) == MVT::i64 && LD->getMemoryVT() == MVT::i32 &&
        LD->getExtensionType() == ISD::SEXTLOAD &&
        isa<ConstantSDNode>(Val: Offset))
      return false;
  }

  AM = ISD::PRE_INC;
  return true;
}
2984
2985//===----------------------------------------------------------------------===//
2986// LowerOperation implementation
2987//===----------------------------------------------------------------------===//
2988
2989/// Return true if we should reference labels using a PICBase, set the HiOpFlags
2990/// and LoOpFlags to the target MO flags.
2991static void getLabelAccessInfo(bool IsPIC, const PPCSubtarget &Subtarget,
2992 unsigned &HiOpFlags, unsigned &LoOpFlags,
2993 const GlobalValue *GV = nullptr) {
2994 HiOpFlags = PPCII::MO_HA;
2995 LoOpFlags = PPCII::MO_LO;
2996
2997 // Don't use the pic base if not in PIC relocation model.
2998 if (IsPIC) {
2999 HiOpFlags = PPCII::MO_PIC_HA_FLAG;
3000 LoOpFlags = PPCII::MO_PIC_LO_FLAG;
3001 }
3002}
3003
3004static SDValue LowerLabelRef(SDValue HiPart, SDValue LoPart, bool isPIC,
3005 SelectionDAG &DAG) {
3006 SDLoc DL(HiPart);
3007 EVT PtrVT = HiPart.getValueType();
3008 SDValue Zero = DAG.getConstant(Val: 0, DL, VT: PtrVT);
3009
3010 SDValue Hi = DAG.getNode(Opcode: PPCISD::Hi, DL, VT: PtrVT, N1: HiPart, N2: Zero);
3011 SDValue Lo = DAG.getNode(Opcode: PPCISD::Lo, DL, VT: PtrVT, N1: LoPart, N2: Zero);
3012
3013 // With PIC, the first instruction is actually "GR+hi(&G)".
3014 if (isPIC)
3015 Hi = DAG.getNode(Opcode: ISD::ADD, DL, VT: PtrVT,
3016 N1: DAG.getNode(Opcode: PPCISD::GlobalBaseReg, DL, VT: PtrVT), N2: Hi);
3017
3018 // Generate non-pic code that has direct accesses to the constant pool.
3019 // The address of the global is just (hi(&g)+lo(&g)).
3020 return DAG.getNode(Opcode: ISD::ADD, DL, VT: PtrVT, N1: Hi, N2: Lo);
3021}
3022
3023static void setUsesTOCBasePtr(MachineFunction &MF) {
3024 PPCFunctionInfo *FuncInfo = MF.getInfo<PPCFunctionInfo>();
3025 FuncInfo->setUsesTOCBasePtr();
3026}
3027
3028static void setUsesTOCBasePtr(SelectionDAG &DAG) {
3029 setUsesTOCBasePtr(DAG.getMachineFunction());
3030}
3031
3032SDValue PPCTargetLowering::getTOCEntry(SelectionDAG &DAG, const SDLoc &dl,
3033 SDValue GA) const {
3034 EVT VT = Subtarget.getScalarIntVT();
3035 SDValue Reg = Subtarget.isPPC64() ? DAG.getRegister(Reg: PPC::X2, VT)
3036 : Subtarget.isAIXABI()
3037 ? DAG.getRegister(Reg: PPC::R2, VT)
3038 : DAG.getNode(Opcode: PPCISD::GlobalBaseReg, DL: dl, VT);
3039 SDValue Ops[] = { GA, Reg };
3040 return DAG.getMemIntrinsicNode(
3041 Opcode: PPCISD::TOC_ENTRY, dl, VTList: DAG.getVTList(VT1: VT, VT2: MVT::Other), Ops, MemVT: VT,
3042 PtrInfo: MachinePointerInfo::getGOT(MF&: DAG.getMachineFunction()), Alignment: std::nullopt,
3043 Flags: MachineMemOperand::MOLoad);
3044}
3045
3046SDValue PPCTargetLowering::LowerConstantPool(SDValue Op,
3047 SelectionDAG &DAG) const {
3048 EVT PtrVT = Op.getValueType();
3049 ConstantPoolSDNode *CP = cast<ConstantPoolSDNode>(Val&: Op);
3050 const Constant *C = CP->getConstVal();
3051
3052 // 64-bit SVR4 ABI and AIX ABI code are always position-independent.
3053 // The actual address of the GlobalValue is stored in the TOC.
3054 if (Subtarget.is64BitELFABI() || Subtarget.isAIXABI()) {
3055 if (Subtarget.isUsingPCRelativeCalls()) {
3056 SDLoc DL(CP);
3057 EVT Ty = getPointerTy(DL: DAG.getDataLayout());
3058 SDValue ConstPool = DAG.getTargetConstantPool(
3059 C, VT: Ty, Align: CP->getAlign(), Offset: CP->getOffset(), TargetFlags: PPCII::MO_PCREL_FLAG);
3060 return DAG.getNode(Opcode: PPCISD::MAT_PCREL_ADDR, DL, VT: Ty, Operand: ConstPool);
3061 }
3062 setUsesTOCBasePtr(DAG);
3063 SDValue GA = DAG.getTargetConstantPool(C, VT: PtrVT, Align: CP->getAlign(), Offset: 0);
3064 return getTOCEntry(DAG, dl: SDLoc(CP), GA);
3065 }
3066
3067 unsigned MOHiFlag, MOLoFlag;
3068 bool IsPIC = isPositionIndependent();
3069 getLabelAccessInfo(IsPIC, Subtarget, HiOpFlags&: MOHiFlag, LoOpFlags&: MOLoFlag);
3070
3071 if (IsPIC && Subtarget.isSVR4ABI()) {
3072 SDValue GA =
3073 DAG.getTargetConstantPool(C, VT: PtrVT, Align: CP->getAlign(), Offset: PPCII::MO_PIC_FLAG);
3074 return getTOCEntry(DAG, dl: SDLoc(CP), GA);
3075 }
3076
3077 SDValue CPIHi =
3078 DAG.getTargetConstantPool(C, VT: PtrVT, Align: CP->getAlign(), Offset: 0, TargetFlags: MOHiFlag);
3079 SDValue CPILo =
3080 DAG.getTargetConstantPool(C, VT: PtrVT, Align: CP->getAlign(), Offset: 0, TargetFlags: MOLoFlag);
3081 return LowerLabelRef(HiPart: CPIHi, LoPart: CPILo, isPIC: IsPIC, DAG);
3082}
3083
3084// For 64-bit PowerPC, prefer the more compact relative encodings.
3085// This trades 32 bits per jump table entry for one or two instructions
3086// on the jump site.
3087unsigned PPCTargetLowering::getJumpTableEncoding() const {
3088 if (isJumpTableRelative())
3089 return MachineJumpTableInfo::EK_LabelDifference32;
3090
3091 return TargetLowering::getJumpTableEncoding();
3092}
3093
3094bool PPCTargetLowering::isJumpTableRelative() const {
3095 if (UseAbsoluteJumpTables)
3096 return false;
3097 if (Subtarget.isPPC64() || Subtarget.isAIXABI())
3098 return true;
3099 return TargetLowering::isJumpTableRelative();
3100}
3101
3102SDValue PPCTargetLowering::getPICJumpTableRelocBase(SDValue Table,
3103 SelectionDAG &DAG) const {
3104 if (!Subtarget.isPPC64() || Subtarget.isAIXABI())
3105 return TargetLowering::getPICJumpTableRelocBase(Table, DAG);
3106
3107 switch (getTargetMachine().getCodeModel()) {
3108 case CodeModel::Small:
3109 case CodeModel::Medium:
3110 return TargetLowering::getPICJumpTableRelocBase(Table, DAG);
3111 default:
3112 return DAG.getNode(Opcode: PPCISD::GlobalBaseReg, DL: SDLoc(),
3113 VT: getPointerTy(DL: DAG.getDataLayout()));
3114 }
3115}
3116
3117const MCExpr *
3118PPCTargetLowering::getPICJumpTableRelocBaseExpr(const MachineFunction *MF,
3119 unsigned JTI,
3120 MCContext &Ctx) const {
3121 if (!Subtarget.isPPC64() || Subtarget.isAIXABI())
3122 return TargetLowering::getPICJumpTableRelocBaseExpr(MF, JTI, Ctx);
3123
3124 switch (getTargetMachine().getCodeModel()) {
3125 case CodeModel::Small:
3126 case CodeModel::Medium:
3127 return TargetLowering::getPICJumpTableRelocBaseExpr(MF, JTI, Ctx);
3128 default:
3129 return MCSymbolRefExpr::create(Symbol: MF->getPICBaseSymbol(), Ctx);
3130 }
3131}
3132
3133SDValue PPCTargetLowering::LowerJumpTable(SDValue Op, SelectionDAG &DAG) const {
3134 EVT PtrVT = Op.getValueType();
3135 JumpTableSDNode *JT = cast<JumpTableSDNode>(Val&: Op);
3136
3137 // isUsingPCRelativeCalls() returns true when PCRelative is enabled
3138 if (Subtarget.isUsingPCRelativeCalls()) {
3139 SDLoc DL(JT);
3140 EVT Ty = getPointerTy(DL: DAG.getDataLayout());
3141 SDValue GA =
3142 DAG.getTargetJumpTable(JTI: JT->getIndex(), VT: Ty, TargetFlags: PPCII::MO_PCREL_FLAG);
3143 SDValue MatAddr = DAG.getNode(Opcode: PPCISD::MAT_PCREL_ADDR, DL, VT: Ty, Operand: GA);
3144 return MatAddr;
3145 }
3146
3147 // 64-bit SVR4 ABI and AIX ABI code are always position-independent.
3148 // The actual address of the GlobalValue is stored in the TOC.
3149 if (Subtarget.is64BitELFABI() || Subtarget.isAIXABI()) {
3150 setUsesTOCBasePtr(DAG);
3151 SDValue GA = DAG.getTargetJumpTable(JTI: JT->getIndex(), VT: PtrVT);
3152 return getTOCEntry(DAG, dl: SDLoc(JT), GA);
3153 }
3154
3155 unsigned MOHiFlag, MOLoFlag;
3156 bool IsPIC = isPositionIndependent();
3157 getLabelAccessInfo(IsPIC, Subtarget, HiOpFlags&: MOHiFlag, LoOpFlags&: MOLoFlag);
3158
3159 if (IsPIC && Subtarget.isSVR4ABI()) {
3160 SDValue GA = DAG.getTargetJumpTable(JTI: JT->getIndex(), VT: PtrVT,
3161 TargetFlags: PPCII::MO_PIC_FLAG);
3162 return getTOCEntry(DAG, dl: SDLoc(GA), GA);
3163 }
3164
3165 SDValue JTIHi = DAG.getTargetJumpTable(JTI: JT->getIndex(), VT: PtrVT, TargetFlags: MOHiFlag);
3166 SDValue JTILo = DAG.getTargetJumpTable(JTI: JT->getIndex(), VT: PtrVT, TargetFlags: MOLoFlag);
3167 return LowerLabelRef(HiPart: JTIHi, LoPart: JTILo, isPIC: IsPIC, DAG);
3168}
3169
3170SDValue PPCTargetLowering::LowerBlockAddress(SDValue Op,
3171 SelectionDAG &DAG) const {
3172 EVT PtrVT = Op.getValueType();
3173 BlockAddressSDNode *BASDN = cast<BlockAddressSDNode>(Val&: Op);
3174 const BlockAddress *BA = BASDN->getBlockAddress();
3175
3176 // isUsingPCRelativeCalls() returns true when PCRelative is enabled
3177 if (Subtarget.isUsingPCRelativeCalls()) {
3178 SDLoc DL(BASDN);
3179 EVT Ty = getPointerTy(DL: DAG.getDataLayout());
3180 SDValue GA = DAG.getTargetBlockAddress(BA, VT: Ty, Offset: BASDN->getOffset(),
3181 TargetFlags: PPCII::MO_PCREL_FLAG);
3182 SDValue MatAddr = DAG.getNode(Opcode: PPCISD::MAT_PCREL_ADDR, DL, VT: Ty, Operand: GA);
3183 return MatAddr;
3184 }
3185
3186 // 64-bit SVR4 ABI and AIX ABI code are always position-independent.
3187 // The actual BlockAddress is stored in the TOC.
3188 if (Subtarget.is64BitELFABI() || Subtarget.isAIXABI()) {
3189 setUsesTOCBasePtr(DAG);
3190 SDValue GA = DAG.getTargetBlockAddress(BA, VT: PtrVT, Offset: BASDN->getOffset());
3191 return getTOCEntry(DAG, dl: SDLoc(BASDN), GA);
3192 }
3193
3194 // 32-bit position-independent ELF stores the BlockAddress in the .got.
3195 if (Subtarget.is32BitELFABI() && isPositionIndependent())
3196 return getTOCEntry(
3197 DAG, dl: SDLoc(BASDN),
3198 GA: DAG.getTargetBlockAddress(BA, VT: PtrVT, Offset: BASDN->getOffset()));
3199
3200 unsigned MOHiFlag, MOLoFlag;
3201 bool IsPIC = isPositionIndependent();
3202 getLabelAccessInfo(IsPIC, Subtarget, HiOpFlags&: MOHiFlag, LoOpFlags&: MOLoFlag);
3203 SDValue TgtBAHi = DAG.getTargetBlockAddress(BA, VT: PtrVT, Offset: 0, TargetFlags: MOHiFlag);
3204 SDValue TgtBALo = DAG.getTargetBlockAddress(BA, VT: PtrVT, Offset: 0, TargetFlags: MOLoFlag);
3205 return LowerLabelRef(HiPart: TgtBAHi, LoPart: TgtBALo, isPIC: IsPIC, DAG);
3206}
3207
3208SDValue PPCTargetLowering::LowerGlobalTLSAddress(SDValue Op,
3209 SelectionDAG &DAG) const {
3210 if (Subtarget.isAIXABI())
3211 return LowerGlobalTLSAddressAIX(Op, DAG);
3212
3213 return LowerGlobalTLSAddressLinux(Op, DAG);
3214}
3215
/// updateForAIXShLibTLSModelOpt - Helper to initialize TLS model opt settings,
/// and then apply the update.
///
/// \param Model [in,out] the TLS model for the variable being lowered; may be
///        downgraded to InitialExec when the function references few enough
///        local-dynamic TLS variables.
/// \param TM used to query the per-variable TLS model.
static void updateForAIXShLibTLSModelOpt(TLSModel::Model &Model,
                                         SelectionDAG &DAG,
                                         const TargetMachine &TM) {
  // Initialize TLS model opt setting lazily:
  // (1) Use initial-exec for single TLS var references within current function.
  // (2) Use local-dynamic for multiple TLS var references within current
  // function.
  PPCFunctionInfo *FuncInfo =
      DAG.getMachineFunction().getInfo<PPCFunctionInfo>();
  if (!FuncInfo->isAIXFuncTLSModelOptInitDone()) {
    SmallPtrSet<const GlobalValue *, 8> TLSGV;
    // Iterate over all instructions within current function, collect all TLS
    // global variables (global variables taken as the first parameter to
    // Intrinsic::threadlocal_address).
    const Function &Func = DAG.getMachineFunction().getFunction();
    for (const BasicBlock &BB : Func)
      for (const Instruction &I : BB)
        if (I.getOpcode() == Instruction::Call)
          if (const CallInst *CI = dyn_cast<const CallInst>(Val: &I))
            if (Function *CF = CI->getCalledFunction())
              if (CF->isDeclaration() &&
                  CF->getIntrinsicID() == Intrinsic::threadlocal_address)
                if (const GlobalValue *GV =
                        dyn_cast<GlobalValue>(Val: I.getOperand(i: 0))) {
                  TLSModel::Model GVModel = TM.getTLSModel(GV);
                  // Only local-dynamic variables participate in the count.
                  if (GVModel == TLSModel::LocalDynamic)
                    TLSGV.insert(Ptr: GV);
                }

    unsigned TLSGVCnt = TLSGV.size();
    LLVM_DEBUG(dbgs() << format("LocalDynamic TLSGV count:%d\n", TLSGVCnt));
    // At or below the command-line threshold, IE is considered cheaper than LD
    // for this function; record that decision once.
    if (TLSGVCnt <= PPCAIXTLSModelOptUseIEForLDLimit)
      FuncInfo->setAIXFuncUseTLSIEForLD();
    FuncInfo->setAIXFuncTLSModelOptInitDone();
  }

  // Apply the cached per-function decision to this lowering request.
  if (FuncInfo->isAIXFuncUseTLSIEForLD()) {
    LLVM_DEBUG(
        dbgs() << DAG.getMachineFunction().getName()
               << " function is using the TLS-IE model for TLS-LD access.\n");
    Model = TLSModel::InitialExec;
  }
}
3261
3262SDValue PPCTargetLowering::LowerGlobalTLSAddressAIX(SDValue Op,
3263 SelectionDAG &DAG) const {
3264 GlobalAddressSDNode *GA = cast<GlobalAddressSDNode>(Val&: Op);
3265
3266 if (DAG.getTarget().useEmulatedTLS())
3267 report_fatal_error(reason: "Emulated TLS is not yet supported on AIX");
3268
3269 SDLoc dl(GA);
3270 const GlobalValue *GV = GA->getGlobal();
3271 EVT PtrVT = getPointerTy(DL: DAG.getDataLayout());
3272 bool Is64Bit = Subtarget.isPPC64();
3273 TLSModel::Model Model = getTargetMachine().getTLSModel(GV);
3274
3275 // Apply update to the TLS model.
3276 if (Subtarget.hasAIXShLibTLSModelOpt())
3277 updateForAIXShLibTLSModelOpt(Model, DAG, TM: getTargetMachine());
3278
3279 // TLS variables are accessed through TOC entries.
3280 // To support this, set the DAG to use the TOC base pointer.
3281 setUsesTOCBasePtr(DAG);
3282
3283 bool IsTLSLocalExecModel = Model == TLSModel::LocalExec;
3284
3285 if (IsTLSLocalExecModel || Model == TLSModel::InitialExec) {
3286 bool HasAIXSmallLocalExecTLS = Subtarget.hasAIXSmallLocalExecTLS();
3287 bool HasAIXSmallTLSGlobalAttr = false;
3288 SDValue VariableOffsetTGA =
3289 DAG.getTargetGlobalAddress(GV, DL: dl, VT: PtrVT, offset: 0, TargetFlags: PPCII::MO_TPREL_FLAG);
3290 SDValue VariableOffset = getTOCEntry(DAG, dl, GA: VariableOffsetTGA);
3291 SDValue TLSReg;
3292
3293 if (const GlobalVariable *GVar = dyn_cast<GlobalVariable>(Val: GV))
3294 if (GVar->hasAttribute(Kind: "aix-small-tls"))
3295 HasAIXSmallTLSGlobalAttr = true;
3296
3297 if (Is64Bit) {
3298 // For local-exec and initial-exec on AIX (64-bit), the sequence generated
3299 // involves a load of the variable offset (from the TOC), followed by an
3300 // add of the loaded variable offset to R13 (the thread pointer).
3301 // This code sequence looks like:
3302 // ld reg1,var[TC](2)
3303 // add reg2, reg1, r13 // r13 contains the thread pointer
3304 TLSReg = DAG.getRegister(Reg: PPC::X13, VT: MVT::i64);
3305
3306 // With the -maix-small-local-exec-tls option, or with the "aix-small-tls"
3307 // global variable attribute, produce a faster access sequence for
3308 // local-exec TLS variables where the offset from the TLS base is encoded
3309 // as an immediate operand.
3310 //
3311 // We only utilize the faster local-exec access sequence when the TLS
3312 // variable has a size within the policy limit. We treat types that are
3313 // not sized or are empty as being over the policy size limit.
3314 if ((HasAIXSmallLocalExecTLS || HasAIXSmallTLSGlobalAttr) &&
3315 IsTLSLocalExecModel) {
3316 Type *GVType = GV->getValueType();
3317 if (GVType->isSized() && !GVType->isEmptyTy() &&
3318 GV->getDataLayout().getTypeAllocSize(Ty: GVType) <=
3319 AIXSmallTlsPolicySizeLimit)
3320 return DAG.getNode(Opcode: PPCISD::Lo, DL: dl, VT: PtrVT, N1: VariableOffsetTGA, N2: TLSReg);
3321 }
3322 } else {
3323 // For local-exec and initial-exec on AIX (32-bit), the sequence generated
3324 // involves loading the variable offset from the TOC, generating a call to
3325 // .__get_tpointer to get the thread pointer (which will be in R3), and
3326 // adding the two together:
3327 // lwz reg1,var[TC](2)
3328 // bla .__get_tpointer
3329 // add reg2, reg1, r3
3330 TLSReg = DAG.getNode(Opcode: PPCISD::GET_TPOINTER, DL: dl, VT: PtrVT);
3331
3332 // We do not implement the 32-bit version of the faster access sequence
3333 // for local-exec that is controlled by the -maix-small-local-exec-tls
3334 // option, or the "aix-small-tls" global variable attribute.
3335 if (HasAIXSmallLocalExecTLS || HasAIXSmallTLSGlobalAttr)
3336 report_fatal_error(reason: "The small-local-exec TLS access sequence is "
3337 "currently only supported on AIX (64-bit mode).");
3338 }
3339 return DAG.getNode(Opcode: PPCISD::ADD_TLS, DL: dl, VT: PtrVT, N1: TLSReg, N2: VariableOffset);
3340 }
3341
3342 if (Model == TLSModel::LocalDynamic) {
3343 bool HasAIXSmallLocalDynamicTLS = Subtarget.hasAIXSmallLocalDynamicTLS();
3344
3345 // We do not implement the 32-bit version of the faster access sequence
3346 // for local-dynamic that is controlled by -maix-small-local-dynamic-tls.
3347 if (!Is64Bit && HasAIXSmallLocalDynamicTLS)
3348 report_fatal_error(reason: "The small-local-dynamic TLS access sequence is "
3349 "currently only supported on AIX (64-bit mode).");
3350
3351 // For local-dynamic on AIX, we need to generate one TOC entry for each
3352 // variable offset, and a single module-handle TOC entry for the entire
3353 // file.
3354
3355 SDValue VariableOffsetTGA =
3356 DAG.getTargetGlobalAddress(GV, DL: dl, VT: PtrVT, offset: 0, TargetFlags: PPCII::MO_TLSLD_FLAG);
3357 SDValue VariableOffset = getTOCEntry(DAG, dl, GA: VariableOffsetTGA);
3358
3359 Module *M = DAG.getMachineFunction().getFunction().getParent();
3360 GlobalVariable *TLSGV =
3361 dyn_cast_or_null<GlobalVariable>(Val: M->getOrInsertGlobal(
3362 Name: StringRef("_$TLSML"), Ty: PointerType::getUnqual(C&: *DAG.getContext())));
3363 TLSGV->setThreadLocalMode(GlobalVariable::LocalDynamicTLSModel);
3364 assert(TLSGV && "Not able to create GV for _$TLSML.");
3365 SDValue ModuleHandleTGA =
3366 DAG.getTargetGlobalAddress(GV: TLSGV, DL: dl, VT: PtrVT, offset: 0, TargetFlags: PPCII::MO_TLSLDM_FLAG);
3367 SDValue ModuleHandleTOC = getTOCEntry(DAG, dl, GA: ModuleHandleTGA);
3368 SDValue ModuleHandle =
3369 DAG.getNode(Opcode: PPCISD::TLSLD_AIX, DL: dl, VT: PtrVT, Operand: ModuleHandleTOC);
3370
3371 // With the -maix-small-local-dynamic-tls option, produce a faster access
3372 // sequence for local-dynamic TLS variables where the offset from the
3373 // module-handle is encoded as an immediate operand.
3374 //
3375 // We only utilize the faster local-dynamic access sequence when the TLS
3376 // variable has a size within the policy limit. We treat types that are
3377 // not sized or are empty as being over the policy size limit.
3378 if (HasAIXSmallLocalDynamicTLS) {
3379 Type *GVType = GV->getValueType();
3380 if (GVType->isSized() && !GVType->isEmptyTy() &&
3381 GV->getDataLayout().getTypeAllocSize(Ty: GVType) <=
3382 AIXSmallTlsPolicySizeLimit)
3383 return DAG.getNode(Opcode: PPCISD::Lo, DL: dl, VT: PtrVT, N1: VariableOffsetTGA,
3384 N2: ModuleHandle);
3385 }
3386
3387 return DAG.getNode(Opcode: ISD::ADD, DL: dl, VT: PtrVT, N1: ModuleHandle, N2: VariableOffset);
3388 }
3389
3390 // If Local- or Initial-exec or Local-dynamic is not possible or specified,
3391 // all GlobalTLSAddress nodes are lowered using the general-dynamic model. We
3392 // need to generate two TOC entries, one for the variable offset, one for the
3393 // region handle. The global address for the TOC entry of the region handle is
3394 // created with the MO_TLSGDM_FLAG flag and the global address for the TOC
3395 // entry of the variable offset is created with MO_TLSGD_FLAG.
3396 SDValue VariableOffsetTGA =
3397 DAG.getTargetGlobalAddress(GV, DL: dl, VT: PtrVT, offset: 0, TargetFlags: PPCII::MO_TLSGD_FLAG);
3398 SDValue RegionHandleTGA =
3399 DAG.getTargetGlobalAddress(GV, DL: dl, VT: PtrVT, offset: 0, TargetFlags: PPCII::MO_TLSGDM_FLAG);
3400 SDValue VariableOffset = getTOCEntry(DAG, dl, GA: VariableOffsetTGA);
3401 SDValue RegionHandle = getTOCEntry(DAG, dl, GA: RegionHandleTGA);
3402 return DAG.getNode(Opcode: PPCISD::TLSGD_AIX, DL: dl, VT: PtrVT, N1: VariableOffset,
3403 N2: RegionHandle);
3404}
3405
/// Lower a GlobalTLSAddress node for Linux/ELF targets, selecting the code
/// sequence from the TLS model chosen for the variable (local-exec,
/// initial-exec, general-dynamic or local-dynamic). Emulated TLS is
/// delegated to the target-independent helper.
SDValue PPCTargetLowering::LowerGlobalTLSAddressLinux(SDValue Op,
                                                      SelectionDAG &DAG) const {
  // FIXME: TLS addresses currently use medium model code sequences,
  // which is the most useful form. Eventually support for small and
  // large models could be added if users need it, at the cost of
  // additional complexity.
  GlobalAddressSDNode *GA = cast<GlobalAddressSDNode>(Val&: Op);
  if (DAG.getTarget().useEmulatedTLS())
    return LowerToTLSEmulatedModel(GA, DAG);

  SDLoc dl(GA);
  const GlobalValue *GV = GA->getGlobal();
  EVT PtrVT = getPointerTy(DL: DAG.getDataLayout());
  bool is64bit = Subtarget.isPPC64();
  const Module *M = DAG.getMachineFunction().getFunction().getParent();
  PICLevel::Level picLevel = M->getPICLevel();

  const TargetMachine &TM = getTargetMachine();
  TLSModel::Model Model = TM.getTLSModel(GV);

  if (Model == TLSModel::LocalExec) {
    // Local-exec: add a link-time-known @tprel offset to the thread register
    // (X13 on 64-bit, R2 on 32-bit).
    if (Subtarget.isUsingPCRelativeCalls()) {
      SDValue TLSReg = DAG.getRegister(Reg: PPC::X13, VT: MVT::i64);
      SDValue TGA = DAG.getTargetGlobalAddress(GV, DL: dl, VT: PtrVT, offset: 0,
                                               TargetFlags: PPCII::MO_TPREL_PCREL_FLAG);
      SDValue MatAddr =
          DAG.getNode(Opcode: PPCISD::TLS_LOCAL_EXEC_MAT_ADDR, DL: dl, VT: PtrVT, Operand: TGA);
      return DAG.getNode(Opcode: PPCISD::ADD_TLS, DL: dl, VT: PtrVT, N1: TLSReg, N2: MatAddr);
    }

    // Classic (non-PC-relative) sequence: materialize the offset as a
    // high/low pair relative to the thread register.
    SDValue TGAHi = DAG.getTargetGlobalAddress(GV, DL: dl, VT: PtrVT, offset: 0,
                                               TargetFlags: PPCII::MO_TPREL_HA);
    SDValue TGALo = DAG.getTargetGlobalAddress(GV, DL: dl, VT: PtrVT, offset: 0,
                                               TargetFlags: PPCII::MO_TPREL_LO);
    SDValue TLSReg = is64bit ? DAG.getRegister(Reg: PPC::X13, VT: MVT::i64)
                             : DAG.getRegister(Reg: PPC::R2, VT: MVT::i32);

    SDValue Hi = DAG.getNode(Opcode: PPCISD::Hi, DL: dl, VT: PtrVT, N1: TGAHi, N2: TLSReg);
    return DAG.getNode(Opcode: PPCISD::Lo, DL: dl, VT: PtrVT, N1: TGALo, N2: Hi);
  }

  if (Model == TLSModel::InitialExec) {
    // Initial-exec: load the thread-pointer-relative offset from the GOT,
    // then add it to the thread pointer with ADD_TLS.
    bool IsPCRel = Subtarget.isUsingPCRelativeCalls();
    SDValue TGA = DAG.getTargetGlobalAddress(
        GV, DL: dl, VT: PtrVT, offset: 0, TargetFlags: IsPCRel ? PPCII::MO_GOT_TPREL_PCREL_FLAG : 0);
    SDValue TGATLS = DAG.getTargetGlobalAddress(
        GV, DL: dl, VT: PtrVT, offset: 0, TargetFlags: IsPCRel ? PPCII::MO_TLS_PCREL_FLAG : PPCII::MO_TLS);
    SDValue TPOffset;
    if (IsPCRel) {
      SDValue MatPCRel = DAG.getNode(Opcode: PPCISD::MAT_PCREL_ADDR, DL: dl, VT: PtrVT, Operand: TGA);
      TPOffset = DAG.getLoad(VT: MVT::i64, dl, Chain: DAG.getEntryNode(), Ptr: MatPCRel,
                             PtrInfo: MachinePointerInfo());
    } else {
      // Pick the GOT base: the TOC pointer (X2) on 64-bit, otherwise one of
      // the 32-bit GOT/PIC-base schemes depending on PIC level.
      SDValue GOTPtr;
      if (is64bit) {
        setUsesTOCBasePtr(DAG);
        SDValue GOTReg = DAG.getRegister(Reg: PPC::X2, VT: MVT::i64);
        GOTPtr =
            DAG.getNode(Opcode: PPCISD::ADDIS_GOT_TPREL_HA, DL: dl, VT: PtrVT, N1: GOTReg, N2: TGA);
      } else {
        if (!TM.isPositionIndependent())
          GOTPtr = DAG.getNode(Opcode: PPCISD::PPC32_GOT, DL: dl, VT: PtrVT);
        else if (picLevel == PICLevel::SmallPIC)
          GOTPtr = DAG.getNode(Opcode: PPCISD::GlobalBaseReg, DL: dl, VT: PtrVT);
        else
          GOTPtr = DAG.getNode(Opcode: PPCISD::PPC32_PICGOT, DL: dl, VT: PtrVT);
      }
      TPOffset = DAG.getNode(Opcode: PPCISD::LD_GOT_TPREL_L, DL: dl, VT: PtrVT, N1: TGA, N2: GOTPtr);
    }
    return DAG.getNode(Opcode: PPCISD::ADD_TLS, DL: dl, VT: PtrVT, N1: TPOffset, N2: TGATLS);
  }

  if (Model == TLSModel::GeneralDynamic) {
    // General-dynamic: build the address via the TLSGD pseudo nodes, which
    // are expanded later (GOT-based when not PC-relative).
    if (Subtarget.isUsingPCRelativeCalls()) {
      SDValue TGA = DAG.getTargetGlobalAddress(GV, DL: dl, VT: PtrVT, offset: 0,
                                               TargetFlags: PPCII::MO_GOT_TLSGD_PCREL_FLAG);
      return DAG.getNode(Opcode: PPCISD::TLS_DYNAMIC_MAT_PCREL_ADDR, DL: dl, VT: PtrVT, Operand: TGA);
    }

    SDValue TGA = DAG.getTargetGlobalAddress(GV, DL: dl, VT: PtrVT, offset: 0, TargetFlags: 0);
    SDValue GOTPtr;
    if (is64bit) {
      setUsesTOCBasePtr(DAG);
      SDValue GOTReg = DAG.getRegister(Reg: PPC::X2, VT: MVT::i64);
      GOTPtr = DAG.getNode(Opcode: PPCISD::ADDIS_TLSGD_HA, DL: dl, VT: PtrVT,
                           N1: GOTReg, N2: TGA);
    } else {
      if (picLevel == PICLevel::SmallPIC)
        GOTPtr = DAG.getNode(Opcode: PPCISD::GlobalBaseReg, DL: dl, VT: PtrVT);
      else
        GOTPtr = DAG.getNode(Opcode: PPCISD::PPC32_PICGOT, DL: dl, VT: PtrVT);
    }
    return DAG.getNode(Opcode: PPCISD::ADDI_TLSGD_L_ADDR, DL: dl, VT: PtrVT,
                       N1: GOTPtr, N2: TGA, N3: TGA);
  }

  if (Model == TLSModel::LocalDynamic) {
    // Local-dynamic: obtain the module's TLS base address once, then add the
    // variable's @dtprel offset to it.
    if (Subtarget.isUsingPCRelativeCalls()) {
      SDValue TGA = DAG.getTargetGlobalAddress(GV, DL: dl, VT: PtrVT, offset: 0,
                                               TargetFlags: PPCII::MO_GOT_TLSLD_PCREL_FLAG);
      SDValue MatPCRel =
          DAG.getNode(Opcode: PPCISD::TLS_DYNAMIC_MAT_PCREL_ADDR, DL: dl, VT: PtrVT, Operand: TGA);
      return DAG.getNode(Opcode: PPCISD::PADDI_DTPREL, DL: dl, VT: PtrVT, N1: MatPCRel, N2: TGA);
    }

    SDValue TGA = DAG.getTargetGlobalAddress(GV, DL: dl, VT: PtrVT, offset: 0, TargetFlags: 0);
    SDValue GOTPtr;
    if (is64bit) {
      setUsesTOCBasePtr(DAG);
      SDValue GOTReg = DAG.getRegister(Reg: PPC::X2, VT: MVT::i64);
      GOTPtr = DAG.getNode(Opcode: PPCISD::ADDIS_TLSLD_HA, DL: dl, VT: PtrVT,
                           N1: GOTReg, N2: TGA);
    } else {
      if (picLevel == PICLevel::SmallPIC)
        GOTPtr = DAG.getNode(Opcode: PPCISD::GlobalBaseReg, DL: dl, VT: PtrVT);
      else
        GOTPtr = DAG.getNode(Opcode: PPCISD::PPC32_PICGOT, DL: dl, VT: PtrVT);
    }
    // TLSAddr is the module base; the ADDIS/ADDI pair then applies the
    // @dtprel offset in high/low halves.
    SDValue TLSAddr = DAG.getNode(Opcode: PPCISD::ADDI_TLSLD_L_ADDR, DL: dl,
                                  VT: PtrVT, N1: GOTPtr, N2: TGA, N3: TGA);
    SDValue DtvOffsetHi = DAG.getNode(Opcode: PPCISD::ADDIS_DTPREL_HA, DL: dl,
                                      VT: PtrVT, N1: TLSAddr, N2: TGA);
    return DAG.getNode(Opcode: PPCISD::ADDI_DTPREL_L, DL: dl, VT: PtrVT, N1: DtvOffsetHi, N2: TGA);
  }

  llvm_unreachable("Unknown TLS model!");
}
3533
3534SDValue PPCTargetLowering::LowerGlobalAddress(SDValue Op,
3535 SelectionDAG &DAG) const {
3536 EVT PtrVT = Op.getValueType();
3537 GlobalAddressSDNode *GSDN = cast<GlobalAddressSDNode>(Val&: Op);
3538 SDLoc DL(GSDN);
3539 const GlobalValue *GV = GSDN->getGlobal();
3540
3541 // 64-bit SVR4 ABI & AIX ABI code is always position-independent.
3542 // The actual address of the GlobalValue is stored in the TOC.
3543 if (Subtarget.is64BitELFABI() || Subtarget.isAIXABI()) {
3544 if (Subtarget.isUsingPCRelativeCalls()) {
3545 EVT Ty = getPointerTy(DL: DAG.getDataLayout());
3546 if (isAccessedAsGotIndirect(N: Op)) {
3547 SDValue GA = DAG.getTargetGlobalAddress(GV, DL, VT: Ty, offset: GSDN->getOffset(),
3548 TargetFlags: PPCII::MO_GOT_PCREL_FLAG);
3549 SDValue MatPCRel = DAG.getNode(Opcode: PPCISD::MAT_PCREL_ADDR, DL, VT: Ty, Operand: GA);
3550 SDValue Load = DAG.getLoad(VT: MVT::i64, dl: DL, Chain: DAG.getEntryNode(), Ptr: MatPCRel,
3551 PtrInfo: MachinePointerInfo());
3552 return Load;
3553 } else {
3554 SDValue GA = DAG.getTargetGlobalAddress(GV, DL, VT: Ty, offset: GSDN->getOffset(),
3555 TargetFlags: PPCII::MO_PCREL_FLAG);
3556 return DAG.getNode(Opcode: PPCISD::MAT_PCREL_ADDR, DL, VT: Ty, Operand: GA);
3557 }
3558 }
3559 setUsesTOCBasePtr(DAG);
3560 SDValue GA = DAG.getTargetGlobalAddress(GV, DL, VT: PtrVT, offset: GSDN->getOffset());
3561 return getTOCEntry(DAG, dl: DL, GA);
3562 }
3563
3564 unsigned MOHiFlag, MOLoFlag;
3565 bool IsPIC = isPositionIndependent();
3566 getLabelAccessInfo(IsPIC, Subtarget, HiOpFlags&: MOHiFlag, LoOpFlags&: MOLoFlag, GV);
3567
3568 if (IsPIC && Subtarget.isSVR4ABI()) {
3569 SDValue GA = DAG.getTargetGlobalAddress(GV, DL, VT: PtrVT,
3570 offset: GSDN->getOffset(),
3571 TargetFlags: PPCII::MO_PIC_FLAG);
3572 return getTOCEntry(DAG, dl: DL, GA);
3573 }
3574
3575 SDValue GAHi =
3576 DAG.getTargetGlobalAddress(GV, DL, VT: PtrVT, offset: GSDN->getOffset(), TargetFlags: MOHiFlag);
3577 SDValue GALo =
3578 DAG.getTargetGlobalAddress(GV, DL, VT: PtrVT, offset: GSDN->getOffset(), TargetFlags: MOLoFlag);
3579
3580 return LowerLabelRef(HiPart: GAHi, LoPart: GALo, isPIC: IsPIC, DAG);
3581}
3582
/// Custom-lower SETCC / STRICT_FSETCC(S). Softens f128 compares to libcalls,
/// implements v2i64 equality through v4i32 Altivec compares, and
/// canonicalizes integer eq/ne-with-constant into a compare against zero.
SDValue PPCTargetLowering::LowerSETCC(SDValue Op, SelectionDAG &DAG) const {
  bool IsStrict = Op->isStrictFPOpcode();
  // Strict FP compares carry a chain as operand 0, shifting the rest by one.
  ISD::CondCode CC =
      cast<CondCodeSDNode>(Val: Op.getOperand(i: IsStrict ? 3 : 2))->get();
  SDValue LHS = Op.getOperand(i: IsStrict ? 1 : 0);
  SDValue RHS = Op.getOperand(i: IsStrict ? 2 : 1);
  SDValue Chain = IsStrict ? Op.getOperand(i: 0) : SDValue();
  EVT LHSVT = LHS.getValueType();
  SDLoc dl(Op);

  // Soften the setcc with libcall if it is fp128.
  if (LHSVT == MVT::f128) {
    assert(!Subtarget.hasP9Vector() &&
           "SETCC for f128 is already legal under Power9!");
    softenSetCCOperands(DAG, VT: LHSVT, NewLHS&: LHS, NewRHS&: RHS, CCCode&: CC, DL: dl, OldLHS: LHS, OldRHS: RHS, Chain,
                        IsSignaling: Op->getOpcode() == ISD::STRICT_FSETCCS);
    // If softening left RHS null, LHS already holds the final result;
    // otherwise re-emit the (now legal) comparison.
    if (RHS.getNode())
      LHS = DAG.getNode(Opcode: ISD::SETCC, DL: dl, VT: Op.getValueType(), N1: LHS, N2: RHS,
                        N3: DAG.getCondCode(Cond: CC));
    if (IsStrict)
      return DAG.getMergeValues(Ops: {LHS, Chain}, dl);
    return LHS;
  }

  assert(!IsStrict && "Don't know how to handle STRICT_FSETCC!");

  if (Op.getValueType() == MVT::v2i64) {
    // When the operands themselves are v2i64 values, we need to do something
    // special because VSX has no underlying comparison operations for these.
    if (LHS.getValueType() == MVT::v2i64) {
      // Equality can be handled by casting to the legal type for Altivec
      // comparisons, everything else needs to be expanded.
      if (CC != ISD::SETEQ && CC != ISD::SETNE)
        return SDValue();
      SDValue SetCC32 = DAG.getSetCC(
          DL: dl, VT: MVT::v4i32, LHS: DAG.getNode(Opcode: ISD::BITCAST, DL: dl, VT: MVT::v4i32, Operand: LHS),
          RHS: DAG.getNode(Opcode: ISD::BITCAST, DL: dl, VT: MVT::v4i32, Operand: RHS), Cond: CC);
      // Swap adjacent 32-bit lanes and combine with the unswapped result so
      // both halves of each 64-bit element agree (AND for eq, OR for ne).
      int ShuffV[] = {1, 0, 3, 2};
      SDValue Shuff =
          DAG.getVectorShuffle(VT: MVT::v4i32, dl, N1: SetCC32, N2: SetCC32, Mask: ShuffV);
      return DAG.getBitcast(VT: MVT::v2i64,
                            V: DAG.getNode(Opcode: CC == ISD::SETEQ ? ISD::AND : ISD::OR,
                                         DL: dl, VT: MVT::v4i32, N1: Shuff, N2: SetCC32));
    }

    // We handle most of these in the usual way.
    return Op;
  }

  // If we're comparing for equality to zero, expose the fact that this is
  // implemented as a ctlz/srl pair on ppc, so that the dag combiner can
  // fold the new nodes.
  if (SDValue V = lowerCmpEqZeroToCtlzSrl(Op, DAG))
    return V;

  if (ConstantSDNode *C = dyn_cast<ConstantSDNode>(Val&: RHS)) {
    // Leave comparisons against 0 and -1 alone for now, since they're usually
    // optimized. FIXME: revisit this when we can custom lower all setcc
    // optimizations.
    if (C->isAllOnes() || C->isZero())
      return SDValue();
  }

  // If we have an integer seteq/setne, turn it into a compare against zero
  // by xor'ing the rhs with the lhs, which is faster than setting a
  // condition register, reading it back out, and masking the correct bit. The
  // normal approach here uses sub to do this instead of xor. Using xor exposes
  // the result to other bit-twiddling opportunities.
  if (LHSVT.isInteger() && (CC == ISD::SETEQ || CC == ISD::SETNE)) {
    EVT VT = Op.getValueType();
    SDValue Sub = DAG.getNode(Opcode: ISD::XOR, DL: dl, VT: LHSVT, N1: LHS, N2: RHS);
    return DAG.getSetCC(DL: dl, VT, LHS: Sub, RHS: DAG.getConstant(Val: 0, DL: dl, VT: LHSVT), Cond: CC);
  }
  return SDValue();
}
3658
/// Custom lowering for VAARG on 32-bit SVR4. Implements va_arg over the
/// va_list struct initialized in LowerVASTART: while GPR/FPR save-area slots
/// remain (index < 8), consume one; otherwise read from the overflow (stack)
/// area, and write the updated indices/pointers back into the va_list.
SDValue PPCTargetLowering::LowerVAARG(SDValue Op, SelectionDAG &DAG) const {
  SDNode *Node = Op.getNode();
  EVT VT = Node->getValueType(ResNo: 0);
  EVT PtrVT = getPointerTy(DL: DAG.getDataLayout());
  SDValue InChain = Node->getOperand(Num: 0);
  SDValue VAListPtr = Node->getOperand(Num: 1);
  const Value *SV = cast<SrcValueSDNode>(Val: Node->getOperand(Num: 2))->getValue();
  SDLoc dl(Node);

  assert(!Subtarget.isPPC64() && "LowerVAARG is PPC32 only");

  // gpr_index (byte 0 of the va_list)
  SDValue GprIndex = DAG.getExtLoad(ExtType: ISD::ZEXTLOAD, dl, VT: MVT::i32, Chain: InChain,
                                    Ptr: VAListPtr, PtrInfo: MachinePointerInfo(SV), MemVT: MVT::i8);
  InChain = GprIndex.getValue(R: 1);

  if (VT == MVT::i64) {
    // i64 values occupy an even/odd GPR pair, so round the index up to even.
    // Check if GprIndex is even
    SDValue GprAnd = DAG.getNode(Opcode: ISD::AND, DL: dl, VT: MVT::i32, N1: GprIndex,
                                 N2: DAG.getConstant(Val: 1, DL: dl, VT: MVT::i32));
    SDValue CC64 = DAG.getSetCC(DL: dl, VT: MVT::i32, LHS: GprAnd,
                                RHS: DAG.getConstant(Val: 0, DL: dl, VT: MVT::i32), Cond: ISD::SETNE);
    SDValue GprIndexPlusOne = DAG.getNode(Opcode: ISD::ADD, DL: dl, VT: MVT::i32, N1: GprIndex,
                                          N2: DAG.getConstant(Val: 1, DL: dl, VT: MVT::i32));
    // Align GprIndex to be even if it isn't
    GprIndex = DAG.getNode(Opcode: ISD::SELECT, DL: dl, VT: MVT::i32, N1: CC64, N2: GprIndexPlusOne,
                           N3: GprIndex);
  }

  // fpr index is 1 byte after gpr
  SDValue FprPtr = DAG.getNode(Opcode: ISD::ADD, DL: dl, VT: PtrVT, N1: VAListPtr,
                               N2: DAG.getConstant(Val: 1, DL: dl, VT: MVT::i32));

  // fpr
  SDValue FprIndex = DAG.getExtLoad(ExtType: ISD::ZEXTLOAD, dl, VT: MVT::i32, Chain: InChain,
                                    Ptr: FprPtr, PtrInfo: MachinePointerInfo(SV), MemVT: MVT::i8);
  InChain = FprIndex.getValue(R: 1);

  // reg_save_area pointer lives at offset 8 of the va_list ...
  SDValue RegSaveAreaPtr = DAG.getNode(Opcode: ISD::ADD, DL: dl, VT: PtrVT, N1: VAListPtr,
                                       N2: DAG.getConstant(Val: 8, DL: dl, VT: MVT::i32));

  // ... and overflow_arg_area at offset 4.
  SDValue OverflowAreaPtr = DAG.getNode(Opcode: ISD::ADD, DL: dl, VT: PtrVT, N1: VAListPtr,
                                        N2: DAG.getConstant(Val: 4, DL: dl, VT: MVT::i32));

  // areas
  SDValue OverflowArea =
      DAG.getLoad(VT: MVT::i32, dl, Chain: InChain, Ptr: OverflowAreaPtr, PtrInfo: MachinePointerInfo());
  InChain = OverflowArea.getValue(R: 1);

  SDValue RegSaveArea =
      DAG.getLoad(VT: MVT::i32, dl, Chain: InChain, Ptr: RegSaveAreaPtr, PtrInfo: MachinePointerInfo());
  InChain = RegSaveArea.getValue(R: 1);

  // CC is true while the relevant register index is still below 8, i.e. the
  // argument lives in the register save area; otherwise use overflow_area.
  SDValue CC = DAG.getSetCC(DL: dl, VT: MVT::i32, LHS: VT.isInteger() ? GprIndex : FprIndex,
                            RHS: DAG.getConstant(Val: 8, DL: dl, VT: MVT::i32), Cond: ISD::SETLT);

  // adjustment constant gpr_index * 4/8
  SDValue RegConstant = DAG.getNode(Opcode: ISD::MUL, DL: dl, VT: MVT::i32,
                                    N1: VT.isInteger() ? GprIndex : FprIndex,
                                    N2: DAG.getConstant(Val: VT.isInteger() ? 4 : 8, DL: dl,
                                                      VT: MVT::i32));

  // OurReg = RegSaveArea + RegConstant
  SDValue OurReg = DAG.getNode(Opcode: ISD::ADD, DL: dl, VT: PtrVT, N1: RegSaveArea,
                               N2: RegConstant);

  // Floating types are 32 bytes into RegSaveArea
  if (VT.isFloatingPoint())
    OurReg = DAG.getNode(Opcode: ISD::ADD, DL: dl, VT: PtrVT, N1: OurReg,
                         N2: DAG.getConstant(Val: 32, DL: dl, VT: MVT::i32));

  // increase {f,g}pr_index by 1 (or 2 if VT is i64)
  SDValue IndexPlus1 = DAG.getNode(Opcode: ISD::ADD, DL: dl, VT: MVT::i32,
                                   N1: VT.isInteger() ? GprIndex : FprIndex,
                                   N2: DAG.getConstant(Val: VT == MVT::i64 ? 2 : 1, DL: dl,
                                                     VT: MVT::i32));

  // Store the bumped index back into the va_list (byte 0 for gpr, byte 1
  // for fpr).
  InChain = DAG.getTruncStore(Chain: InChain, dl, Val: IndexPlus1,
                              Ptr: VT.isInteger() ? VAListPtr : FprPtr,
                              PtrInfo: MachinePointerInfo(SV), SVT: MVT::i8);

  // determine if we should load from reg_save_area or overflow_area
  SDValue Result = DAG.getNode(Opcode: ISD::SELECT, DL: dl, VT: PtrVT, N1: CC, N2: OurReg, N3: OverflowArea);

  // increase overflow_area by 4/8 if gpr/fpr index is already >= 8
  SDValue OverflowAreaPlusN = DAG.getNode(Opcode: ISD::ADD, DL: dl, VT: PtrVT, N1: OverflowArea,
                                          N2: DAG.getConstant(Val: VT.isInteger() ? 4 : 8,
                                                            DL: dl, VT: MVT::i32));

  OverflowArea = DAG.getNode(Opcode: ISD::SELECT, DL: dl, VT: MVT::i32, N1: CC, N2: OverflowArea,
                             N3: OverflowAreaPlusN);

  // Write the (possibly advanced) overflow pointer back to the va_list.
  InChain = DAG.getTruncStore(Chain: InChain, dl, Val: OverflowArea, Ptr: OverflowAreaPtr,
                              PtrInfo: MachinePointerInfo(), SVT: MVT::i32);

  // Finally, load the argument value from whichever area was selected.
  return DAG.getLoad(VT, dl, Chain: InChain, Ptr: Result, PtrInfo: MachinePointerInfo());
}
3757
3758SDValue PPCTargetLowering::LowerVACOPY(SDValue Op, SelectionDAG &DAG) const {
3759 assert(!Subtarget.isPPC64() && "LowerVACOPY is PPC32 only");
3760
3761 // We have to copy the entire va_list struct:
3762 // 2*sizeof(char) + 2 Byte alignment + 2*sizeof(char*) = 12 Byte
3763 return DAG.getMemcpy(Chain: Op.getOperand(i: 0), dl: Op, Dst: Op.getOperand(i: 1), Src: Op.getOperand(i: 2),
3764 Size: DAG.getConstant(Val: 12, DL: SDLoc(Op), VT: MVT::i32), Alignment: Align(8),
3765 isVol: false, AlwaysInline: true, /*CI=*/nullptr, OverrideTailCall: std::nullopt,
3766 DstPtrInfo: MachinePointerInfo(), SrcPtrInfo: MachinePointerInfo());
3767}
3768
3769SDValue PPCTargetLowering::LowerADJUST_TRAMPOLINE(SDValue Op,
3770 SelectionDAG &DAG) const {
3771 return Op.getOperand(i: 0);
3772}
3773
3774SDValue PPCTargetLowering::LowerINLINEASM(SDValue Op, SelectionDAG &DAG) const {
3775 MachineFunction &MF = DAG.getMachineFunction();
3776 PPCFunctionInfo &MFI = *MF.getInfo<PPCFunctionInfo>();
3777
3778 assert((Op.getOpcode() == ISD::INLINEASM ||
3779 Op.getOpcode() == ISD::INLINEASM_BR) &&
3780 "Expecting Inline ASM node.");
3781
3782 // If an LR store is already known to be required then there is not point in
3783 // checking this ASM as well.
3784 if (MFI.isLRStoreRequired())
3785 return Op;
3786
3787 // Inline ASM nodes have an optional last operand that is an incoming Flag of
3788 // type MVT::Glue. We want to ignore this last operand if that is the case.
3789 unsigned NumOps = Op.getNumOperands();
3790 if (Op.getOperand(i: NumOps - 1).getValueType() == MVT::Glue)
3791 --NumOps;
3792
3793 // Check all operands that may contain the LR.
3794 for (unsigned i = InlineAsm::Op_FirstOperand; i != NumOps;) {
3795 const InlineAsm::Flag Flags(Op.getConstantOperandVal(i));
3796 unsigned NumVals = Flags.getNumOperandRegisters();
3797 ++i; // Skip the ID value.
3798
3799 switch (Flags.getKind()) {
3800 default:
3801 llvm_unreachable("Bad flags!");
3802 case InlineAsm::Kind::RegUse:
3803 case InlineAsm::Kind::Imm:
3804 case InlineAsm::Kind::Mem:
3805 i += NumVals;
3806 break;
3807 case InlineAsm::Kind::Clobber:
3808 case InlineAsm::Kind::RegDef:
3809 case InlineAsm::Kind::RegDefEarlyClobber: {
3810 for (; NumVals; --NumVals, ++i) {
3811 Register Reg = cast<RegisterSDNode>(Val: Op.getOperand(i))->getReg();
3812 if (Reg != PPC::LR && Reg != PPC::LR8)
3813 continue;
3814 MFI.setLRStoreRequired();
3815 return Op;
3816 }
3817 break;
3818 }
3819 }
3820 }
3821
3822 return Op;
3823}
3824
/// Custom lowering for INIT_TRAMPOLINE. On AIX the trampoline is built in
/// place as a function descriptor (entry point, TOC pointer, environment);
/// on other targets the work is delegated to the __trampoline_setup runtime
/// helper.
SDValue PPCTargetLowering::LowerINIT_TRAMPOLINE(SDValue Op,
                                                SelectionDAG &DAG) const {
  SDValue Chain = Op.getOperand(i: 0);
  SDValue Trmp = Op.getOperand(i: 1); // trampoline
  SDValue FPtr = Op.getOperand(i: 2); // nested function
  SDValue Nest = Op.getOperand(i: 3); // 'nest' parameter value
  SDLoc dl(Op);

  EVT PtrVT = getPointerTy(DL: DAG.getDataLayout());

  if (Subtarget.isAIXABI()) {
    // On AIX we create a trampoline descriptor by combining the
    // entry point and TOC from the global descriptor (FPtr) with the
    // nest argument as the environment pointer.
    uint64_t PointerSize = Subtarget.isPPC64() ? 8 : 4;
    MaybeAlign PointerAlign(PointerSize);
    // Descriptor loads may be marked dereferenceable/invariant when function
    // descriptors are known not to change at run time.
    auto MMOFlags = Subtarget.hasInvariantFunctionDescriptors()
                        ? (MachineMemOperand::MODereferenceable |
                           MachineMemOperand::MOInvariant)
                        : MachineMemOperand::MONone;

    // Descriptor slot layout: [0] entry point, [1] TOC, [2] environment.
    uint64_t TOCPointerOffset = 1 * PointerSize;
    uint64_t EnvPointerOffset = 2 * PointerSize;
    SDValue SDTOCPtrOffset = DAG.getConstant(Val: TOCPointerOffset, DL: dl, VT: PtrVT);
    SDValue SDEnvPtrOffset = DAG.getConstant(Val: EnvPointerOffset, DL: dl, VT: PtrVT);

    // Operands 4 and 5 carry the IR values for the trampoline buffer and the
    // nested function, used here only to build MachinePointerInfos.
    const Value *TrampolineAddr =
        cast<SrcValueSDNode>(Val: Op.getOperand(i: 4))->getValue();
    const Function *Func =
        cast<Function>(Val: cast<SrcValueSDNode>(Val: Op.getOperand(i: 5))->getValue());

    SDValue OutChains[3];

    // Copy the entry point address from the global descriptor to the
    // trampoline buffer.
    SDValue LoadEntryPoint =
        DAG.getLoad(VT: PtrVT, dl, Chain, Ptr: FPtr, PtrInfo: MachinePointerInfo(Func, 0),
                    Alignment: PointerAlign, MMOFlags);
    SDValue EPLoadChain = LoadEntryPoint.getValue(R: 1);
    OutChains[0] = DAG.getStore(Chain: EPLoadChain, dl, Val: LoadEntryPoint, Ptr: Trmp,
                                PtrInfo: MachinePointerInfo(TrampolineAddr, 0));

    // Copy the TOC pointer from the global descriptor to the trampoline
    // buffer.
    SDValue TOCFromDescriptorPtr =
        DAG.getNode(Opcode: ISD::ADD, DL: dl, VT: PtrVT, N1: FPtr, N2: SDTOCPtrOffset);
    SDValue TOCReg = DAG.getLoad(VT: PtrVT, dl, Chain, Ptr: TOCFromDescriptorPtr,
                                 PtrInfo: MachinePointerInfo(Func, TOCPointerOffset),
                                 Alignment: PointerAlign, MMOFlags);
    SDValue TrampolineTOCPointer =
        DAG.getNode(Opcode: ISD::ADD, DL: dl, VT: PtrVT, N1: Trmp, N2: SDTOCPtrOffset);
    SDValue TOCLoadChain = TOCReg.getValue(R: 1);
    OutChains[1] =
        DAG.getStore(Chain: TOCLoadChain, dl, Val: TOCReg, Ptr: TrampolineTOCPointer,
                     PtrInfo: MachinePointerInfo(TrampolineAddr, TOCPointerOffset));

    // Store the nest argument into the environment pointer in the trampoline
    // buffer.
    SDValue EnvPointer = DAG.getNode(Opcode: ISD::ADD, DL: dl, VT: PtrVT, N1: Trmp, N2: SDEnvPtrOffset);
    OutChains[2] =
        DAG.getStore(Chain, dl, Val: Nest, Ptr: EnvPointer,
                     PtrInfo: MachinePointerInfo(TrampolineAddr, EnvPointerOffset));

    // Merge the three independent store chains into one output chain.
    SDValue TokenFactor =
        DAG.getNode(Opcode: ISD::TokenFactor, DL: dl, VT: MVT::Other, Ops: OutChains);
    return TokenFactor;
  }

  bool isPPC64 = (PtrVT == MVT::i64);
  Type *IntPtrTy = DAG.getDataLayout().getIntPtrType(C&: *DAG.getContext());

  TargetLowering::ArgListTy Args;
  Args.emplace_back(args&: Trmp, args&: IntPtrTy);
  // TrampSize == (isPPC64 ? 48 : 40);
  Args.emplace_back(
      args: DAG.getConstant(Val: isPPC64 ? 48 : 40, DL: dl, VT: Subtarget.getScalarIntVT()),
      args&: IntPtrTy);
  Args.emplace_back(args&: FPtr, args&: IntPtrTy);
  Args.emplace_back(args&: Nest, args&: IntPtrTy);

  // Lower to a call to __trampoline_setup(Trmp, TrampSize, FPtr, ctx_reg)
  TargetLowering::CallLoweringInfo CLI(DAG);
  CLI.setDebugLoc(dl).setChain(Chain).setLibCallee(
      CC: CallingConv::C, ResultType: Type::getVoidTy(C&: *DAG.getContext()),
      Target: DAG.getExternalSymbol(Sym: "__trampoline_setup", VT: PtrVT), ArgsList: std::move(Args));

  std::pair<SDValue, SDValue> CallResult = LowerCallTo(CLI);
  // The helper returns void, so only the chain result is meaningful.
  return CallResult.second;
}
3914
/// Custom lowering for VASTART. On 64-bit and AIX targets a va_list is a
/// plain pointer, so only the varargs frame address is stored; on 32-bit
/// SVR4 the full four-field va_list struct described below is initialized.
SDValue PPCTargetLowering::LowerVASTART(SDValue Op, SelectionDAG &DAG) const {
  MachineFunction &MF = DAG.getMachineFunction();
  PPCFunctionInfo *FuncInfo = MF.getInfo<PPCFunctionInfo>();
  EVT PtrVT = getPointerTy(DL: MF.getDataLayout());

  SDLoc dl(Op);

  if (Subtarget.isPPC64() || Subtarget.isAIXABI()) {
    // vastart just stores the address of the VarArgsFrameIndex slot into the
    // memory location argument.
    SDValue FR = DAG.getFrameIndex(FI: FuncInfo->getVarArgsFrameIndex(), VT: PtrVT);
    const Value *SV = cast<SrcValueSDNode>(Val: Op.getOperand(i: 2))->getValue();
    return DAG.getStore(Chain: Op.getOperand(i: 0), dl, Val: FR, Ptr: Op.getOperand(i: 1),
                        PtrInfo: MachinePointerInfo(SV));
  }

  // For the 32-bit SVR4 ABI we follow the layout of the va_list struct.
  // We suppose the given va_list is already allocated.
  //
  // typedef struct {
  //  char gpr;     /* index into the array of 8 GPRs
  //                 * stored in the register save area
  //                 * gpr=0 corresponds to r3,
  //                 * gpr=1 to r4, etc.
  //                 */
  //  char fpr;     /* index into the array of 8 FPRs
  //                 * stored in the register save area
  //                 * fpr=0 corresponds to f1,
  //                 * fpr=1 to f2, etc.
  //                 */
  //  char *overflow_arg_area;
  //                /* location on stack that holds
  //                 * the next overflow argument
  //                 */
  //  char *reg_save_area;
  //                /* where r3:r10 and f1:f8 (if saved)
  //                 * are stored
  //                 */
  // } va_list[1];

  // Initial register indices recorded while lowering the formal arguments.
  SDValue ArgGPR = DAG.getConstant(Val: FuncInfo->getVarArgsNumGPR(), DL: dl, VT: MVT::i32);
  SDValue ArgFPR = DAG.getConstant(Val: FuncInfo->getVarArgsNumFPR(), DL: dl, VT: MVT::i32);
  SDValue StackOffsetFI = DAG.getFrameIndex(FI: FuncInfo->getVarArgsStackOffset(),
                                            VT: PtrVT);
  SDValue FR = DAG.getFrameIndex(FI: FuncInfo->getVarArgsFrameIndex(),
                                 VT: PtrVT);

  // Pointer deltas used to step from one va_list field to the next
  // (gpr @0, fpr @1, overflow_arg_area @4, reg_save_area @8 on PPC32).
  uint64_t FrameOffset = PtrVT.getSizeInBits()/8;
  SDValue ConstFrameOffset = DAG.getConstant(Val: FrameOffset, DL: dl, VT: PtrVT);

  uint64_t StackOffset = PtrVT.getSizeInBits()/8 - 1;
  SDValue ConstStackOffset = DAG.getConstant(Val: StackOffset, DL: dl, VT: PtrVT);

  uint64_t FPROffset = 1;
  SDValue ConstFPROffset = DAG.getConstant(Val: FPROffset, DL: dl, VT: PtrVT);

  const Value *SV = cast<SrcValueSDNode>(Val: Op.getOperand(i: 2))->getValue();

  // Store first byte : number of int regs
  SDValue firstStore =
      DAG.getTruncStore(Chain: Op.getOperand(i: 0), dl, Val: ArgGPR, Ptr: Op.getOperand(i: 1),
                        PtrInfo: MachinePointerInfo(SV), SVT: MVT::i8);
  uint64_t nextOffset = FPROffset;
  SDValue nextPtr = DAG.getNode(Opcode: ISD::ADD, DL: dl, VT: PtrVT, N1: Op.getOperand(i: 1),
                                N2: ConstFPROffset);

  // Store second byte : number of float regs
  SDValue secondStore =
      DAG.getTruncStore(Chain: firstStore, dl, Val: ArgFPR, Ptr: nextPtr,
                        PtrInfo: MachinePointerInfo(SV, nextOffset), SVT: MVT::i8);
  nextOffset += StackOffset;
  nextPtr = DAG.getNode(Opcode: ISD::ADD, DL: dl, VT: PtrVT, N1: nextPtr, N2: ConstStackOffset);

  // Store second word : arguments given on stack
  SDValue thirdStore = DAG.getStore(Chain: secondStore, dl, Val: StackOffsetFI, Ptr: nextPtr,
                                    PtrInfo: MachinePointerInfo(SV, nextOffset));
  nextOffset += FrameOffset;
  nextPtr = DAG.getNode(Opcode: ISD::ADD, DL: dl, VT: PtrVT, N1: nextPtr, N2: ConstFrameOffset);

  // Store third word : arguments given in registers
  return DAG.getStore(Chain: thirdStore, dl, Val: FR, Ptr: nextPtr,
                      PtrInfo: MachinePointerInfo(SV, nextOffset));
}
3998
/// FPR - The set of FP registers that should be allocated for arguments
/// on Darwin and AIX.
/// Exactly the thirteen parameter registers F1-F13 are listed, in
/// allocation order.
static const MCPhysReg FPR[] = {PPC::F1, PPC::F2, PPC::F3, PPC::F4, PPC::F5,
                                PPC::F6, PPC::F7, PPC::F8, PPC::F9, PPC::F10,
                                PPC::F11, PPC::F12, PPC::F13};
4004
4005/// CalculateStackSlotSize - Calculates the size reserved for this argument on
4006/// the stack.
4007static unsigned CalculateStackSlotSize(EVT ArgVT, ISD::ArgFlagsTy Flags,
4008 unsigned PtrByteSize) {
4009 unsigned ArgSize = ArgVT.getStoreSize();
4010 if (Flags.isByVal())
4011 ArgSize = Flags.getByValSize();
4012
4013 // Round up to multiples of the pointer size, except for array members,
4014 // which are always packed.
4015 if (!Flags.isInConsecutiveRegs())
4016 ArgSize = ((ArgSize + PtrByteSize - 1)/PtrByteSize) * PtrByteSize;
4017
4018 return ArgSize;
4019}
4020
/// CalculateStackSlotAlignment - Calculates the alignment of this argument
/// on the stack. Arguments are at least pointer-size aligned; vector types
/// get 16-byte alignment, byval arguments their requested alignment, and
/// consecutive-register array members their natural packing alignment.
static Align CalculateStackSlotAlignment(EVT ArgVT, EVT OrigVT,
                                         ISD::ArgFlagsTy Flags,
                                         unsigned PtrByteSize) {
  Align Alignment(PtrByteSize);

  // Altivec parameters are padded to a 16 byte boundary.
  if (ArgVT == MVT::v4f32 || ArgVT == MVT::v4i32 ||
      ArgVT == MVT::v8i16 || ArgVT == MVT::v16i8 ||
      ArgVT == MVT::v2f64 || ArgVT == MVT::v2i64 ||
      ArgVT == MVT::v1i128 || ArgVT == MVT::f128)
    Alignment = Align(16);

  // ByVal parameters are aligned as requested.
  if (Flags.isByVal()) {
    auto BVAlign = Flags.getNonZeroByValAlign();
    if (BVAlign > PtrByteSize) {
      // A byval alignment larger than the pointer size is expected to also
      // be a multiple of it.
      if (BVAlign.value() % PtrByteSize != 0)
        llvm_unreachable(
            "ByVal alignment is not a multiple of the pointer size");

      Alignment = BVAlign;
    }
  }

  // Array members are always packed to their original alignment.
  if (Flags.isInConsecutiveRegs()) {
    // If the array member was split into multiple registers, the first
    // needs to be aligned to the size of the full type. (Except for
    // ppcf128, which is only aligned as its f64 components.)
    if (Flags.isSplit() && OrigVT != MVT::ppcf128)
      Alignment = Align(OrigVT.getStoreSize());
    else
      Alignment = Align(ArgVT.getStoreSize());
  }

  return Alignment;
}
4060
/// CalculateStackSlotUsed - Return whether this argument will use its
/// stack slot (instead of being passed in registers). ArgOffset,
/// AvailableFPRs, and AvailableVRs must hold the current argument
/// position, and will be updated to account for this argument.
static bool CalculateStackSlotUsed(EVT ArgVT, EVT OrigVT, ISD::ArgFlagsTy Flags,
                                   unsigned PtrByteSize, unsigned LinkageSize,
                                   unsigned ParamAreaSize, unsigned &ArgOffset,
                                   unsigned &AvailableFPRs,
                                   unsigned &AvailableVRs) {
  bool UseMemory = false;

  // Respect alignment of argument on the stack.
  Align Alignment =
      CalculateStackSlotAlignment(ArgVT, OrigVT, Flags, PtrByteSize);
  ArgOffset = alignTo(Size: ArgOffset, A: Alignment);
  // If there's no space left in the argument save area, we must
  // use memory (this check also catches zero-sized arguments).
  if (ArgOffset >= LinkageSize + ParamAreaSize)
    UseMemory = true;

  // Allocate argument on the stack.
  ArgOffset += CalculateStackSlotSize(ArgVT, Flags, PtrByteSize);
  // The last member of a consecutive-register run rounds the offset back up
  // to a pointer-size boundary.
  if (Flags.isInConsecutiveRegsLast())
    ArgOffset = ((ArgOffset + PtrByteSize - 1)/PtrByteSize) * PtrByteSize;
  // If we overran the argument save area, we must use memory
  // (this check catches arguments passed partially in memory)
  if (ArgOffset > LinkageSize + ParamAreaSize)
    UseMemory = true;

  // However, if the argument is actually passed in an FPR or a VR,
  // we don't use memory after all.
  if (!Flags.isByVal()) {
    if (ArgVT == MVT::f32 || ArgVT == MVT::f64)
      // Scalar floats consume an FPR when one is still free.
      if (AvailableFPRs > 0) {
        --AvailableFPRs;
        return false;
      }
    if (ArgVT == MVT::v4f32 || ArgVT == MVT::v4i32 ||
        ArgVT == MVT::v8i16 || ArgVT == MVT::v16i8 ||
        ArgVT == MVT::v2f64 || ArgVT == MVT::v2i64 ||
        ArgVT == MVT::v1i128 || ArgVT == MVT::f128)
      // Vector-category types consume a VR when one is still free.
      if (AvailableVRs > 0) {
        --AvailableVRs;
        return false;
      }
  }

  return UseMemory;
}
4110
4111/// EnsureStackAlignment - Round stack frame size up from NumBytes to
4112/// ensure minimum alignment required for target.
4113static unsigned EnsureStackAlignment(const PPCFrameLowering *Lowering,
4114 unsigned NumBytes) {
4115 return alignTo(Size: NumBytes, A: Lowering->getStackAlign());
4116}
4117
// Dispatch incoming-argument lowering to the routine for the subtarget's
// ABI. Exactly one of the three ABIs applies: AIX, 64-bit ELF (v1/v2), or
// 32-bit ELF SVR4 (asserted as the remaining case).
SDValue PPCTargetLowering::LowerFormalArguments(
    SDValue Chain, CallingConv::ID CallConv, bool isVarArg,
    const SmallVectorImpl<ISD::InputArg> &Ins, const SDLoc &dl,
    SelectionDAG &DAG, SmallVectorImpl<SDValue> &InVals) const {
  if (Subtarget.isAIXABI())
    return LowerFormalArguments_AIX(Chain, CallConv, isVarArg, Ins, dl, DAG,
                                    InVals);
  if (Subtarget.is64BitELFABI())
    return LowerFormalArguments_64SVR4(Chain, CallConv, isVarArg, Ins, dl, DAG,
                                       InVals);
  assert(Subtarget.is32BitELFABI());
  return LowerFormalArguments_32SVR4(Chain, CallConv, isVarArg, Ins, dl, DAG,
                                     InVals);
}
4132
/// Lower the incoming formal arguments of a function under the 32-bit SVR4
/// ABI. Arguments assigned to registers by CC_PPC32_SVR4 are copied into
/// virtual registers; stack-passed arguments are loaded from fixed frame
/// objects; byval aggregates are analyzed by a second CCState; and for
/// varargs the live parameter registers are spilled to a register save area
/// so va_arg can walk them.
SDValue PPCTargetLowering::LowerFormalArguments_32SVR4(
    SDValue Chain, CallingConv::ID CallConv, bool isVarArg,
    const SmallVectorImpl<ISD::InputArg> &Ins, const SDLoc &dl,
    SelectionDAG &DAG, SmallVectorImpl<SDValue> &InVals) const {

  // 32-bit SVR4 ABI Stack Frame Layout:
  //              +-----------------------------------+
  //        +-->  |            Back chain             |
  //        |     +-----------------------------------+
  //        |     | Floating-point register save area |
  //        |     +-----------------------------------+
  //        |     |    General register save area     |
  //        |     +-----------------------------------+
  //        |     |          CR save word             |
  //        |     +-----------------------------------+
  //        |     |         VRSAVE save word          |
  //        |     +-----------------------------------+
  //        |     |         Alignment padding         |
  //        |     +-----------------------------------+
  //        |     |     Vector register save area     |
  //        |     +-----------------------------------+
  //        |     |       Local variable space        |
  //        |     +-----------------------------------+
  //        |     |        Parameter list area        |
  //        |     +-----------------------------------+
  //        |     |           LR save word            |
  //        |     +-----------------------------------+
  // SP-->  +---  |            Back chain             |
  //              +-----------------------------------+
  //
  // Specifications:
  //   System V Application Binary Interface PowerPC Processor Supplement
  //   AltiVec Technology Programming Interface Manual

  MachineFunction &MF = DAG.getMachineFunction();
  MachineFrameInfo &MFI = MF.getFrameInfo();
  PPCFunctionInfo *FuncInfo = MF.getInfo<PPCFunctionInfo>();

  EVT PtrVT = getPointerTy(DL: MF.getDataLayout());
  // Potential tail calls could cause overwriting of argument stack slots.
  bool isImmutable = !(getTargetMachine().Options.GuaranteedTailCallOpt &&
                       (CallConv == CallingConv::Fast));
  const Align PtrAlign(4);

  // Assign locations to all of the incoming arguments.
  SmallVector<CCValAssign, 16> ArgLocs;
  CCState CCInfo(CallConv, isVarArg, DAG.getMachineFunction(), ArgLocs,
                 *DAG.getContext());

  // Reserve space for the linkage area on the stack.
  unsigned LinkageSize = Subtarget.getFrameLowering()->getLinkageSize();
  CCInfo.AllocateStack(Size: LinkageSize, Alignment: PtrAlign);
  CCInfo.AnalyzeFormalArguments(Ins, Fn: CC_PPC32_SVR4);

  for (unsigned i = 0, e = ArgLocs.size(); i != e; ++i) {
    CCValAssign &VA = ArgLocs[i];

    // Arguments stored in registers.
    if (VA.isRegLoc()) {
      const TargetRegisterClass *RC;
      EVT ValVT = VA.getValVT();

      // Pick the register class for the value type, taking subtarget
      // features (P8 vector, SPE, VSX) into account.
      switch (ValVT.getSimpleVT().SimpleTy) {
        default:
          llvm_unreachable("ValVT not supported by formal arguments Lowering");
        case MVT::i1:
        case MVT::i32:
          RC = &PPC::GPRCRegClass;
          break;
        case MVT::f32:
          if (Subtarget.hasP8Vector())
            RC = &PPC::VSSRCRegClass;
          else if (Subtarget.hasSPE())
            RC = &PPC::GPRCRegClass;
          else
            RC = &PPC::F4RCRegClass;
          break;
        case MVT::f64:
          if (Subtarget.hasVSX())
            RC = &PPC::VSFRCRegClass;
          else if (Subtarget.hasSPE())
            // SPE passes doubles in GPR pairs.
            RC = &PPC::GPRCRegClass;
          else
            RC = &PPC::F8RCRegClass;
          break;
        case MVT::v16i8:
        case MVT::v8i16:
        case MVT::v4i32:
          RC = &PPC::VRRCRegClass;
          break;
        case MVT::v4f32:
          RC = &PPC::VRRCRegClass;
          break;
        case MVT::v2f64:
        case MVT::v2i64:
          RC = &PPC::VRRCRegClass;
          break;
      }

      SDValue ArgValue;
      // Transform the arguments stored in physical registers into
      // virtual ones.
      if (VA.getLocVT() == MVT::f64 && Subtarget.hasSPE()) {
        // An SPE double arrives split across two consecutive GPR locations;
        // copy both halves and reassemble them with BUILD_SPE64.
        assert(i + 1 < e && "No second half of double precision argument");
        Register RegLo = MF.addLiveIn(PReg: VA.getLocReg(), RC);
        Register RegHi = MF.addLiveIn(PReg: ArgLocs[++i].getLocReg(), RC);
        SDValue ArgValueLo = DAG.getCopyFromReg(Chain, dl, Reg: RegLo, VT: MVT::i32);
        SDValue ArgValueHi = DAG.getCopyFromReg(Chain, dl, Reg: RegHi, VT: MVT::i32);
        if (!Subtarget.isLittleEndian())
          std::swap (a&: ArgValueLo, b&: ArgValueHi);
        ArgValue = DAG.getNode(Opcode: PPCISD::BUILD_SPE64, DL: dl, VT: MVT::f64, N1: ArgValueLo,
                               N2: ArgValueHi);
      } else {
        // i1 values are copied as i32 and then truncated back down.
        Register Reg = MF.addLiveIn(PReg: VA.getLocReg(), RC);
        ArgValue = DAG.getCopyFromReg(Chain, dl, Reg,
                                      VT: ValVT == MVT::i1 ? MVT::i32 : ValVT);
        if (ValVT == MVT::i1)
          ArgValue = DAG.getNode(Opcode: ISD::TRUNCATE, DL: dl, VT: MVT::i1, Operand: ArgValue);
      }

      InVals.push_back(Elt: ArgValue);
    } else {
      // Argument stored in memory.
      assert(VA.isMemLoc());

      // Get the extended size of the argument type in stack
      unsigned ArgSize = VA.getLocVT().getStoreSize();
      // Get the actual size of the argument type
      unsigned ObjSize = VA.getValVT().getStoreSize();
      unsigned ArgOffset = VA.getLocMemOffset();
      // Stack objects in PPC32 are right justified.
      ArgOffset += ArgSize - ObjSize;
      int FI = MFI.CreateFixedObject(Size: ArgSize, SPOffset: ArgOffset, IsImmutable: isImmutable);

      // Create load nodes to retrieve arguments from the stack.
      SDValue FIN = DAG.getFrameIndex(FI, VT: PtrVT);
      InVals.push_back(
          Elt: DAG.getLoad(VT: VA.getValVT(), dl, Chain, Ptr: FIN, PtrInfo: MachinePointerInfo()));
    }
  }

  // Assign locations to all of the incoming aggregate by value arguments.
  // Aggregates passed by value are stored in the local variable space of the
  // caller's stack frame, right above the parameter list area.
  SmallVector<CCValAssign, 16> ByValArgLocs;
  CCState CCByValInfo(CallConv, isVarArg, DAG.getMachineFunction(),
                      ByValArgLocs, *DAG.getContext());

  // Reserve stack space for the allocations in CCInfo.
  CCByValInfo.AllocateStack(Size: CCInfo.getStackSize(), Alignment: PtrAlign);

  CCByValInfo.AnalyzeFormalArguments(Ins, Fn: CC_PPC32_SVR4_ByVal);

  // Area that is at least reserved in the caller of this function.
  unsigned MinReservedArea = CCByValInfo.getStackSize();
  MinReservedArea = std::max(a: MinReservedArea, b: LinkageSize);

  // Set the size that is at least reserved in caller of this function. Tail
  // call optimized function's reserved stack space needs to be aligned so that
  // taking the difference between two stack areas will result in an aligned
  // stack.
  MinReservedArea =
      EnsureStackAlignment(Lowering: Subtarget.getFrameLowering(), NumBytes: MinReservedArea);
  FuncInfo->setMinReservedArea(MinReservedArea);

  SmallVector<SDValue, 8> MemOps;

  // If the function takes variable number of arguments, make a frame index for
  // the start of the first vararg value... for expansion of llvm.va_start.
  if (isVarArg) {
    static const MCPhysReg GPArgRegs[] = {
      PPC::R3, PPC::R4, PPC::R5, PPC::R6,
      PPC::R7, PPC::R8, PPC::R9, PPC::R10,
    };
    const unsigned NumGPArgRegs = std::size(GPArgRegs);

    static const MCPhysReg FPArgRegs[] = {
      PPC::F1, PPC::F2, PPC::F3, PPC::F4, PPC::F5, PPC::F6, PPC::F7,
      PPC::F8
    };
    unsigned NumFPArgRegs = std::size(FPArgRegs);

    // With soft-float or SPE there are no separate FP argument registers to
    // save.
    if (useSoftFloat() || hasSPE())
      NumFPArgRegs = 0;

    FuncInfo->setVarArgsNumGPR(CCInfo.getFirstUnallocated(Regs: GPArgRegs));
    FuncInfo->setVarArgsNumFPR(CCInfo.getFirstUnallocated(Regs: FPArgRegs));

    // Make room for NumGPArgRegs and NumFPArgRegs.
    int Depth = NumGPArgRegs * PtrVT.getSizeInBits()/8 +
                NumFPArgRegs * MVT(MVT::f64).getSizeInBits()/8;

    FuncInfo->setVarArgsStackOffset(MFI.CreateFixedObject(
        Size: PtrVT.getSizeInBits() / 8, SPOffset: CCInfo.getStackSize(), IsImmutable: true));

    FuncInfo->setVarArgsFrameIndex(
        MFI.CreateStackObject(Size: Depth, Alignment: Align(8), isSpillSlot: false));
    SDValue FIN = DAG.getFrameIndex(FI: FuncInfo->getVarArgsFrameIndex(), VT: PtrVT);

    // The fixed integer arguments of a variadic function are stored to the
    // VarArgsFrameIndex on the stack so that they may be loaded by
    // dereferencing the result of va_next.
    for (MCPhysReg GPArgReg : GPArgRegs) {
      // Get an existing live-in vreg, or add a new one.
      Register VReg = MF.getRegInfo().getLiveInVirtReg(PReg: GPArgReg);
      if (!VReg)
        VReg = MF.addLiveIn(PReg: GPArgReg, RC: &PPC::GPRCRegClass);

      SDValue Val = DAG.getCopyFromReg(Chain, dl, Reg: VReg, VT: PtrVT);
      SDValue Store =
          DAG.getStore(Chain: Val.getValue(R: 1), dl, Val, Ptr: FIN, PtrInfo: MachinePointerInfo());
      MemOps.push_back(Elt: Store);
      // Increment the address by four for the next argument to store
      SDValue PtrOff = DAG.getConstant(Val: PtrVT.getSizeInBits()/8, DL: dl, VT: PtrVT);
      FIN = DAG.getNode(Opcode: ISD::ADD, DL: dl, VT: PtrOff.getValueType(), N1: FIN, N2: PtrOff);
    }

    // FIXME 32-bit SVR4: We only need to save FP argument registers if CR bit 6
    // is set.
    // The double arguments are stored to the VarArgsFrameIndex
    // on the stack.
    for (unsigned FPRIndex = 0; FPRIndex != NumFPArgRegs; ++FPRIndex) {
      // Get an existing live-in vreg, or add a new one.
      Register VReg = MF.getRegInfo().getLiveInVirtReg(PReg: FPArgRegs[FPRIndex]);
      if (!VReg)
        VReg = MF.addLiveIn(PReg: FPArgRegs[FPRIndex], RC: &PPC::F8RCRegClass);

      SDValue Val = DAG.getCopyFromReg(Chain, dl, Reg: VReg, VT: MVT::f64);
      SDValue Store =
          DAG.getStore(Chain: Val.getValue(R: 1), dl, Val, Ptr: FIN, PtrInfo: MachinePointerInfo());
      MemOps.push_back(Elt: Store);
      // Increment the address by eight for the next argument to store
      SDValue PtrOff = DAG.getConstant(Val: MVT(MVT::f64).getSizeInBits()/8, DL: dl,
                                       VT: PtrVT);
      FIN = DAG.getNode(Opcode: ISD::ADD, DL: dl, VT: PtrOff.getValueType(), N1: FIN, N2: PtrOff);
    }
  }

  // Glue all the register-save stores together so later code is chained
  // after them.
  if (!MemOps.empty())
    Chain = DAG.getNode(Opcode: ISD::TokenFactor, DL: dl, VT: MVT::Other, Ops: MemOps);

  return Chain;
}
4377
4378// PPC64 passes i8, i16, and i32 values in i64 registers. Promote
4379// value to MVT::i64 and then truncate to the correct register size.
4380SDValue PPCTargetLowering::extendArgForPPC64(ISD::ArgFlagsTy Flags,
4381 EVT ObjectVT, SelectionDAG &DAG,
4382 SDValue ArgVal,
4383 const SDLoc &dl) const {
4384 if (Flags.isSExt())
4385 ArgVal = DAG.getNode(Opcode: ISD::AssertSext, DL: dl, VT: MVT::i64, N1: ArgVal,
4386 N2: DAG.getValueType(ObjectVT));
4387 else if (Flags.isZExt())
4388 ArgVal = DAG.getNode(Opcode: ISD::AssertZext, DL: dl, VT: MVT::i64, N1: ArgVal,
4389 N2: DAG.getValueType(ObjectVT));
4390
4391 return DAG.getNode(Opcode: ISD::TRUNCATE, DL: dl, VT: ObjectVT, Operand: ArgVal);
4392}
4393
/// Lower incoming formal arguments for the 64-bit ELF (v1/v2) SVR4 ABI.
/// A first pass over \p Ins decides whether the caller must have allocated
/// a parameter save area; the main loop then either copies each argument
/// out of its GPR/FPR/VR into a virtual register or loads it from the
/// caller's frame, with special handling for byval aggregates. For varargs
/// functions, the remaining integer argument registers are spilled so
/// va_arg can reach them.
SDValue PPCTargetLowering::LowerFormalArguments_64SVR4(
    SDValue Chain, CallingConv::ID CallConv, bool isVarArg,
    const SmallVectorImpl<ISD::InputArg> &Ins, const SDLoc &dl,
    SelectionDAG &DAG, SmallVectorImpl<SDValue> &InVals) const {
  // TODO: add description of PPC stack frame format, or at least some docs.
  //
  bool isELFv2ABI = Subtarget.isELFv2ABI();
  bool isLittleEndian = Subtarget.isLittleEndian();
  MachineFunction &MF = DAG.getMachineFunction();
  MachineFrameInfo &MFI = MF.getFrameInfo();
  PPCFunctionInfo *FuncInfo = MF.getInfo<PPCFunctionInfo>();

  assert(!(CallConv == CallingConv::Fast && isVarArg) &&
         "fastcc not supported on varargs functions");

  EVT PtrVT = getPointerTy(DL: MF.getDataLayout());
  // Potential tail calls could cause overwriting of argument stack slots.
  bool isImmutable = !(getTargetMachine().Options.GuaranteedTailCallOpt &&
                       (CallConv == CallingConv::Fast));
  unsigned PtrByteSize = 8;
  unsigned LinkageSize = Subtarget.getFrameLowering()->getLinkageSize();

  static const MCPhysReg GPR[] = {
    PPC::X3, PPC::X4, PPC::X5, PPC::X6,
    PPC::X7, PPC::X8, PPC::X9, PPC::X10,
  };
  static const MCPhysReg VR[] = {
    PPC::V2, PPC::V3, PPC::V4, PPC::V5, PPC::V6, PPC::V7, PPC::V8,
    PPC::V9, PPC::V10, PPC::V11, PPC::V12, PPC::V13
  };

  const unsigned Num_GPR_Regs = std::size(GPR);
  const unsigned Num_FPR_Regs = useSoftFloat() ? 0 : 13;
  const unsigned Num_VR_Regs = std::size(VR);

  // Do a first pass over the arguments to determine whether the ABI
  // guarantees that our caller has allocated the parameter save area
  // on its stack frame. In the ELFv1 ABI, this is always the case;
  // in the ELFv2 ABI, it is true if this is a vararg function or if
  // any parameter is located in a stack slot.

  bool HasParameterArea = !isELFv2ABI || isVarArg;
  unsigned ParamAreaSize = Num_GPR_Regs * PtrByteSize;
  unsigned NumBytes = LinkageSize;
  unsigned AvailableFPRs = Num_FPR_Regs;
  unsigned AvailableVRs = Num_VR_Regs;
  for (const ISD::InputArg &In : Ins) {
    if (In.Flags.isNest())
      continue;

    if (CalculateStackSlotUsed(ArgVT: In.VT, OrigVT: In.ArgVT, Flags: In.Flags, PtrByteSize,
                               LinkageSize, ParamAreaSize, ArgOffset&: NumBytes,
                               AvailableFPRs, AvailableVRs))
      HasParameterArea = true;
  }

  // Add DAG nodes to load the arguments or copy them out of registers.  On
  // entry to a function on PPC, the arguments start after the linkage area,
  // although the first ones are often in registers.

  unsigned ArgOffset = LinkageSize;
  unsigned GPR_idx = 0, FPR_idx = 0, VR_idx = 0;
  SmallVector<SDValue, 8> MemOps;
  Function::const_arg_iterator FuncArg = MF.getFunction().arg_begin();
  unsigned CurArgIdx = 0;
  for (unsigned ArgNo = 0, e = Ins.size(); ArgNo != e; ++ArgNo) {
    SDValue ArgVal;
    bool needsLoad = false;
    EVT ObjectVT = Ins[ArgNo].VT;
    EVT OrigVT = Ins[ArgNo].ArgVT;
    unsigned ObjSize = ObjectVT.getStoreSize();
    unsigned ArgSize = ObjSize;
    ISD::ArgFlagsTy Flags = Ins[ArgNo].Flags;
    if (Ins[ArgNo].isOrigArg()) {
      // Keep FuncArg in sync with the original IR argument this InputArg
      // was split from.
      std::advance(i&: FuncArg, n: Ins[ArgNo].getOrigArgIndex() - CurArgIdx);
      CurArgIdx = Ins[ArgNo].getOrigArgIndex();
    }
    // We re-align the argument offset for each argument, except when using the
    // fast calling convention, when we need to make sure we do that only when
    // we'll actually use a stack slot.
    unsigned CurArgOffset;
    Align Alignment;
    auto ComputeArgOffset = [&]() {
      /* Respect alignment of argument on the stack.  */
      Alignment =
          CalculateStackSlotAlignment(ArgVT: ObjectVT, OrigVT, Flags, PtrByteSize);
      ArgOffset = alignTo(Size: ArgOffset, A: Alignment);
      CurArgOffset = ArgOffset;
    };

    if (CallConv != CallingConv::Fast) {
      ComputeArgOffset();

      /* Compute GPR index associated with argument offset.  */
      GPR_idx = (ArgOffset - LinkageSize) / PtrByteSize;
      GPR_idx = std::min(a: GPR_idx, b: Num_GPR_Regs);
    }

    // FIXME the codegen can be much improved in some cases.
    // We do not have to keep everything in memory.
    if (Flags.isByVal()) {
      assert(Ins[ArgNo].isOrigArg() && "Byval arguments cannot be implicit");

      if (CallConv == CallingConv::Fast)
        ComputeArgOffset();

      // ObjSize is the true size, ArgSize rounded up to multiple of registers.
      ObjSize = Flags.getByValSize();
      ArgSize = ((ObjSize + PtrByteSize - 1)/PtrByteSize) * PtrByteSize;
      // Empty aggregate parameters do not take up registers.  Examples:
      //   struct { } a;
      //   union  { } b;
      //   int c[0];
      // etc.  However, we have to provide a place-holder in InVals, so
      // pretend we have an 8-byte item at the current address for that
      // purpose.
      if (!ObjSize) {
        int FI = MFI.CreateFixedObject(Size: PtrByteSize, SPOffset: ArgOffset, IsImmutable: true);
        SDValue FIN = DAG.getFrameIndex(FI, VT: PtrVT);
        InVals.push_back(Elt: FIN);
        continue;
      }

      // Create a stack object covering all stack doublewords occupied
      // by the argument.  If the argument is (fully or partially) on
      // the stack, or if the argument is fully in registers but the
      // caller has allocated the parameter save anyway, we can refer
      // directly to the caller's stack frame.  Otherwise, create a
      // local copy in our own frame.
      int FI;
      if (HasParameterArea ||
          ArgSize + ArgOffset > LinkageSize + Num_GPR_Regs * PtrByteSize)
        FI = MFI.CreateFixedObject(Size: ArgSize, SPOffset: ArgOffset, IsImmutable: false, isAliased: true);
      else
        FI = MFI.CreateStackObject(Size: ArgSize, Alignment, isSpillSlot: false);
      SDValue FIN = DAG.getFrameIndex(FI, VT: PtrVT);

      // Handle aggregates smaller than 8 bytes.
      if (ObjSize < PtrByteSize) {
        // The value of the object is its address, which differs from the
        // address of the enclosing doubleword on big-endian systems.
        SDValue Arg = FIN;
        if (!isLittleEndian) {
          SDValue ArgOff = DAG.getConstant(Val: PtrByteSize - ObjSize, DL: dl, VT: PtrVT);
          Arg = DAG.getNode(Opcode: ISD::ADD, DL: dl, VT: ArgOff.getValueType(), N1: Arg, N2: ArgOff);
        }
        InVals.push_back(Elt: Arg);

        if (GPR_idx != Num_GPR_Regs) {
          // Spill the part of the aggregate that arrived in a GPR into the
          // stack object with a store of exactly ObjSize bytes.
          Register VReg = MF.addLiveIn(PReg: GPR[GPR_idx++], RC: &PPC::G8RCRegClass);
          FuncInfo->addLiveInAttr(VReg, Flags);
          SDValue Val = DAG.getCopyFromReg(Chain, dl, Reg: VReg, VT: PtrVT);
          EVT ObjType = EVT::getIntegerVT(Context&: *DAG.getContext(), BitWidth: ObjSize * 8);
          SDValue Store =
              DAG.getTruncStore(Chain: Val.getValue(R: 1), dl, Val, Ptr: Arg,
                                PtrInfo: MachinePointerInfo(&*FuncArg), SVT: ObjType);
          MemOps.push_back(Elt: Store);
        }
        // Whether we copied from a register or not, advance the offset
        // into the parameter save area by a full doubleword.
        ArgOffset += PtrByteSize;
        continue;
      }

      // The value of the object is its address, which is the address of
      // its first stack doubleword.
      InVals.push_back(Elt: FIN);

      // Store whatever pieces of the object are in registers to memory.
      for (unsigned j = 0; j < ArgSize; j += PtrByteSize) {
        if (GPR_idx == Num_GPR_Regs)
          break;

        Register VReg = MF.addLiveIn(PReg: GPR[GPR_idx], RC: &PPC::G8RCRegClass);
        FuncInfo->addLiveInAttr(VReg, Flags);
        SDValue Val = DAG.getCopyFromReg(Chain, dl, Reg: VReg, VT: PtrVT);
        SDValue Addr = FIN;
        if (j) {
          SDValue Off = DAG.getConstant(Val: j, DL: dl, VT: PtrVT);
          Addr = DAG.getNode(Opcode: ISD::ADD, DL: dl, VT: Off.getValueType(), N1: Addr, N2: Off);
        }
        // The final piece may cover fewer than 8 bytes; store only what
        // the object actually occupies.
        unsigned StoreSizeInBits = std::min(a: PtrByteSize, b: (ObjSize - j)) * 8;
        EVT ObjType = EVT::getIntegerVT(Context&: *DAG.getContext(), BitWidth: StoreSizeInBits);
        SDValue Store =
            DAG.getTruncStore(Chain: Val.getValue(R: 1), dl, Val, Ptr: Addr,
                              PtrInfo: MachinePointerInfo(&*FuncArg, j), SVT: ObjType);
        MemOps.push_back(Elt: Store);
        ++GPR_idx;
      }
      ArgOffset += ArgSize;
      continue;
    }

    switch (ObjectVT.getSimpleVT().SimpleTy) {
    default: llvm_unreachable("Unhandled argument type!");
    case MVT::i1:
    case MVT::i32:
    case MVT::i64:
      if (Flags.isNest()) {
        // The 'nest' parameter, if any, is passed in R11.
        Register VReg = MF.addLiveIn(PReg: PPC::X11, RC: &PPC::G8RCRegClass);
        ArgVal = DAG.getCopyFromReg(Chain, dl, Reg: VReg, VT: MVT::i64);

        if (ObjectVT == MVT::i32 || ObjectVT == MVT::i1)
          ArgVal = extendArgForPPC64(Flags, ObjectVT, DAG, ArgVal, dl);

        break;
      }

      // These can be scalar arguments or elements of an integer array type
      // passed directly.  Clang may use those instead of "byval" aggregate
      // types to avoid forcing arguments to memory unnecessarily.
      if (GPR_idx != Num_GPR_Regs) {
        Register VReg = MF.addLiveIn(PReg: GPR[GPR_idx++], RC: &PPC::G8RCRegClass);
        FuncInfo->addLiveInAttr(VReg, Flags);
        ArgVal = DAG.getCopyFromReg(Chain, dl, Reg: VReg, VT: MVT::i64);

        if (ObjectVT == MVT::i32 || ObjectVT == MVT::i1)
          // PPC64 passes i8, i16, and i32 values in i64 registers. Promote
          // value to MVT::i64 and then truncate to the correct register size.
          ArgVal = extendArgForPPC64(Flags, ObjectVT, DAG, ArgVal, dl);
      } else {
        if (CallConv == CallingConv::Fast)
          ComputeArgOffset();

        needsLoad = true;
        ArgSize = PtrByteSize;
      }
      if (CallConv != CallingConv::Fast || needsLoad)
        ArgOffset += 8;
      break;

    case MVT::f32:
    case MVT::f64:
      // These can be scalar arguments or elements of a float array type
      // passed directly.  The latter are used to implement ELFv2 homogenous
      // float aggregates.
      if (FPR_idx != Num_FPR_Regs) {
        unsigned VReg;

        if (ObjectVT == MVT::f32)
          VReg = MF.addLiveIn(PReg: FPR[FPR_idx],
                              RC: Subtarget.hasP8Vector()
                                  ? &PPC::VSSRCRegClass
                                  : &PPC::F4RCRegClass);
        else
          VReg = MF.addLiveIn(PReg: FPR[FPR_idx], RC: Subtarget.hasVSX()
                                                ? &PPC::VSFRCRegClass
                                                : &PPC::F8RCRegClass);

        ArgVal = DAG.getCopyFromReg(Chain, dl, Reg: VReg, VT: ObjectVT);
        ++FPR_idx;
      } else if (GPR_idx != Num_GPR_Regs && CallConv != CallingConv::Fast) {
        // FIXME: We may want to re-enable this for CallingConv::Fast on the P8
        // once we support fp <-> gpr moves.

        // This can only ever happen in the presence of f32 array types,
        // since otherwise we never run out of FPRs before running out
        // of GPRs.
        Register VReg = MF.addLiveIn(PReg: GPR[GPR_idx++], RC: &PPC::G8RCRegClass);
        FuncInfo->addLiveInAttr(VReg, Flags);
        ArgVal = DAG.getCopyFromReg(Chain, dl, Reg: VReg, VT: MVT::i64);

        if (ObjectVT == MVT::f32) {
          // Pick the half of the doubleword that actually holds the f32;
          // which half depends on endianness and the slot offset.
          if ((ArgOffset % PtrByteSize) == (isLittleEndian ? 4 : 0))
            ArgVal = DAG.getNode(Opcode: ISD::SRL, DL: dl, VT: MVT::i64, N1: ArgVal,
                                 N2: DAG.getConstant(Val: 32, DL: dl, VT: MVT::i32));
          ArgVal = DAG.getNode(Opcode: ISD::TRUNCATE, DL: dl, VT: MVT::i32, Operand: ArgVal);
        }

        ArgVal = DAG.getNode(Opcode: ISD::BITCAST, DL: dl, VT: ObjectVT, Operand: ArgVal);
      } else {
        if (CallConv == CallingConv::Fast)
          ComputeArgOffset();

        needsLoad = true;
      }

      // When passing an array of floats, the array occupies consecutive
      // space in the argument area; only round up to the next doubleword
      // at the end of the array.  Otherwise, each float takes 8 bytes.
      if (CallConv != CallingConv::Fast || needsLoad) {
        ArgSize = Flags.isInConsecutiveRegs() ? ObjSize : PtrByteSize;
        ArgOffset += ArgSize;
        if (Flags.isInConsecutiveRegsLast())
          ArgOffset = ((ArgOffset + PtrByteSize - 1)/PtrByteSize) * PtrByteSize;
      }
      break;
    case MVT::v4f32:
    case MVT::v4i32:
    case MVT::v8i16:
    case MVT::v16i8:
    case MVT::v2f64:
    case MVT::v2i64:
    case MVT::v1i128:
    case MVT::f128:
      // These can be scalar arguments or elements of a vector array type
      // passed directly.  The latter are used to implement ELFv2 homogenous
      // vector aggregates.
      if (VR_idx != Num_VR_Regs) {
        Register VReg = MF.addLiveIn(PReg: VR[VR_idx], RC: &PPC::VRRCRegClass);
        ArgVal = DAG.getCopyFromReg(Chain, dl, Reg: VReg, VT: ObjectVT);
        ++VR_idx;
      } else {
        if (CallConv == CallingConv::Fast)
          ComputeArgOffset();
        needsLoad = true;
      }
      if (CallConv != CallingConv::Fast || needsLoad)
        ArgOffset += 16;
      break;
    }

    // We need to load the argument to a virtual register if we determined
    // above that we ran out of physical registers of the appropriate type.
    if (needsLoad) {
      // On big-endian targets, small objects are right-justified within
      // their stack slot.
      if (ObjSize < ArgSize && !isLittleEndian)
        CurArgOffset += ArgSize - ObjSize;
      int FI = MFI.CreateFixedObject(Size: ObjSize, SPOffset: CurArgOffset, IsImmutable: isImmutable);
      SDValue FIN = DAG.getFrameIndex(FI, VT: PtrVT);
      ArgVal = DAG.getLoad(VT: ObjectVT, dl, Chain, Ptr: FIN, PtrInfo: MachinePointerInfo());
    }

    InVals.push_back(Elt: ArgVal);
  }

  // Area that is at least reserved in the caller of this function.
  unsigned MinReservedArea;
  if (HasParameterArea)
    MinReservedArea = std::max(a: ArgOffset, b: LinkageSize + 8 * PtrByteSize);
  else
    MinReservedArea = LinkageSize;

  // Set the size that is at least reserved in caller of this function. Tail
  // call optimized functions' reserved stack space needs to be aligned so that
  // taking the difference between two stack areas will result in an aligned
  // stack.
  MinReservedArea =
      EnsureStackAlignment(Lowering: Subtarget.getFrameLowering(), NumBytes: MinReservedArea);
  FuncInfo->setMinReservedArea(MinReservedArea);

  // If the function takes variable number of arguments, make a frame index for
  // the start of the first vararg value... for expansion of llvm.va_start.
  // On ELFv2ABI spec, it writes:
  // C programs that are intended to be *portable* across different compilers
  // and architectures must use the header file <stdarg.h> to deal with variable
  // argument lists.
  if (isVarArg && MFI.hasVAStart()) {
    int Depth = ArgOffset;

    FuncInfo->setVarArgsFrameIndex(
        MFI.CreateFixedObject(Size: PtrByteSize, SPOffset: Depth, IsImmutable: true));
    SDValue FIN = DAG.getFrameIndex(FI: FuncInfo->getVarArgsFrameIndex(), VT: PtrVT);

    // If this function is vararg, store any remaining integer argument regs
    // to their spots on the stack so that they may be loaded by dereferencing
    // the result of va_next.
    for (GPR_idx = (ArgOffset - LinkageSize) / PtrByteSize;
         GPR_idx < Num_GPR_Regs; ++GPR_idx) {
      Register VReg = MF.addLiveIn(PReg: GPR[GPR_idx], RC: &PPC::G8RCRegClass);
      SDValue Val = DAG.getCopyFromReg(Chain, dl, Reg: VReg, VT: PtrVT);
      SDValue Store =
          DAG.getStore(Chain: Val.getValue(R: 1), dl, Val, Ptr: FIN, PtrInfo: MachinePointerInfo());
      MemOps.push_back(Elt: Store);
      // Increment the address by four for the next argument to store
      SDValue PtrOff = DAG.getConstant(Val: PtrByteSize, DL: dl, VT: PtrVT);
      FIN = DAG.getNode(Opcode: ISD::ADD, DL: dl, VT: PtrOff.getValueType(), N1: FIN, N2: PtrOff);
    }
  }

  // Chain all the pending register-spill stores together.
  if (!MemOps.empty())
    Chain = DAG.getNode(Opcode: ISD::TokenFactor, DL: dl, VT: MVT::Other, Ops: MemOps);

  return Chain;
}
4769
4770/// CalculateTailCallSPDiff - Get the amount the stack pointer has to be
4771/// adjusted to accommodate the arguments for the tailcall.
4772static int CalculateTailCallSPDiff(SelectionDAG& DAG, bool isTailCall,
4773 unsigned ParamSize) {
4774
4775 if (!isTailCall) return 0;
4776
4777 PPCFunctionInfo *FI = DAG.getMachineFunction().getInfo<PPCFunctionInfo>();
4778 unsigned CallerMinReservedArea = FI->getMinReservedArea();
4779 int SPDiff = (int)CallerMinReservedArea - (int)ParamSize;
4780 // Remember only if the new adjustment is bigger.
4781 if (SPDiff < FI->getTailCallSPDelta())
4782 FI->setTailCallSPDelta(SPDiff);
4783
4784 return SPDiff;
4785}
4786
4787static bool isFunctionGlobalAddress(const GlobalValue *CalleeGV);
4788
4789static bool callsShareTOCBase(const Function *Caller,
4790 const GlobalValue *CalleeGV,
4791 const TargetMachine &TM) {
4792 // It does not make sense to call callsShareTOCBase() with a caller that
4793 // is PC Relative since PC Relative callers do not have a TOC.
4794#ifndef NDEBUG
4795 const PPCSubtarget *STICaller = &TM.getSubtarget<PPCSubtarget>(*Caller);
4796 assert(!STICaller->isUsingPCRelativeCalls() &&
4797 "PC Relative callers do not have a TOC and cannot share a TOC Base");
4798#endif
4799
4800 // Callee is either a GlobalAddress or an ExternalSymbol. ExternalSymbols
4801 // don't have enough information to determine if the caller and callee share
4802 // the same TOC base, so we have to pessimistically assume they don't for
4803 // correctness.
4804 if (!CalleeGV)
4805 return false;
4806
4807 // If the callee is preemptable, then the static linker will use a plt-stub
4808 // which saves the toc to the stack, and needs a nop after the call
4809 // instruction to convert to a toc-restore.
4810 if (!TM.shouldAssumeDSOLocal(GV: CalleeGV))
4811 return false;
4812
4813 // Functions with PC Relative enabled may clobber the TOC in the same DSO.
4814 // We may need a TOC restore in the situation where the caller requires a
4815 // valid TOC but the callee is PC Relative and does not.
4816 const Function *F = dyn_cast<Function>(Val: CalleeGV);
4817 const GlobalAlias *Alias = dyn_cast<GlobalAlias>(Val: CalleeGV);
4818
4819 // If we have an Alias we can try to get the function from there.
4820 if (Alias) {
4821 const GlobalObject *GlobalObj = Alias->getAliaseeObject();
4822 F = dyn_cast<Function>(Val: GlobalObj);
4823 }
4824
4825 // If we still have no valid function pointer we do not have enough
4826 // information to determine if the callee uses PC Relative calls so we must
4827 // assume that it does.
4828 if (!F)
4829 return false;
4830
4831 // If the callee uses PC Relative we cannot guarantee that the callee won't
4832 // clobber the TOC of the caller and so we must assume that the two
4833 // functions do not share a TOC base.
4834 const PPCSubtarget *STICallee = &TM.getSubtarget<PPCSubtarget>(F: *F);
4835 if (STICallee->isUsingPCRelativeCalls())
4836 return false;
4837
4838 // If the GV is not a strong definition then we need to assume it can be
4839 // replaced by another function at link time. The function that replaces
4840 // it may not share the same TOC as the caller since the callee may be
4841 // replaced by a PC Relative version of the same function.
4842 if (!CalleeGV->isStrongDefinitionForLinker())
4843 return false;
4844
4845 // The medium and large code models are expected to provide a sufficiently
4846 // large TOC to provide all data addressing needs of a module with a
4847 // single TOC.
4848 if (CodeModel::Medium == TM.getCodeModel() ||
4849 CodeModel::Large == TM.getCodeModel())
4850 return true;
4851
4852 // Any explicitly-specified sections and section prefixes must also match.
4853 // Also, if we're using -ffunction-sections, then each function is always in
4854 // a different section (the same is true for COMDAT functions).
4855 if (TM.getFunctionSections() || CalleeGV->hasComdat() ||
4856 Caller->hasComdat() || CalleeGV->getSection() != Caller->getSection())
4857 return false;
4858 if (const auto *F = dyn_cast<Function>(Val: CalleeGV)) {
4859 if (F->getSectionPrefix() != Caller->getSectionPrefix())
4860 return false;
4861 }
4862
4863 return true;
4864}
4865
4866static bool
4867needStackSlotPassParameters(const PPCSubtarget &Subtarget,
4868 const SmallVectorImpl<ISD::OutputArg> &Outs) {
4869 assert(Subtarget.is64BitELFABI());
4870
4871 const unsigned PtrByteSize = 8;
4872 const unsigned LinkageSize = Subtarget.getFrameLowering()->getLinkageSize();
4873
4874 static const MCPhysReg GPR[] = {
4875 PPC::X3, PPC::X4, PPC::X5, PPC::X6,
4876 PPC::X7, PPC::X8, PPC::X9, PPC::X10,
4877 };
4878 static const MCPhysReg VR[] = {
4879 PPC::V2, PPC::V3, PPC::V4, PPC::V5, PPC::V6, PPC::V7, PPC::V8,
4880 PPC::V9, PPC::V10, PPC::V11, PPC::V12, PPC::V13
4881 };
4882
4883 const unsigned NumGPRs = std::size(GPR);
4884 const unsigned NumFPRs = 13;
4885 const unsigned NumVRs = std::size(VR);
4886 const unsigned ParamAreaSize = NumGPRs * PtrByteSize;
4887
4888 unsigned NumBytes = LinkageSize;
4889 unsigned AvailableFPRs = NumFPRs;
4890 unsigned AvailableVRs = NumVRs;
4891
4892 for (const ISD::OutputArg& Param : Outs) {
4893 if (Param.Flags.isNest()) continue;
4894
4895 if (CalculateStackSlotUsed(ArgVT: Param.VT, OrigVT: Param.ArgVT, Flags: Param.Flags, PtrByteSize,
4896 LinkageSize, ParamAreaSize, ArgOffset&: NumBytes,
4897 AvailableFPRs, AvailableVRs))
4898 return true;
4899 }
4900 return false;
4901}
4902
4903static bool hasSameArgumentList(const Function *CallerFn, const CallBase &CB) {
4904 if (CB.arg_size() != CallerFn->arg_size())
4905 return false;
4906
4907 auto CalleeArgIter = CB.arg_begin();
4908 auto CalleeArgEnd = CB.arg_end();
4909 Function::const_arg_iterator CallerArgIter = CallerFn->arg_begin();
4910
4911 for (; CalleeArgIter != CalleeArgEnd; ++CalleeArgIter, ++CallerArgIter) {
4912 const Value* CalleeArg = *CalleeArgIter;
4913 const Value* CallerArg = &(*CallerArgIter);
4914 if (CalleeArg == CallerArg)
4915 continue;
4916
4917 // e.g. @caller([4 x i64] %a, [4 x i64] %b) {
4918 // tail call @callee([4 x i64] undef, [4 x i64] %b)
4919 // }
4920 // 1st argument of callee is undef and has the same type as caller.
4921 if (CalleeArg->getType() == CallerArg->getType() &&
4922 isa<UndefValue>(Val: CalleeArg))
4923 continue;
4924
4925 return false;
4926 }
4927
4928 return true;
4929}
4930
4931// Returns true if TCO is possible between the callers and callees
4932// calling conventions.
4933static bool
4934areCallingConvEligibleForTCO_64SVR4(CallingConv::ID CallerCC,
4935 CallingConv::ID CalleeCC) {
4936 // Tail calls are possible with fastcc and ccc.
4937 auto isTailCallableCC = [] (CallingConv::ID CC){
4938 return CC == CallingConv::C || CC == CallingConv::Fast;
4939 };
4940 if (!isTailCallableCC(CallerCC) || !isTailCallableCC(CalleeCC))
4941 return false;
4942
4943 // We can safely tail call both fastcc and ccc callees from a c calling
4944 // convention caller. If the caller is fastcc, we may have less stack space
4945 // than a non-fastcc caller with the same signature so disable tail-calls in
4946 // that case.
4947 return CallerCC == CallingConv::C || CallerCC == CalleeCC;
4948}
4949
/// Decide whether this call qualifies for tail-call optimization (TCO) or
/// sibling-call optimization (SCO) under the 64-bit ELF ABI. Each early
/// return below rejects a specific ABI constraint violation; the order of
/// checks matters because later checks assume the earlier ones passed.
bool PPCTargetLowering::IsEligibleForTailCallOptimization_64SVR4(
    const GlobalValue *CalleeGV, CallingConv::ID CalleeCC,
    CallingConv::ID CallerCC, const CallBase *CB, bool isVarArg,
    const SmallVectorImpl<ISD::OutputArg> &Outs,
    const SmallVectorImpl<ISD::InputArg> &Ins, const Function *CallerFunc,
    bool isCalleeExternalSymbol) const {
  bool TailCallOpt = getTargetMachine().Options.GuaranteedTailCallOpt;

  // Honor the SCO-disabling flag, unless guaranteed TCO was requested.
  if (DisableSCO && !TailCallOpt) return false;

  // Variadic argument functions are not supported.
  if (isVarArg) return false;

  // Check that the calling conventions are compatible for tco.
  if (!areCallingConvEligibleForTCO_64SVR4(CallerCC, CalleeCC))
    return false;

  // Caller contains any byval parameter is not supported.
  if (any_of(Range: Ins, P: [](const ISD::InputArg &IA) { return IA.Flags.isByVal(); }))
    return false;

  // Callee contains any byval parameter is not supported, too.
  // Note: This is a quick work around, because in some cases, e.g.
  // caller's stack size > callee's stack size, we are still able to apply
  // sibling call optimization. For example, gcc is able to do SCO for caller1
  // in the following example, but not for caller2.
  //   struct test {
  //     long int a;
  //     char ary[56];
  //   } gTest;
  //   __attribute__((noinline)) int callee(struct test v, struct test *b) {
  //     b->a = v.a;
  //     return 0;
  //   }
  //   void caller1(struct test a, struct test c, struct test *b) {
  //     callee(gTest, b); }
  //   void caller2(struct test *b) { callee(gTest, b); }
  if (any_of(Range: Outs, P: [](const ISD::OutputArg& OA) { return OA.Flags.isByVal(); }))
    return false;

  // If callee and caller use different calling conventions, we cannot pass
  // parameters on stack since offsets for the parameter area may be different.
  if (CallerCC != CalleeCC && needStackSlotPassParameters(Subtarget, Outs))
    return false;

  // All variants of 64-bit ELF ABIs without PC-Relative addressing require that
  // the caller and callee share the same TOC for TCO/SCO. If the caller and
  // callee potentially have different TOC bases then we cannot tail call since
  // we need to restore the TOC pointer after the call.
  // ref: https://bugzilla.mozilla.org/show_bug.cgi?id=973977
  // We cannot guarantee this for indirect calls or calls to external functions.
  // When PC-Relative addressing is used, the concept of the TOC is no longer
  // applicable so this check is not required.
  // Check first for indirect calls.
  if (!Subtarget.isUsingPCRelativeCalls() &&
      !isFunctionGlobalAddress(CalleeGV) && !isCalleeExternalSymbol)
    return false;

  // Check if we share the TOC base.
  if (!Subtarget.isUsingPCRelativeCalls() &&
      !callsShareTOCBase(Caller: CallerFunc, CalleeGV, TM: getTargetMachine()))
    return false;

  // TCO allows altering callee ABI, so we don't have to check further.
  if (CalleeCC == CallingConv::Fast && TailCallOpt)
    return true;

  if (DisableSCO) return false;

  // If callee use the same argument list that caller is using, then we can
  // apply SCO on this case. If it is not, then we need to check if callee needs
  // stack for passing arguments.
  // PC Relative tail calls may not have a CallBase.
  // If there is no CallBase we cannot verify if we have the same argument
  // list so assume that we don't have the same argument list.
  if (CB && !hasSameArgumentList(CallerFn: CallerFunc, CB: *CB) &&
      needStackSlotPassParameters(Subtarget, Outs))
    return false;
  else if (!CB && needStackSlotPassParameters(Subtarget, Outs))
    return false;

  return true;
}
5033
5034/// IsEligibleForTailCallOptimization - Check whether the call is eligible
5035/// for tail call optimization. Targets which want to do tail call
5036/// optimization should implement this function.
5037bool PPCTargetLowering::IsEligibleForTailCallOptimization(
5038 const GlobalValue *CalleeGV, CallingConv::ID CalleeCC,
5039 CallingConv::ID CallerCC, bool isVarArg,
5040 const SmallVectorImpl<ISD::InputArg> &Ins) const {
5041 if (!getTargetMachine().Options.GuaranteedTailCallOpt)
5042 return false;
5043
5044 // Variable argument functions are not supported.
5045 if (isVarArg)
5046 return false;
5047
5048 if (CalleeCC == CallingConv::Fast && CallerCC == CalleeCC) {
5049 // Functions containing by val parameters are not supported.
5050 if (any_of(Range: Ins, P: [](const ISD::InputArg &IA) { return IA.Flags.isByVal(); }))
5051 return false;
5052
5053 // Non-PIC/GOT tail calls are supported.
5054 if (getTargetMachine().getRelocationModel() != Reloc::PIC_)
5055 return true;
5056
5057 // At the moment we can only do local tail calls (in same module, hidden
5058 // or protected) if we are generating PIC.
5059 if (CalleeGV)
5060 return CalleeGV->hasHiddenVisibility() ||
5061 CalleeGV->hasProtectedVisibility();
5062 }
5063
5064 return false;
5065}
5066
5067/// isCallCompatibleAddress - Return the immediate to use if the specified
5068/// 32-bit value is representable in the immediate field of a BxA instruction.
5069static SDNode *isBLACompatibleAddress(SDValue Op, SelectionDAG &DAG) {
5070 ConstantSDNode *C = dyn_cast<ConstantSDNode>(Val&: Op);
5071 if (!C) return nullptr;
5072
5073 int Addr = C->getZExtValue();
5074 if ((Addr & 3) != 0 || // Low 2 bits are implicitly zero.
5075 SignExtend32<26>(X: Addr) != Addr)
5076 return nullptr; // Top 6 bits have to be sext of immediate.
5077
5078 return DAG
5079 .getSignedConstant(
5080 Val: (int)C->getZExtValue() >> 2, DL: SDLoc(Op),
5081 VT: DAG.getTargetLoweringInfo().getPointerTy(DL: DAG.getDataLayout()))
5082 .getNode();
5083}
5084
namespace {

/// Bookkeeping for one outgoing argument that must be written to a stack
/// slot when lowering a tail call.
struct TailCallArgumentInfo {
  SDValue Arg;        // The argument value to store.
  SDValue FrameIdxOp; // Frame-index node addressing the destination slot.
  int FrameIdx = 0;   // The raw frame index of the destination slot.

  TailCallArgumentInfo() = default;
};

} // end anonymous namespace
5096
5097/// StoreTailCallArgumentsToStackSlot - Stores arguments to their stack slot.
5098static void StoreTailCallArgumentsToStackSlot(
5099 SelectionDAG &DAG, SDValue Chain,
5100 const SmallVectorImpl<TailCallArgumentInfo> &TailCallArgs,
5101 SmallVectorImpl<SDValue> &MemOpChains, const SDLoc &dl) {
5102 for (unsigned i = 0, e = TailCallArgs.size(); i != e; ++i) {
5103 SDValue Arg = TailCallArgs[i].Arg;
5104 SDValue FIN = TailCallArgs[i].FrameIdxOp;
5105 int FI = TailCallArgs[i].FrameIdx;
5106 // Store relative to framepointer.
5107 MemOpChains.push_back(Elt: DAG.getStore(
5108 Chain, dl, Val: Arg, Ptr: FIN,
5109 PtrInfo: MachinePointerInfo::getFixedStack(MF&: DAG.getMachineFunction(), FI)));
5110 }
5111}
5112
5113/// EmitTailCallStoreFPAndRetAddr - Move the frame pointer and return address to
5114/// the appropriate stack slot for the tail call optimized function call.
5115static SDValue EmitTailCallStoreFPAndRetAddr(SelectionDAG &DAG, SDValue Chain,
5116 SDValue OldRetAddr, SDValue OldFP,
5117 int SPDiff, const SDLoc &dl) {
5118 if (SPDiff) {
5119 // Calculate the new stack slot for the return address.
5120 MachineFunction &MF = DAG.getMachineFunction();
5121 const PPCSubtarget &Subtarget = MF.getSubtarget<PPCSubtarget>();
5122 const PPCFrameLowering *FL = Subtarget.getFrameLowering();
5123 int SlotSize = Subtarget.isPPC64() ? 8 : 4;
5124 int NewRetAddrLoc = SPDiff + FL->getReturnSaveOffset();
5125 int NewRetAddr = MF.getFrameInfo().CreateFixedObject(Size: SlotSize,
5126 SPOffset: NewRetAddrLoc, IsImmutable: true);
5127 SDValue NewRetAddrFrIdx =
5128 DAG.getFrameIndex(FI: NewRetAddr, VT: Subtarget.getScalarIntVT());
5129 Chain = DAG.getStore(Chain, dl, Val: OldRetAddr, Ptr: NewRetAddrFrIdx,
5130 PtrInfo: MachinePointerInfo::getFixedStack(MF, FI: NewRetAddr));
5131 }
5132 return Chain;
5133}
5134
5135/// CalculateTailCallArgDest - Remember Argument for later processing. Calculate
5136/// the position of the argument.
5137static void CalculateTailCallArgDest(
5138 SelectionDAG &DAG, MachineFunction &MF, bool IsPPC64, SDValue Arg,
5139 int SPDiff, unsigned ArgOffset,
5140 SmallVectorImpl<TailCallArgumentInfo> &TailCallArguments) {
5141 int Offset = ArgOffset + SPDiff;
5142 uint32_t OpSize = (Arg.getValueSizeInBits() + 7) / 8;
5143 int FI = MF.getFrameInfo().CreateFixedObject(Size: OpSize, SPOffset: Offset, IsImmutable: true);
5144 EVT VT = IsPPC64 ? MVT::i64 : MVT::i32;
5145 SDValue FIN = DAG.getFrameIndex(FI, VT);
5146 TailCallArgumentInfo Info;
5147 Info.Arg = Arg;
5148 Info.FrameIdxOp = FIN;
5149 Info.FrameIdx = FI;
5150 TailCallArguments.push_back(Elt: Info);
5151}
5152
/// EmitTCFPAndRetAddrLoad - Emit load from frame pointer and return address
/// stack slot. Returns the chain as result and the loaded frame pointers in
/// LROpOut/FPOpout. Used when tail calling.
SDValue PPCTargetLowering::EmitTailCallLoadFPAndRetAddr(
    SelectionDAG &DAG, int SPDiff, SDValue Chain, SDValue &LROpOut,
    SDValue &FPOpOut, const SDLoc &dl) const {
  // Only needed when the tail call actually resizes the stack.
  if (SPDiff) {
    // Load the LR and FP stack slot for later adjusting.
    LROpOut = getReturnAddrFrameIndex(DAG);
    LROpOut = DAG.getLoad(VT: Subtarget.getScalarIntVT(), dl, Chain, Ptr: LROpOut,
                          PtrInfo: MachinePointerInfo());
    // The load's output chain (value #1) becomes the new chain.
    Chain = SDValue(LROpOut.getNode(), 1);
    // NOTE(review): FPOpOut is never assigned here despite the doc comment
    // above; presumably the FP reload was dropped at some point -- confirm
    // callers do not read FPOpOut.
  }
  return Chain;
}
5168
/// CreateCopyOfByValArgument - Make a copy of an aggregate at address specified
/// by "Src" to address "Dst" of size "Size". Alignment information is
/// specified by the specific parameter attribute. The copy will be passed as
/// a byval function parameter.
/// Sometimes what we are copying is the end of a larger object, the part that
/// does not fit in registers.
static SDValue CreateCopyOfByValArgument(SDValue Src, SDValue Dst,
                                         SDValue Chain, ISD::ArgFlagsTy Flags,
                                         SelectionDAG &DAG, const SDLoc &dl) {
  // The copy length is the byval size recorded in the argument flags.
  SDValue SizeNode = DAG.getConstant(Val: Flags.getByValSize(), DL: dl, VT: MVT::i32);
  // Non-volatile, not always-inline memcpy; no call-site or pointer info is
  // available for the source/destination.
  return DAG.getMemcpy(
      Chain, dl, Dst, Src, Size: SizeNode, Alignment: Flags.getNonZeroByValAlign(), isVol: false, AlwaysInline: false,
      /*CI=*/nullptr, OverrideTailCall: std::nullopt, DstPtrInfo: MachinePointerInfo(), SrcPtrInfo: MachinePointerInfo());
}
5183
5184/// LowerMemOpCallTo - Store the argument to the stack or remember it in case of
5185/// tail calls.
5186static void LowerMemOpCallTo(
5187 SelectionDAG &DAG, MachineFunction &MF, SDValue Chain, SDValue Arg,
5188 SDValue PtrOff, int SPDiff, unsigned ArgOffset, bool isPPC64,
5189 bool isTailCall, bool isVector, SmallVectorImpl<SDValue> &MemOpChains,
5190 SmallVectorImpl<TailCallArgumentInfo> &TailCallArguments, const SDLoc &dl) {
5191 EVT PtrVT = DAG.getTargetLoweringInfo().getPointerTy(DL: DAG.getDataLayout());
5192 if (!isTailCall) {
5193 if (isVector) {
5194 SDValue StackPtr;
5195 if (isPPC64)
5196 StackPtr = DAG.getRegister(Reg: PPC::X1, VT: MVT::i64);
5197 else
5198 StackPtr = DAG.getRegister(Reg: PPC::R1, VT: MVT::i32);
5199 PtrOff = DAG.getNode(Opcode: ISD::ADD, DL: dl, VT: PtrVT, N1: StackPtr,
5200 N2: DAG.getConstant(Val: ArgOffset, DL: dl, VT: PtrVT));
5201 }
5202 MemOpChains.push_back(
5203 Elt: DAG.getStore(Chain, dl, Val: Arg, Ptr: PtrOff, PtrInfo: MachinePointerInfo()));
5204 // Calculate and remember argument location.
5205 } else
5206 CalculateTailCallArgDest(DAG, MF, IsPPC64: isPPC64, Arg, SPDiff, ArgOffset,
5207 TailCallArguments);
5208}
5209
/// PrepareTailCall - Finalize the DAG immediately before a tail call: spill
/// the recorded arguments to their final stack slots, store the (possibly
/// relocated) return address, and close the call sequence. Updates Chain and
/// InGlue in place.
static void
PrepareTailCall(SelectionDAG &DAG, SDValue &InGlue, SDValue &Chain,
                const SDLoc &dl, int SPDiff, unsigned NumBytes, SDValue LROp,
                SDValue FPOp,
                SmallVectorImpl<TailCallArgumentInfo> &TailCallArguments) {
  // Emit a sequence of copyto/copyfrom virtual registers for arguments that
  // might overwrite each other in case of tail call optimization.
  SmallVector<SDValue, 8> MemOpChains2;
  // Do not flag preceding copytoreg stuff together with the following stuff.
  InGlue = SDValue();
  StoreTailCallArgumentsToStackSlot(DAG, Chain, TailCallArgs: TailCallArguments,
                                    MemOpChains&: MemOpChains2, dl);
  // Merge the argument stores into one TokenFactor so they all precede the
  // call.
  if (!MemOpChains2.empty())
    Chain = DAG.getNode(Opcode: ISD::TokenFactor, DL: dl, VT: MVT::Other, Ops: MemOpChains2);

  // Store the return address to the appropriate stack slot.
  Chain = EmitTailCallStoreFPAndRetAddr(DAG, Chain, OldRetAddr: LROp, OldFP: FPOp, SPDiff, dl);

  // Emit callseq_end just before tailcall node.
  Chain = DAG.getCALLSEQ_END(Chain, Size1: NumBytes, Size2: 0, Glue: InGlue, DL: dl);
  InGlue = Chain.getValue(R: 1);
}
5232
5233// Is this global address that of a function that can be called by name? (as
5234// opposed to something that must hold a descriptor for an indirect call).
5235static bool isFunctionGlobalAddress(const GlobalValue *GV) {
5236 if (GV) {
5237 if (GV->isThreadLocal())
5238 return false;
5239
5240 return GV->getValueType()->isFunctionTy();
5241 }
5242
5243 return false;
5244}
5245
/// LowerCallResult - Copy the results of a call out of the physical registers
/// assigned by the return-value calling convention into DAG values (appended
/// to InVals), undoing any value promotions along the way. Returns the
/// updated chain.
SDValue PPCTargetLowering::LowerCallResult(
    SDValue Chain, SDValue InGlue, CallingConv::ID CallConv, bool isVarArg,
    const SmallVectorImpl<ISD::InputArg> &Ins, const SDLoc &dl,
    SelectionDAG &DAG, SmallVectorImpl<SDValue> &InVals) const {
  // Assign each return value a register location; the SVR4 'cold' convention
  // has its own return-value table.
  SmallVector<CCValAssign, 16> RVLocs;
  CCState CCRetInfo(CallConv, isVarArg, DAG.getMachineFunction(), RVLocs,
                    *DAG.getContext());

  CCRetInfo.AnalyzeCallResult(
      Ins, Fn: (Subtarget.isSVR4ABI() && CallConv == CallingConv::Cold)
               ? RetCC_PPC_Cold
               : RetCC_PPC);

  // Copy all of the result registers out of their specified physreg.
  for (unsigned i = 0, e = RVLocs.size(); i != e; ++i) {
    CCValAssign &VA = RVLocs[i];
    assert(VA.isRegLoc() && "Can only return in registers!");

    SDValue Val;

    // With SPE, an f64 result occupies two consecutive i32 locations; copy
    // both halves out and rebuild the f64 (note the in-loop ++i consuming the
    // second location).
    if (Subtarget.hasSPE() && VA.getLocVT() == MVT::f64) {
      SDValue Lo = DAG.getCopyFromReg(Chain, dl, Reg: VA.getLocReg(), VT: MVT::i32,
                                      Glue: InGlue);
      Chain = Lo.getValue(R: 1);
      InGlue = Lo.getValue(R: 2);
      VA = RVLocs[++i]; // skip ahead to next loc
      SDValue Hi = DAG.getCopyFromReg(Chain, dl, Reg: VA.getLocReg(), VT: MVT::i32,
                                      Glue: InGlue);
      Chain = Hi.getValue(R: 1);
      InGlue = Hi.getValue(R: 2);
      // On big-endian targets the first copied register holds the high half;
      // swap so the (Lo, Hi) operand order of BUILD_SPE64 is respected.
      if (!Subtarget.isLittleEndian())
        std::swap (a&: Lo, b&: Hi);
      Val = DAG.getNode(Opcode: PPCISD::BUILD_SPE64, DL: dl, VT: MVT::f64, N1: Lo, N2: Hi);
    } else {
      // Ordinary result: a single copy-from-reg, threading chain and glue.
      Val = DAG.getCopyFromReg(Chain, dl,
                               Reg: VA.getLocReg(), VT: VA.getLocVT(), Glue: InGlue);
      Chain = Val.getValue(R: 1);
      InGlue = Val.getValue(R: 2);
    }

    // Undo the promotion the convention applied to the value, asserting the
    // known extension where applicable.
    switch (VA.getLocInfo()) {
    default: llvm_unreachable("Unknown loc info!");
    case CCValAssign::Full: break;
    case CCValAssign::AExt:
      Val = DAG.getNode(Opcode: ISD::TRUNCATE, DL: dl, VT: VA.getValVT(), Operand: Val);
      break;
    case CCValAssign::ZExt:
      Val = DAG.getNode(Opcode: ISD::AssertZext, DL: dl, VT: VA.getLocVT(), N1: Val,
                        N2: DAG.getValueType(VA.getValVT()));
      Val = DAG.getNode(Opcode: ISD::TRUNCATE, DL: dl, VT: VA.getValVT(), Operand: Val);
      break;
    case CCValAssign::SExt:
      Val = DAG.getNode(Opcode: ISD::AssertSext, DL: dl, VT: VA.getLocVT(), N1: Val,
                        N2: DAG.getValueType(VA.getValVT()));
      Val = DAG.getNode(Opcode: ISD::TRUNCATE, DL: dl, VT: VA.getValVT(), Operand: Val);
      break;
    }

    InVals.push_back(Elt: Val);
  }

  return Chain;
}
5309
5310static bool isIndirectCall(const SDValue &Callee, SelectionDAG &DAG,
5311 const PPCSubtarget &Subtarget, bool isPatchPoint) {
5312 auto *G = dyn_cast<GlobalAddressSDNode>(Val: Callee);
5313 const GlobalValue *GV = G ? G->getGlobal() : nullptr;
5314
5315 // PatchPoint calls are not indirect.
5316 if (isPatchPoint)
5317 return false;
5318
5319 if (isFunctionGlobalAddress(GV) || isa<ExternalSymbolSDNode>(Val: Callee))
5320 return false;
5321
5322 // Darwin, and 32-bit ELF can use a BLA. The descriptor based ABIs can not
5323 // becuase the immediate function pointer points to a descriptor instead of
5324 // a function entry point. The ELFv2 ABI cannot use a BLA because the function
5325 // pointer immediate points to the global entry point, while the BLA would
5326 // need to jump to the local entry point (see rL211174).
5327 if (!Subtarget.usesFunctionDescriptors() && !Subtarget.isELFv2ABI() &&
5328 isBLACompatibleAddress(Op: Callee, DAG))
5329 return false;
5330
5331 return true;
5332}
5333
5334// AIX and 64-bit ELF ABIs w/o PCRel require a TOC save/restore around calls.
5335static inline bool isTOCSaveRestoreRequired(const PPCSubtarget &Subtarget) {
5336 return Subtarget.isAIXABI() ||
5337 (Subtarget.is64BitELFABI() && !Subtarget.isUsingPCRelativeCalls());
5338}
5339
/// Choose the PPCISD call opcode for this call based on whether it is a tail
/// call, an indirect call, and which ABI is in effect. For strict-FP calls
/// the chosen opcode is remapped to its rounding-mode (_RM) variant.
static unsigned getCallOpcode(PPCTargetLowering::CallFlags CFlags,
                              const Function &Caller, const SDValue &Callee,
                              const PPCSubtarget &Subtarget,
                              const TargetMachine &TM,
                              bool IsStrictFPCall = false) {
  // Tail calls lower to a single return-like node.
  if (CFlags.IsTailCall)
    return PPCISD::TC_RETURN;

  unsigned RetOpc = 0;
  // This is a call through a function pointer.
  if (CFlags.IsIndirect) {
    // AIX and the 64-bit ELF ABIs need to maintain the TOC pointer across
    // indirect calls. The save of the caller's TOC pointer to the stack will be
    // inserted into the DAG as part of call lowering. The restore of the TOC
    // pointer is modeled by using a pseudo instruction for the call opcode that
    // represents the 2 instruction sequence of an indirect branch and link,
    // immediately followed by a load of the TOC pointer from the stack save
    // slot into gpr2. For 64-bit ELFv2 ABI with PCRel, do not restore the TOC
    // as it is not saved or used.
    RetOpc = isTOCSaveRestoreRequired(Subtarget) ? PPCISD::BCTRL_LOAD_TOC
                                                 : PPCISD::BCTRL;
  } else if (Subtarget.isUsingPCRelativeCalls()) {
    assert(Subtarget.is64BitELFABI() && "PC Relative is only on ELF ABI.");
    RetOpc = PPCISD::CALL_NOTOC;
  } else if (Subtarget.isAIXABI() || Subtarget.is64BitELFABI()) {
    // The ABIs that maintain a TOC pointer across calls need to have a nop
    // immediately following the call instruction if the caller and callee may
    // have different TOC bases. At link time if the linker determines the calls
    // may not share a TOC base, the call is redirected to a trampoline inserted
    // by the linker. The trampoline will (among other things) save the callers
    // TOC pointer at an ABI designated offset in the linkage area and the
    // linker will rewrite the nop to be a load of the TOC pointer from the
    // linkage area into gpr2.
    auto *G = dyn_cast<GlobalAddressSDNode>(Val: Callee);
    const GlobalValue *GV = G ? G->getGlobal() : nullptr;
    RetOpc =
        callsShareTOCBase(Caller: &Caller, CalleeGV: GV, TM) ? PPCISD::CALL : PPCISD::CALL_NOP;
  } else
    RetOpc = PPCISD::CALL;
  // Strict-FP calls use opcode variants that also model the rounding-mode
  // dependency.
  if (IsStrictFPCall) {
    switch (RetOpc) {
    default:
      llvm_unreachable("Unknown call opcode");
    case PPCISD::BCTRL_LOAD_TOC:
      RetOpc = PPCISD::BCTRL_LOAD_TOC_RM;
      break;
    case PPCISD::BCTRL:
      RetOpc = PPCISD::BCTRL_RM;
      break;
    case PPCISD::CALL_NOTOC:
      RetOpc = PPCISD::CALL_NOTOC_RM;
      break;
    case PPCISD::CALL:
      RetOpc = PPCISD::CALL_RM;
      break;
    case PPCISD::CALL_NOP:
      RetOpc = PPCISD::CALL_NOP_RM;
      break;
    }
  }
  return RetOpc;
}
5402
/// Rewrite the callee operand of a call into the form the selected ABI
/// requires: a BLA-compatible immediate, an AIX function entry-point
/// (".name") symbol, a target global address (optionally PLT-qualified), or
/// a target external symbol. Returns the callee unchanged if no rewrite
/// applies.
static SDValue transformCallee(const SDValue &Callee, SelectionDAG &DAG,
                               const SDLoc &dl, const PPCSubtarget &Subtarget) {
  // ABIs that may branch-and-link to an absolute immediate use it directly.
  if (!Subtarget.usesFunctionDescriptors() && !Subtarget.isELFv2ABI())
    if (SDNode *Dest = isBLACompatibleAddress(Op: Callee, DAG))
      return SDValue(Dest, 0);

  // Returns true if the callee is local, and false otherwise.
  auto isLocalCallee = [&]() {
    const GlobalAddressSDNode *G = dyn_cast<GlobalAddressSDNode>(Val: Callee);
    const GlobalValue *GV = G ? G->getGlobal() : nullptr;

    return DAG.getTarget().shouldAssumeDSOLocal(GV) &&
           !isa_and_nonnull<GlobalIFunc>(Val: GV);
  };

  // The PLT is only used in 32-bit ELF PIC mode. Attempting to use the PLT in
  // a static relocation model causes some versions of GNU LD (2.17.50, at
  // least) to force BSS-PLT, instead of secure-PLT, even if all objects are
  // built with secure-PLT.
  bool UsePlt =
      Subtarget.is32BitELFABI() && !isLocalCallee() &&
      Subtarget.getTargetMachine().getRelocationModel() == Reloc::PIC_;

  // Builds the MCSymbol node for an AIX function's entry point.
  const auto getAIXFuncEntryPointSymbolSDNode = [&](const GlobalValue *GV) {
    const TargetMachine &TM = Subtarget.getTargetMachine();
    const TargetLoweringObjectFile *TLOF = TM.getObjFileLowering();
    auto *S =
        static_cast<MCSymbolXCOFF *>(TLOF->getFunctionEntryPointSymbol(Func: GV, TM));

    MVT PtrVT = DAG.getTargetLoweringInfo().getPointerTy(DL: DAG.getDataLayout());
    return DAG.getMCSymbol(Sym: S, VT: PtrVT);
  };

  auto *G = dyn_cast<GlobalAddressSDNode>(Val: Callee);
  const GlobalValue *GV = G ? G->getGlobal() : nullptr;
  if (isFunctionGlobalAddress(GV)) {
    // NOTE(review): this declaration shadows the outer GV with the same
    // value; redundant but harmless.
    const GlobalValue *GV = cast<GlobalAddressSDNode>(Val: Callee)->getGlobal();

    if (Subtarget.isAIXABI()) {
      return getAIXFuncEntryPointSymbolSDNode(GV);
    }
    return DAG.getTargetGlobalAddress(GV, DL: dl, VT: Callee.getValueType(), offset: 0,
                                      TargetFlags: UsePlt ? PPCII::MO_PLT : 0);
  }

  if (ExternalSymbolSDNode *S = dyn_cast<ExternalSymbolSDNode>(Val: Callee)) {
    const char *SymName = S->getSymbol();
    if (Subtarget.isAIXABI()) {
      // If there exists a user-declared function whose name is the same as the
      // ExternalSymbol's, then we pick up the user-declared version.
      const Module *Mod = DAG.getMachineFunction().getFunction().getParent();
      if (const Function *F =
              dyn_cast_or_null<Function>(Val: Mod->getNamedValue(Name: SymName)))
        return getAIXFuncEntryPointSymbolSDNode(F);

      // On AIX, direct function calls reference the symbol for the function's
      // entry point, which is named by prepending a "." before the function's
      // C-linkage name. A Qualname is returned here because an external
      // function entry point is a csect with XTY_ER property.
      const auto getExternalFunctionEntryPointSymbol = [&](StringRef SymName) {
        auto &Context = DAG.getMachineFunction().getContext();
        MCSectionXCOFF *Sec = Context.getXCOFFSection(
            Section: (Twine(".") + Twine(SymName)).str(), K: SectionKind::getMetadata(),
            CsectProp: XCOFF::CsectProperties(XCOFF::XMC_PR, XCOFF::XTY_ER));
        return Sec->getQualNameSymbol();
      };

      SymName = getExternalFunctionEntryPointSymbol(SymName)->getName().data();
    }
    return DAG.getTargetExternalSymbol(Sym: SymName, VT: Callee.getValueType(),
                                       TargetFlags: UsePlt ? PPCII::MO_PLT : 0);
  }

  // No transformation needed.
  assert(Callee.getNode() && "What no callee?");
  return Callee;
}
5480
5481static SDValue getOutputChainFromCallSeq(SDValue CallSeqStart) {
5482 assert(CallSeqStart.getOpcode() == ISD::CALLSEQ_START &&
5483 "Expected a CALLSEQ_STARTSDNode.");
5484
5485 // The last operand is the chain, except when the node has glue. If the node
5486 // has glue, then the last operand is the glue, and the chain is the second
5487 // last operand.
5488 SDValue LastValue = CallSeqStart.getValue(R: CallSeqStart->getNumValues() - 1);
5489 if (LastValue.getValueType() != MVT::Glue)
5490 return LastValue;
5491
5492 return CallSeqStart.getValue(R: CallSeqStart->getNumValues() - 2);
5493}
5494
5495// Creates the node that moves a functions address into the count register
5496// to prepare for an indirect call instruction.
5497static void prepareIndirectCall(SelectionDAG &DAG, SDValue &Callee,
5498 SDValue &Glue, SDValue &Chain,
5499 const SDLoc &dl) {
5500 SDValue MTCTROps[] = {Chain, Callee, Glue};
5501 EVT ReturnTypes[] = {MVT::Other, MVT::Glue};
5502 Chain = DAG.getNode(Opcode: PPCISD::MTCTR, DL: dl, ResultTys: ReturnTypes,
5503 Ops: ArrayRef(MTCTROps, Glue.getNode() ? 3 : 2));
5504 // The glue is the second value produced.
5505 Glue = Chain.getValue(R: 1);
5506}
5507
/// Lower an indirect call through a function descriptor (64-bit SVR4/AIX):
/// load entry point, TOC anchor and environment pointer from the descriptor,
/// copy the latter two into their ABI registers (glued together), then move
/// the entry point into CTR. Updates Callee/Glue/Chain in place.
static void prepareDescriptorIndirectCall(SelectionDAG &DAG, SDValue &Callee,
                                          SDValue &Glue, SDValue &Chain,
                                          SDValue CallSeqStart,
                                          const CallBase *CB, const SDLoc &dl,
                                          bool hasNest,
                                          const PPCSubtarget &Subtarget) {
  // Function pointers in the 64-bit SVR4 ABI do not point to the function
  // entry point, but to the function descriptor (the function entry point
  // address is part of the function descriptor though).
  // The function descriptor is a three doubleword structure with the
  // following fields: function entry point, TOC base address and
  // environment pointer.
  // Thus for a call through a function pointer, the following actions need
  // to be performed:
  //   1. Save the TOC of the caller in the TOC save area of its stack
  //      frame (this is done in LowerCall_Darwin() or LowerCall_64SVR4()).
  //   2. Load the address of the function entry point from the function
  //      descriptor.
  //   3. Load the TOC of the callee from the function descriptor into r2.
  //   4. Load the environment pointer from the function descriptor into
  //      r11.
  //   5. Branch to the function entry point address.
  //   6. On return of the callee, the TOC of the caller needs to be
  //      restored (this is done in FinishCall()).
  //
  // The loads are scheduled at the beginning of the call sequence, and the
  // register copies are flagged together to ensure that no other
  // operations can be scheduled in between. E.g. without flagging the
  // copies together, a TOC access in the caller could be scheduled between
  // the assignment of the callee TOC and the branch to the callee, which leads
  // to incorrect code.

  // Start by loading the function address from the descriptor.
  SDValue LDChain = getOutputChainFromCallSeq(CallSeqStart);
  // Descriptor loads can be marked dereferenceable/invariant when the
  // subtarget guarantees descriptors never change.
  auto MMOFlags = Subtarget.hasInvariantFunctionDescriptors()
                      ? (MachineMemOperand::MODereferenceable |
                         MachineMemOperand::MOInvariant)
                      : MachineMemOperand::MONone;

  MachinePointerInfo MPI(CB ? CB->getCalledOperand() : nullptr);

  // Registers used in building the DAG.
  const MCRegister EnvPtrReg = Subtarget.getEnvironmentPointerRegister();
  const MCRegister TOCReg = Subtarget.getTOCPointerRegister();

  // Offsets of descriptor members.
  const unsigned TOCAnchorOffset = Subtarget.descriptorTOCAnchorOffset();
  const unsigned EnvPtrOffset = Subtarget.descriptorEnvironmentPointerOffset();

  const MVT RegVT = Subtarget.getScalarIntVT();
  const Align Alignment = Subtarget.isPPC64() ? Align(8) : Align(4);

  // One load for the functions entry point address.
  SDValue LoadFuncPtr = DAG.getLoad(VT: RegVT, dl, Chain: LDChain, Ptr: Callee, PtrInfo: MPI,
                                    Alignment, MMOFlags);

  // One for loading the TOC anchor for the module that contains the called
  // function.
  SDValue TOCOff = DAG.getIntPtrConstant(Val: TOCAnchorOffset, DL: dl);
  SDValue AddTOC = DAG.getNode(Opcode: ISD::ADD, DL: dl, VT: RegVT, N1: Callee, N2: TOCOff);
  SDValue TOCPtr =
      DAG.getLoad(VT: RegVT, dl, Chain: LDChain, Ptr: AddTOC,
                  PtrInfo: MPI.getWithOffset(O: TOCAnchorOffset), Alignment, MMOFlags);

  // One for loading the environment pointer.
  SDValue PtrOff = DAG.getIntPtrConstant(Val: EnvPtrOffset, DL: dl);
  SDValue AddPtr = DAG.getNode(Opcode: ISD::ADD, DL: dl, VT: RegVT, N1: Callee, N2: PtrOff);
  SDValue LoadEnvPtr =
      DAG.getLoad(VT: RegVT, dl, Chain: LDChain, Ptr: AddPtr,
                  PtrInfo: MPI.getWithOffset(O: EnvPtrOffset), Alignment, MMOFlags);


  // Then copy the newly loaded TOC anchor to the TOC pointer.
  SDValue TOCVal = DAG.getCopyToReg(Chain, dl, Reg: TOCReg, N: TOCPtr, Glue);
  Chain = TOCVal.getValue(R: 0);
  Glue = TOCVal.getValue(R: 1);

  // If the function call has an explicit 'nest' parameter, it takes the
  // place of the environment pointer.
  assert((!hasNest || !Subtarget.isAIXABI()) &&
         "Nest parameter is not supported on AIX.");
  if (!hasNest) {
    SDValue EnvVal = DAG.getCopyToReg(Chain, dl, Reg: EnvPtrReg, N: LoadEnvPtr, Glue);
    Chain = EnvVal.getValue(R: 0);
    Glue = EnvVal.getValue(R: 1);
  }

  // The rest of the indirect call sequence is the same as the non-descriptor
  // DAG.
  prepareIndirectCall(DAG, Callee&: LoadFuncPtr, Glue, Chain, dl);
}
5599
// Build the complete operand list for the PPC call node. The operand order
// is load-bearing and consumed positionally by the call pseudo instructions:
// chain, then (direct call) the callee or (indirect call) the TOC-restore
// address / environment pointer / CTR, then the tail-call stack delta, the
// argument registers, implicit register uses, the call-preserved register
// mask, and finally the glue (if any).
static void
buildCallOperands(SmallVectorImpl<SDValue> &Ops,
                  PPCTargetLowering::CallFlags CFlags, const SDLoc &dl,
                  SelectionDAG &DAG,
                  SmallVector<std::pair<unsigned, SDValue>, 8> &RegsToPass,
                  SDValue Glue, SDValue Chain, SDValue &Callee, int SPDiff,
                  const PPCSubtarget &Subtarget) {
  const bool IsPPC64 = Subtarget.isPPC64();
  // MVT for a general purpose register.
  const MVT RegVT = Subtarget.getScalarIntVT();

  // First operand is always the chain.
  Ops.push_back(Elt: Chain);

  // If it's a direct call pass the callee as the second operand.
  if (!CFlags.IsIndirect)
    Ops.push_back(Elt: Callee);
  else {
    assert(!CFlags.IsPatchPoint && "Patch point calls are not indirect.");

    // For the TOC based ABIs, we have saved the TOC pointer to the linkage area
    // on the stack (this would have been done in `LowerCall_64SVR4` or
    // `LowerCall_AIX`). The call instruction is a pseudo instruction that
    // represents both the indirect branch and a load that restores the TOC
    // pointer from the linkage area. The operand for the TOC restore is an add
    // of the TOC save offset to the stack pointer. This must be the second
    // operand: after the chain input but before any other variadic arguments.
    // For 64-bit ELFv2 ABI with PCRel, do not restore the TOC as it is not
    // saved or used.
    if (isTOCSaveRestoreRequired(Subtarget)) {
      const MCRegister StackPtrReg = Subtarget.getStackPointerRegister();

      SDValue StackPtr = DAG.getRegister(Reg: StackPtrReg, VT: RegVT);
      unsigned TOCSaveOffset = Subtarget.getFrameLowering()->getTOCSaveOffset();
      SDValue TOCOff = DAG.getIntPtrConstant(Val: TOCSaveOffset, DL: dl);
      SDValue AddTOC = DAG.getNode(Opcode: ISD::ADD, DL: dl, VT: RegVT, N1: StackPtr, N2: TOCOff);
      Ops.push_back(Elt: AddTOC);
    }

    // Add the register used for the environment pointer.
    // A 'nest' parameter, if present, takes its place (asserted elsewhere).
    if (Subtarget.usesFunctionDescriptors() && !CFlags.HasNest)
      Ops.push_back(Elt: DAG.getRegister(Reg: Subtarget.getEnvironmentPointerRegister(),
                                    VT: RegVT));


    // Add CTR register as callee so a bctr can be emitted later.
    if (CFlags.IsTailCall)
      Ops.push_back(Elt: DAG.getRegister(Reg: IsPPC64 ? PPC::CTR8 : PPC::CTR, VT: RegVT));
  }

  // If this is a tail call add stack pointer delta.
  if (CFlags.IsTailCall)
    Ops.push_back(Elt: DAG.getConstant(Val: SPDiff, DL: dl, VT: MVT::i32));

  // Add argument registers to the end of the list so that they are known live
  // into the call.
  for (const auto &[Reg, N] : RegsToPass)
    Ops.push_back(Elt: DAG.getRegister(Reg, VT: N.getValueType()));

  // We cannot add R2/X2 as an operand here for PATCHPOINT, because there is
  // no way to mark dependencies as implicit here.
  // We will add the R2/X2 dependency in EmitInstrWithCustomInserter.
  if ((Subtarget.is64BitELFABI() || Subtarget.isAIXABI()) &&
      !CFlags.IsPatchPoint && !Subtarget.isUsingPCRelativeCalls())
    Ops.push_back(Elt: DAG.getRegister(Reg: Subtarget.getTOCPointerRegister(), VT: RegVT));

  // Add implicit use of CR bit 6 for 32-bit SVR4 vararg calls
  if (CFlags.IsVarArg && Subtarget.is32BitELFABI())
    Ops.push_back(Elt: DAG.getRegister(Reg: PPC::CR1EQ, VT: MVT::i32));

  // Add a register mask operand representing the call-preserved registers.
  const TargetRegisterInfo *TRI = Subtarget.getRegisterInfo();
  const uint32_t *Mask =
      TRI->getCallPreservedMask(MF: DAG.getMachineFunction(), CFlags.CallConv);
  assert(Mask && "Missing call preserved mask for calling convention");
  Ops.push_back(Elt: DAG.getRegisterMask(RegMask: Mask));

  // If the glue is valid, it is the last operand.
  if (Glue.getNode())
    Ops.push_back(Elt: Glue);
}
5681
// Finalize lowering of a call after arguments have been placed: pick the call
// opcode, transform or prepare the callee (direct, descriptor-based indirect,
// or plain indirect), build the operand list, then emit either a TC_RETURN
// node (tail call) or a CALL node followed by CALLSEQ_END, lowering any
// returned values into InVals.
SDValue PPCTargetLowering::FinishCall(
    CallFlags CFlags, const SDLoc &dl, SelectionDAG &DAG,
    SmallVector<std::pair<unsigned, SDValue>, 8> &RegsToPass, SDValue Glue,
    SDValue Chain, SDValue CallSeqStart, SDValue &Callee, int SPDiff,
    unsigned NumBytes, const SmallVectorImpl<ISD::InputArg> &Ins,
    SmallVectorImpl<SDValue> &InVals, const CallBase *CB) const {

  // TOC-based ABIs (64-bit ELF without PC-relative calls, and AIX) need the
  // TOC base pointer marked as used for this function.
  if ((Subtarget.is64BitELFABI() && !Subtarget.isUsingPCRelativeCalls()) ||
      Subtarget.isAIXABI())
    setUsesTOCBasePtr(DAG);

  unsigned CallOpc =
      getCallOpcode(CFlags, Caller: DAG.getMachineFunction().getFunction(), Callee,
                    Subtarget, TM: DAG.getTarget(), IsStrictFPCall: CB ? CB->isStrictFP() : false);

  if (!CFlags.IsIndirect)
    Callee = transformCallee(Callee, DAG, dl, Subtarget);
  else if (Subtarget.usesFunctionDescriptors())
    prepareDescriptorIndirectCall(DAG, Callee, Glue, Chain, CallSeqStart, CB,
                                  dl, hasNest: CFlags.HasNest, Subtarget);
  else
    prepareIndirectCall(DAG, Callee, Glue, Chain, dl);

  // Build the operand list for the call instruction.
  SmallVector<SDValue, 8> Ops;
  buildCallOperands(Ops, CFlags, dl, DAG, RegsToPass, Glue, Chain, Callee,
                    SPDiff, Subtarget);

  // Emit tail call.
  if (CFlags.IsTailCall) {
    // Indirect tail call when using PC Relative calls do not have the same
    // constraints.
    assert(((Callee.getOpcode() == ISD::Register &&
             cast<RegisterSDNode>(Callee)->getReg() == PPC::CTR) ||
            Callee.getOpcode() == ISD::TargetExternalSymbol ||
            Callee.getOpcode() == ISD::TargetGlobalAddress ||
            isa<ConstantSDNode>(Callee) ||
            (CFlags.IsIndirect && Subtarget.isUsingPCRelativeCalls())) &&
           "Expecting a global address, external symbol, absolute value, "
           "register or an indirect tail call when PC Relative calls are "
           "used.");
    // PC Relative calls also use TC_RETURN as the way to mark tail calls.
    assert(CallOpc == PPCISD::TC_RETURN &&
           "Unexpected call opcode for a tail call.");
    DAG.getMachineFunction().getFrameInfo().setHasTailCall();
    // A tail call produces no value chain for callers; return the node itself.
    SDValue Ret = DAG.getNode(Opcode: CallOpc, DL: dl, VT: MVT::Other, Ops);
    DAG.addNoMergeSiteInfo(Node: Ret.getNode(), NoMerge: CFlags.NoMerge);
    return Ret;
  }

  // Non-tail call: the node produces a chain and glue for the result copies.
  std::array<EVT, 2> ReturnTypes = {._M_elems: {MVT::Other, MVT::Glue}};
  Chain = DAG.getNode(Opcode: CallOpc, DL: dl, ResultTys: ReturnTypes, Ops);
  DAG.addNoMergeSiteInfo(Node: Chain.getNode(), NoMerge: CFlags.NoMerge);
  Glue = Chain.getValue(R: 1);

  // When performing tail call optimization the callee pops its arguments off
  // the stack. Account for this here so these bytes can be pushed back on in
  // PPCFrameLowering::eliminateCallFramePseudoInstr.
  int BytesCalleePops = (CFlags.CallConv == CallingConv::Fast &&
                         getTargetMachine().Options.GuaranteedTailCallOpt)
                            ? NumBytes
                            : 0;

  Chain = DAG.getCALLSEQ_END(Chain, Size1: NumBytes, Size2: BytesCalleePops, Glue, DL: dl);
  Glue = Chain.getValue(R: 1);

  return LowerCallResult(Chain, InGlue: Glue, CallConv: CFlags.CallConv, isVarArg: CFlags.IsVarArg, Ins, dl,
                         DAG, InVals);
}
5751
5752bool PPCTargetLowering::supportsTailCallFor(const CallBase *CB) const {
5753 CallingConv::ID CalleeCC = CB->getCallingConv();
5754 const Function *CallerFunc = CB->getCaller();
5755 CallingConv::ID CallerCC = CallerFunc->getCallingConv();
5756 const Function *CalleeFunc = CB->getCalledFunction();
5757 if (!CalleeFunc)
5758 return false;
5759 const GlobalValue *CalleeGV = dyn_cast<GlobalValue>(Val: CalleeFunc);
5760
5761 SmallVector<ISD::OutputArg, 2> Outs;
5762 SmallVector<ISD::InputArg, 2> Ins;
5763
5764 GetReturnInfo(CC: CalleeCC, ReturnType: CalleeFunc->getReturnType(),
5765 attr: CalleeFunc->getAttributes(), Outs, TLI: *this,
5766 DL: CalleeFunc->getDataLayout());
5767
5768 return isEligibleForTCO(CalleeGV, CalleeCC, CallerCC, CB,
5769 isVarArg: CalleeFunc->isVarArg(), Outs, Ins, CallerFunc,
5770 isCalleeExternalSymbol: false /*isCalleeExternalSymbol*/);
5771}
5772
5773bool PPCTargetLowering::isEligibleForTCO(
5774 const GlobalValue *CalleeGV, CallingConv::ID CalleeCC,
5775 CallingConv::ID CallerCC, const CallBase *CB, bool isVarArg,
5776 const SmallVectorImpl<ISD::OutputArg> &Outs,
5777 const SmallVectorImpl<ISD::InputArg> &Ins, const Function *CallerFunc,
5778 bool isCalleeExternalSymbol) const {
5779 if (Subtarget.useLongCalls() && !(CB && CB->isMustTailCall()))
5780 return false;
5781
5782 if (Subtarget.isSVR4ABI() && Subtarget.isPPC64())
5783 return IsEligibleForTailCallOptimization_64SVR4(
5784 CalleeGV, CalleeCC, CallerCC, CB, isVarArg, Outs, Ins, CallerFunc,
5785 isCalleeExternalSymbol);
5786 else
5787 return IsEligibleForTailCallOptimization(CalleeGV, CalleeCC, CallerCC,
5788 isVarArg, Ins);
5789}
5790
// Main entry point for lowering a call: decide whether the call can be a
// tail call, build the CallFlags describing the call site, and dispatch to
// the ABI-specific lowering routine (AIX, 64-bit SVR4, or 32-bit SVR4).
SDValue
PPCTargetLowering::LowerCall(TargetLowering::CallLoweringInfo &CLI,
                             SmallVectorImpl<SDValue> &InVals) const {
  SelectionDAG &DAG = CLI.DAG;
  SDLoc &dl = CLI.DL;
  SmallVectorImpl<ISD::OutputArg> &Outs = CLI.Outs;
  SmallVectorImpl<SDValue> &OutVals = CLI.OutVals;
  SmallVectorImpl<ISD::InputArg> &Ins = CLI.Ins;
  SDValue Chain = CLI.Chain;
  SDValue Callee = CLI.Callee;
  // Note: isTailCall aliases CLI.IsTailCall so the eligibility result is
  // written back into the CallLoweringInfo.
  bool &isTailCall = CLI.IsTailCall;
  CallingConv::ID CallConv = CLI.CallConv;
  bool isVarArg = CLI.IsVarArg;
  bool isPatchPoint = CLI.IsPatchPoint;
  const CallBase *CB = CLI.CB;

  if (isTailCall) {
    // The front end requested a tail call; verify this target/ABI can
    // actually honor it, and downgrade to a normal call if not.
    MachineFunction &MF = DAG.getMachineFunction();
    CallingConv::ID CallerCC = MF.getFunction().getCallingConv();
    auto *G = dyn_cast<GlobalAddressSDNode>(Val&: Callee);
    const GlobalValue *GV = G ? G->getGlobal() : nullptr;
    bool IsCalleeExternalSymbol = isa<ExternalSymbolSDNode>(Val: Callee);

    isTailCall =
        isEligibleForTCO(CalleeGV: GV, CalleeCC: CallConv, CallerCC, CB, isVarArg, Outs, Ins,
                         CallerFunc: &(MF.getFunction()), isCalleeExternalSymbol: IsCalleeExternalSymbol);
    if (isTailCall) {
      ++NumTailCalls;
      if (!getTargetMachine().Options.GuaranteedTailCallOpt)
        ++NumSiblingCalls;

      // PC Relative calls no longer guarantee that the callee is a Global
      // Address Node. The callee could be an indirect tail call in which
      // case the SDValue for the callee could be a load (to load the address
      // of a function pointer) or it may be a register copy (to move the
      // address of the callee from a function parameter into a virtual
      // register). It may also be an ExternalSymbolSDNode (ex memcopy).
      assert((Subtarget.isUsingPCRelativeCalls() ||
              isa<GlobalAddressSDNode>(Callee)) &&
             "Callee should be an llvm::Function object.");

      LLVM_DEBUG(dbgs() << "TCO caller: " << DAG.getMachineFunction().getName()
                        << "\nTCO callee: ");
      LLVM_DEBUG(Callee.dump());
    }
  }

  // musttail call sites must be tail-called; failing to do so is fatal.
  if (!isTailCall && CB && CB->isMustTailCall())
    report_fatal_error(reason: "failed to perform tail call elimination on a call "
                       "site marked musttail");

  // When long calls (i.e. indirect calls) are always used, calls are always
  // made via function pointer. If we have a function name, first translate it
  // into a pointer.
  if (Subtarget.useLongCalls() && isa<GlobalAddressSDNode>(Val: Callee) &&
      !isTailCall)
    Callee = LowerGlobalAddress(Op: Callee, DAG);

  CallFlags CFlags(
      CallConv, isTailCall, isVarArg, isPatchPoint,
      isIndirectCall(Callee, DAG, Subtarget, isPatchPoint),
      // hasNest
      Subtarget.is64BitELFABI() &&
          any_of(Range&: Outs, P: [](ISD::OutputArg Arg) { return Arg.Flags.isNest(); }),
      CLI.NoMerge);

  // Dispatch to the ABI-specific lowering.
  if (Subtarget.isAIXABI())
    return LowerCall_AIX(Chain, Callee, CFlags, Outs, OutVals, Ins, dl, DAG,
                         InVals, CB);

  assert(Subtarget.isSVR4ABI());
  if (Subtarget.isPPC64())
    return LowerCall_64SVR4(Chain, Callee, CFlags, Outs, OutVals, Ins, dl, DAG,
                            InVals, CB);
  return LowerCall_32SVR4(Chain, Callee, CFlags, Outs, OutVals, Ins, dl, DAG,
                          InVals, CB);
}
5868
// Lower a call for the 32-bit SVR4 (ELF) ABI: analyze outgoing arguments
// (including separately-analyzed by-value aggregates), set up the call frame,
// place arguments in registers and stack slots, handle the vararg CR6 bit,
// and hand off to FinishCall to emit the call node itself.
SDValue PPCTargetLowering::LowerCall_32SVR4(
    SDValue Chain, SDValue Callee, CallFlags CFlags,
    const SmallVectorImpl<ISD::OutputArg> &Outs,
    const SmallVectorImpl<SDValue> &OutVals,
    const SmallVectorImpl<ISD::InputArg> &Ins, const SDLoc &dl,
    SelectionDAG &DAG, SmallVectorImpl<SDValue> &InVals,
    const CallBase *CB) const {
  // See PPCTargetLowering::LowerFormalArguments_32SVR4() for a description
  // of the 32-bit SVR4 ABI stack frame layout.

  const CallingConv::ID CallConv = CFlags.CallConv;
  const bool IsVarArg = CFlags.IsVarArg;
  const bool IsTailCall = CFlags.IsTailCall;

  assert((CallConv == CallingConv::C ||
          CallConv == CallingConv::Cold ||
          CallConv == CallingConv::Fast) && "Unknown calling convention!");

  // Pointers are 4 bytes on 32-bit SVR4.
  const Align PtrAlign(4);

  MachineFunction &MF = DAG.getMachineFunction();

  // Mark this function as potentially containing a function that contains a
  // tail call. As a consequence the frame pointer will be used for dynamicalloc
  // and restoring the callers stack pointer in this functions epilog. This is
  // done because by tail calling the called function might overwrite the value
  // in this function's (MF) stack pointer stack slot 0(SP).
  if (getTargetMachine().Options.GuaranteedTailCallOpt &&
      CallConv == CallingConv::Fast)
    MF.getInfo<PPCFunctionInfo>()->setHasFastCall();

  // Count how many bytes are to be pushed on the stack, including the linkage
  // area, parameter list area and the part of the local variable space which
  // contains copies of aggregates which are passed by value.

  // Assign locations to all of the outgoing arguments.
  SmallVector<CCValAssign, 16> ArgLocs;
  CCState CCInfo(CallConv, IsVarArg, MF, ArgLocs, *DAG.getContext());

  // Reserve space for the linkage area on the stack.
  CCInfo.AllocateStack(Size: Subtarget.getFrameLowering()->getLinkageSize(),
                       Alignment: PtrAlign);

  if (IsVarArg) {
    // Handle fixed and variable vector arguments differently.
    // Fixed vector arguments go into registers as long as registers are
    // available. Variable vector arguments always go into memory.
    unsigned NumArgs = Outs.size();

    for (unsigned i = 0; i != NumArgs; ++i) {
      MVT ArgVT = Outs[i].VT;
      ISD::ArgFlagsTy ArgFlags = Outs[i].Flags;
      bool Result;

      if (!ArgFlags.isVarArg()) {
        Result = CC_PPC32_SVR4(ValNo: i, ValVT: ArgVT, LocVT: ArgVT, LocInfo: CCValAssign::Full, ArgFlags,
                               OrigTy: Outs[i].OrigTy, State&: CCInfo);
      } else {
        Result = CC_PPC32_SVR4_VarArg(ValNo: i, ValVT: ArgVT, LocVT: ArgVT, LocInfo: CCValAssign::Full,
                                      ArgFlags, OrigTy: Outs[i].OrigTy, State&: CCInfo);
      }

      // A true result means the argument's type was not handled by the
      // calling-convention function; that is a lowering bug.
      if (Result) {
#ifndef NDEBUG
        errs() << "Call operand #" << i << " has unhandled type "
               << ArgVT << "\n";
#endif
        llvm_unreachable(nullptr);
      }
    }
  } else {
    // All arguments are treated the same.
    CCInfo.AnalyzeCallOperands(Outs, Fn: CC_PPC32_SVR4);
  }

  // Assign locations to all of the outgoing aggregate by value arguments.
  SmallVector<CCValAssign, 16> ByValArgLocs;
  CCState CCByValInfo(CallConv, IsVarArg, MF, ByValArgLocs, *DAG.getContext());

  // Reserve stack space for the allocations in CCInfo.
  CCByValInfo.AllocateStack(Size: CCInfo.getStackSize(), Alignment: PtrAlign);

  CCByValInfo.AnalyzeCallOperands(Outs, Fn: CC_PPC32_SVR4_ByVal);

  // Size of the linkage area, parameter list area and the part of the local
  // space variable where copies of aggregates which are passed by value are
  // stored.
  unsigned NumBytes = CCByValInfo.getStackSize();

  // Calculate by how many bytes the stack has to be adjusted in case of tail
  // call optimization.
  int SPDiff = CalculateTailCallSPDiff(DAG, isTailCall: IsTailCall, ParamSize: NumBytes);

  // Adjust the stack pointer for the new arguments...
  // These operations are automatically eliminated by the prolog/epilog pass
  Chain = DAG.getCALLSEQ_START(Chain, InSize: NumBytes, OutSize: 0, DL: dl);
  SDValue CallSeqStart = Chain;

  // Load the return address and frame pointer so it can be moved somewhere else
  // later.
  SDValue LROp, FPOp;
  Chain = EmitTailCallLoadFPAndRetAddr(DAG, SPDiff, Chain, LROpOut&: LROp, FPOpOut&: FPOp, dl);

  // Set up a copy of the stack pointer for use loading and storing any
  // arguments that may not fit in the registers available for argument
  // passing.
  SDValue StackPtr = DAG.getRegister(Reg: PPC::R1, VT: MVT::i32);

  SmallVector<std::pair<unsigned, SDValue>, 8> RegsToPass;
  SmallVector<TailCallArgumentInfo, 8> TailCallArguments;
  SmallVector<SDValue, 8> MemOpChains;

  bool seenFloatArg = false;
  // Walk the register/memloc assignments, inserting copies/loads.
  // i - Tracks the index into the list of registers allocated for the call
  // RealArgIdx - Tracks the index into the list of actual function arguments
  // j - Tracks the index into the list of byval arguments
  for (unsigned i = 0, RealArgIdx = 0, j = 0, e = ArgLocs.size();
       i != e;
       ++i, ++RealArgIdx) {
    CCValAssign &VA = ArgLocs[i];
    SDValue Arg = OutVals[RealArgIdx];
    ISD::ArgFlagsTy Flags = Outs[RealArgIdx].Flags;

    if (Flags.isByVal()) {
      // Argument is an aggregate which is passed by value, thus we need to
      // create a copy of it in the local variable space of the current stack
      // frame (which is the stack frame of the caller) and pass the address of
      // this copy to the callee.
      assert((j < ByValArgLocs.size()) && "Index out of bounds!");
      CCValAssign &ByValVA = ByValArgLocs[j++];
      assert((VA.getValNo() == ByValVA.getValNo()) && "ValNo mismatch!");

      // Memory reserved in the local variable space of the callers stack frame.
      unsigned LocMemOffset = ByValVA.getLocMemOffset();

      SDValue PtrOff = DAG.getIntPtrConstant(Val: LocMemOffset, DL: dl);
      PtrOff = DAG.getNode(Opcode: ISD::ADD, DL: dl, VT: getPointerTy(DL: MF.getDataLayout()),
                           N1: StackPtr, N2: PtrOff);

      // Create a copy of the argument in the local area of the current
      // stack frame.
      SDValue MemcpyCall =
        CreateCopyOfByValArgument(Src: Arg, Dst: PtrOff,
                                  Chain: CallSeqStart.getNode()->getOperand(Num: 0),
                                  Flags, DAG, dl);

      // This must go outside the CALLSEQ_START..END.
      SDValue NewCallSeqStart = DAG.getCALLSEQ_START(Chain: MemcpyCall, InSize: NumBytes, OutSize: 0,
                                                     DL: SDLoc(MemcpyCall));
      DAG.ReplaceAllUsesWith(From: CallSeqStart.getNode(),
                             To: NewCallSeqStart.getNode());
      Chain = CallSeqStart = NewCallSeqStart;

      // Pass the address of the aggregate copy on the stack either in a
      // physical register or in the parameter list area of the current stack
      // frame to the callee.
      Arg = PtrOff;
    }

    // When useCRBits() is true, there can be i1 arguments.
    // It is because getRegisterType(MVT::i1) => MVT::i1,
    // and for other integer types getRegisterType() => MVT::i32.
    // Extend i1 and ensure callee will get i32.
    if (Arg.getValueType() == MVT::i1)
      Arg = DAG.getNode(Opcode: Flags.isSExt() ? ISD::SIGN_EXTEND : ISD::ZERO_EXTEND,
                        DL: dl, VT: MVT::i32, Operand: Arg);

    if (VA.isRegLoc()) {
      seenFloatArg |= VA.getLocVT().isFloatingPoint();
      // Put argument in a physical register.
      if (Subtarget.hasSPE() && Arg.getValueType() == MVT::f64) {
        // SPE passes an f64 in a pair of GPRs; split it into two halves and
        // consume the next allocated register location for the second half.
        bool IsLE = Subtarget.isLittleEndian();
        SDValue SVal = DAG.getNode(Opcode: PPCISD::EXTRACT_SPE, DL: dl, VT: MVT::i32, N1: Arg,
                                   N2: DAG.getIntPtrConstant(Val: IsLE ? 0 : 1, DL: dl));
        RegsToPass.push_back(Elt: std::make_pair(x: VA.getLocReg(), y: SVal.getValue(R: 0)));
        SVal = DAG.getNode(Opcode: PPCISD::EXTRACT_SPE, DL: dl, VT: MVT::i32, N1: Arg,
                           N2: DAG.getIntPtrConstant(Val: IsLE ? 1 : 0, DL: dl));
        RegsToPass.push_back(Elt: std::make_pair(x: ArgLocs[++i].getLocReg(),
                                             y: SVal.getValue(R: 0)));
      } else
        RegsToPass.push_back(Elt: std::make_pair(x: VA.getLocReg(), y&: Arg));
    } else {
      // Put argument in the parameter list area of the current stack frame.
      assert(VA.isMemLoc());
      unsigned LocMemOffset = VA.getLocMemOffset();

      if (!IsTailCall) {
        SDValue PtrOff = DAG.getIntPtrConstant(Val: LocMemOffset, DL: dl);
        PtrOff = DAG.getNode(Opcode: ISD::ADD, DL: dl, VT: getPointerTy(DL: MF.getDataLayout()),
                             N1: StackPtr, N2: PtrOff);

        MemOpChains.push_back(
            Elt: DAG.getStore(Chain, dl, Val: Arg, Ptr: PtrOff, PtrInfo: MachinePointerInfo()));
      } else {
        // Calculate and remember argument location.
        CalculateTailCallArgDest(DAG, MF, IsPPC64: false, Arg, SPDiff, ArgOffset: LocMemOffset,
                                 TailCallArguments);
      }
    }
  }

  if (!MemOpChains.empty())
    Chain = DAG.getNode(Opcode: ISD::TokenFactor, DL: dl, VT: MVT::Other, Ops: MemOpChains);

  // Build a sequence of copy-to-reg nodes chained together with token chain
  // and flag operands which copy the outgoing args into the appropriate regs.
  SDValue InGlue;
  for (const auto &[Reg, N] : RegsToPass) {
    Chain = DAG.getCopyToReg(Chain, dl, Reg, N, Glue: InGlue);
    InGlue = Chain.getValue(R: 1);
  }

  // Set CR bit 6 to true if this is a vararg call with floating args passed in
  // registers.
  if (IsVarArg) {
    SDVTList VTs = DAG.getVTList(VT1: MVT::Other, VT2: MVT::Glue);
    SDValue Ops[] = { Chain, InGlue };

    Chain = DAG.getNode(Opcode: seenFloatArg ? PPCISD::CR6SET : PPCISD::CR6UNSET, DL: dl,
                        VTList: VTs, Ops: ArrayRef(Ops, InGlue.getNode() ? 2 : 1));

    InGlue = Chain.getValue(R: 1);
  }

  if (IsTailCall)
    PrepareTailCall(DAG, InGlue, Chain, dl, SPDiff, NumBytes, LROp, FPOp,
                    TailCallArguments);

  return FinishCall(CFlags, dl, DAG, RegsToPass, Glue: InGlue, Chain, CallSeqStart,
                    Callee, SPDiff, NumBytes, Ins, InVals, CB);
}
6101
6102// Copy an argument into memory, being careful to do this outside the
6103// call sequence for the call to which the argument belongs.
6104SDValue PPCTargetLowering::createMemcpyOutsideCallSeq(
6105 SDValue Arg, SDValue PtrOff, SDValue CallSeqStart, ISD::ArgFlagsTy Flags,
6106 SelectionDAG &DAG, const SDLoc &dl) const {
6107 SDValue MemcpyCall = CreateCopyOfByValArgument(Src: Arg, Dst: PtrOff,
6108 Chain: CallSeqStart.getNode()->getOperand(Num: 0),
6109 Flags, DAG, dl);
6110 // The MEMCPY must go outside the CALLSEQ_START..END.
6111 int64_t FrameSize = CallSeqStart.getConstantOperandVal(i: 1);
6112 SDValue NewCallSeqStart = DAG.getCALLSEQ_START(Chain: MemcpyCall, InSize: FrameSize, OutSize: 0,
6113 DL: SDLoc(MemcpyCall));
6114 DAG.ReplaceAllUsesWith(From: CallSeqStart.getNode(),
6115 To: NewCallSeqStart.getNode());
6116 return NewCallSeqStart;
6117}
6118
6119SDValue PPCTargetLowering::LowerCall_64SVR4(
6120 SDValue Chain, SDValue Callee, CallFlags CFlags,
6121 const SmallVectorImpl<ISD::OutputArg> &Outs,
6122 const SmallVectorImpl<SDValue> &OutVals,
6123 const SmallVectorImpl<ISD::InputArg> &Ins, const SDLoc &dl,
6124 SelectionDAG &DAG, SmallVectorImpl<SDValue> &InVals,
6125 const CallBase *CB) const {
6126 bool isELFv2ABI = Subtarget.isELFv2ABI();
6127 bool isLittleEndian = Subtarget.isLittleEndian();
6128 unsigned NumOps = Outs.size();
6129 bool IsSibCall = false;
6130 bool IsFastCall = CFlags.CallConv == CallingConv::Fast;
6131
6132 EVT PtrVT = getPointerTy(DL: DAG.getDataLayout());
6133 unsigned PtrByteSize = 8;
6134
6135 MachineFunction &MF = DAG.getMachineFunction();
6136
6137 if (CFlags.IsTailCall && !getTargetMachine().Options.GuaranteedTailCallOpt)
6138 IsSibCall = true;
6139
6140 // Mark this function as potentially containing a function that contains a
6141 // tail call. As a consequence the frame pointer will be used for dynamicalloc
6142 // and restoring the callers stack pointer in this functions epilog. This is
6143 // done because by tail calling the called function might overwrite the value
6144 // in this function's (MF) stack pointer stack slot 0(SP).
6145 if (getTargetMachine().Options.GuaranteedTailCallOpt && IsFastCall)
6146 MF.getInfo<PPCFunctionInfo>()->setHasFastCall();
6147
6148 assert(!(IsFastCall && CFlags.IsVarArg) &&
6149 "fastcc not supported on varargs functions");
6150
6151 // Count how many bytes are to be pushed on the stack, including the linkage
6152 // area, and parameter passing area. On ELFv1, the linkage area is 48 bytes
6153 // reserved space for [SP][CR][LR][2 x unused][TOC]; on ELFv2, the linkage
6154 // area is 32 bytes reserved space for [SP][CR][LR][TOC].
6155 unsigned LinkageSize = Subtarget.getFrameLowering()->getLinkageSize();
6156 unsigned NumBytes = LinkageSize;
6157 unsigned GPR_idx = 0, FPR_idx = 0, VR_idx = 0;
6158
6159 static const MCPhysReg GPR[] = {
6160 PPC::X3, PPC::X4, PPC::X5, PPC::X6,
6161 PPC::X7, PPC::X8, PPC::X9, PPC::X10,
6162 };
6163 static const MCPhysReg VR[] = {
6164 PPC::V2, PPC::V3, PPC::V4, PPC::V5, PPC::V6, PPC::V7, PPC::V8,
6165 PPC::V9, PPC::V10, PPC::V11, PPC::V12, PPC::V13
6166 };
6167
6168 const unsigned NumGPRs = std::size(GPR);
6169 const unsigned NumFPRs = useSoftFloat() ? 0 : 13;
6170 const unsigned NumVRs = std::size(VR);
6171
6172 // On ELFv2, we can avoid allocating the parameter area if all the arguments
6173 // can be passed to the callee in registers.
6174 // For the fast calling convention, there is another check below.
6175 // Note: We should keep consistent with LowerFormalArguments_64SVR4()
6176 bool HasParameterArea = !isELFv2ABI || CFlags.IsVarArg || IsFastCall;
6177 if (!HasParameterArea) {
6178 unsigned ParamAreaSize = NumGPRs * PtrByteSize;
6179 unsigned AvailableFPRs = NumFPRs;
6180 unsigned AvailableVRs = NumVRs;
6181 unsigned NumBytesTmp = NumBytes;
6182 for (unsigned i = 0; i != NumOps; ++i) {
6183 if (Outs[i].Flags.isNest()) continue;
6184 if (CalculateStackSlotUsed(ArgVT: Outs[i].VT, OrigVT: Outs[i].ArgVT, Flags: Outs[i].Flags,
6185 PtrByteSize, LinkageSize, ParamAreaSize,
6186 ArgOffset&: NumBytesTmp, AvailableFPRs, AvailableVRs))
6187 HasParameterArea = true;
6188 }
6189 }
6190
6191 // When using the fast calling convention, we don't provide backing for
6192 // arguments that will be in registers.
6193 unsigned NumGPRsUsed = 0, NumFPRsUsed = 0, NumVRsUsed = 0;
6194
6195 // Avoid allocating parameter area for fastcc functions if all the arguments
6196 // can be passed in the registers.
6197 if (IsFastCall)
6198 HasParameterArea = false;
6199
6200 // Add up all the space actually used.
6201 for (unsigned i = 0; i != NumOps; ++i) {
6202 ISD::ArgFlagsTy Flags = Outs[i].Flags;
6203 EVT ArgVT = Outs[i].VT;
6204 EVT OrigVT = Outs[i].ArgVT;
6205
6206 if (Flags.isNest())
6207 continue;
6208
6209 if (IsFastCall) {
6210 if (Flags.isByVal()) {
6211 NumGPRsUsed += (Flags.getByValSize()+7)/8;
6212 if (NumGPRsUsed > NumGPRs)
6213 HasParameterArea = true;
6214 } else {
6215 switch (ArgVT.getSimpleVT().SimpleTy) {
6216 default: llvm_unreachable("Unexpected ValueType for argument!");
6217 case MVT::i1:
6218 case MVT::i32:
6219 case MVT::i64:
6220 if (++NumGPRsUsed <= NumGPRs)
6221 continue;
6222 break;
6223 case MVT::v4i32:
6224 case MVT::v8i16:
6225 case MVT::v16i8:
6226 case MVT::v2f64:
6227 case MVT::v2i64:
6228 case MVT::v1i128:
6229 case MVT::f128:
6230 if (++NumVRsUsed <= NumVRs)
6231 continue;
6232 break;
6233 case MVT::v4f32:
6234 if (++NumVRsUsed <= NumVRs)
6235 continue;
6236 break;
6237 case MVT::f32:
6238 case MVT::f64:
6239 if (++NumFPRsUsed <= NumFPRs)
6240 continue;
6241 break;
6242 }
6243 HasParameterArea = true;
6244 }
6245 }
6246
6247 /* Respect alignment of argument on the stack. */
6248 auto Alignement =
6249 CalculateStackSlotAlignment(ArgVT, OrigVT, Flags, PtrByteSize);
6250 NumBytes = alignTo(Size: NumBytes, A: Alignement);
6251
6252 NumBytes += CalculateStackSlotSize(ArgVT, Flags, PtrByteSize);
6253 if (Flags.isInConsecutiveRegsLast())
6254 NumBytes = ((NumBytes + PtrByteSize - 1)/PtrByteSize) * PtrByteSize;
6255 }
6256
6257 unsigned NumBytesActuallyUsed = NumBytes;
6258
6259 // In the old ELFv1 ABI,
6260 // the prolog code of the callee may store up to 8 GPR argument registers to
6261 // the stack, allowing va_start to index over them in memory if its varargs.
6262 // Because we cannot tell if this is needed on the caller side, we have to
6263 // conservatively assume that it is needed. As such, make sure we have at
6264 // least enough stack space for the caller to store the 8 GPRs.
6265 // In the ELFv2 ABI, we allocate the parameter area iff a callee
6266 // really requires memory operands, e.g. a vararg function.
6267 if (HasParameterArea)
6268 NumBytes = std::max(a: NumBytes, b: LinkageSize + 8 * PtrByteSize);
6269 else
6270 NumBytes = LinkageSize;
6271
6272 // Tail call needs the stack to be aligned.
6273 if (getTargetMachine().Options.GuaranteedTailCallOpt && IsFastCall)
6274 NumBytes = EnsureStackAlignment(Lowering: Subtarget.getFrameLowering(), NumBytes);
6275
6276 int SPDiff = 0;
6277
6278 // Calculate by how many bytes the stack has to be adjusted in case of tail
6279 // call optimization.
6280 if (!IsSibCall)
6281 SPDiff = CalculateTailCallSPDiff(DAG, isTailCall: CFlags.IsTailCall, ParamSize: NumBytes);
6282
6283 // To protect arguments on the stack from being clobbered in a tail call,
6284 // force all the loads to happen before doing any other lowering.
6285 if (CFlags.IsTailCall)
6286 Chain = DAG.getStackArgumentTokenFactor(Chain);
6287
6288 // Adjust the stack pointer for the new arguments...
6289 // These operations are automatically eliminated by the prolog/epilog pass
6290 if (!IsSibCall)
6291 Chain = DAG.getCALLSEQ_START(Chain, InSize: NumBytes, OutSize: 0, DL: dl);
6292 SDValue CallSeqStart = Chain;
6293
6294 // Load the return address and frame pointer so it can be move somewhere else
6295 // later.
6296 SDValue LROp, FPOp;
6297 Chain = EmitTailCallLoadFPAndRetAddr(DAG, SPDiff, Chain, LROpOut&: LROp, FPOpOut&: FPOp, dl);
6298
6299 // Set up a copy of the stack pointer for use loading and storing any
6300 // arguments that may not fit in the registers available for argument
6301 // passing.
6302 SDValue StackPtr = DAG.getRegister(Reg: PPC::X1, VT: MVT::i64);
6303
6304 // Figure out which arguments are going to go in registers, and which in
6305 // memory. Also, if this is a vararg function, floating point operations
6306 // must be stored to our stack, and loaded into integer regs as well, if
6307 // any integer regs are available for argument passing.
6308 unsigned ArgOffset = LinkageSize;
6309
6310 SmallVector<std::pair<unsigned, SDValue>, 8> RegsToPass;
6311 SmallVector<TailCallArgumentInfo, 8> TailCallArguments;
6312
6313 SmallVector<SDValue, 8> MemOpChains;
6314 for (unsigned i = 0; i != NumOps; ++i) {
6315 SDValue Arg = OutVals[i];
6316 ISD::ArgFlagsTy Flags = Outs[i].Flags;
6317 EVT ArgVT = Outs[i].VT;
6318 EVT OrigVT = Outs[i].ArgVT;
6319
6320 // PtrOff will be used to store the current argument to the stack if a
6321 // register cannot be found for it.
6322 SDValue PtrOff;
6323
6324 // We re-align the argument offset for each argument, except when using the
6325 // fast calling convention, when we need to make sure we do that only when
6326 // we'll actually use a stack slot.
6327 auto ComputePtrOff = [&]() {
6328 /* Respect alignment of argument on the stack. */
6329 auto Alignment =
6330 CalculateStackSlotAlignment(ArgVT, OrigVT, Flags, PtrByteSize);
6331 ArgOffset = alignTo(Size: ArgOffset, A: Alignment);
6332
6333 PtrOff = DAG.getConstant(Val: ArgOffset, DL: dl, VT: StackPtr.getValueType());
6334
6335 PtrOff = DAG.getNode(Opcode: ISD::ADD, DL: dl, VT: PtrVT, N1: StackPtr, N2: PtrOff);
6336 };
6337
6338 if (!IsFastCall) {
6339 ComputePtrOff();
6340
6341 /* Compute GPR index associated with argument offset. */
6342 GPR_idx = (ArgOffset - LinkageSize) / PtrByteSize;
6343 GPR_idx = std::min(a: GPR_idx, b: NumGPRs);
6344 }
6345
6346 // Promote integers to 64-bit values.
6347 if (Arg.getValueType() == MVT::i32 || Arg.getValueType() == MVT::i1) {
6348 // FIXME: Should this use ANY_EXTEND if neither sext nor zext?
6349 unsigned ExtOp = Flags.isSExt() ? ISD::SIGN_EXTEND : ISD::ZERO_EXTEND;
6350 Arg = DAG.getNode(Opcode: ExtOp, DL: dl, VT: MVT::i64, Operand: Arg);
6351 }
6352
6353 // FIXME memcpy is used way more than necessary. Correctness first.
6354 // Note: "by value" is code for passing a structure by value, not
6355 // basic types.
6356 if (Flags.isByVal()) {
6357 // Note: Size includes alignment padding, so
6358 // struct x { short a; char b; }
6359 // will have Size = 4. With #pragma pack(1), it will have Size = 3.
6360 // These are the proper values we need for right-justifying the
6361 // aggregate in a parameter register.
6362 unsigned Size = Flags.getByValSize();
6363
6364 // An empty aggregate parameter takes up no storage and no
6365 // registers.
6366 if (Size == 0)
6367 continue;
6368
6369 if (IsFastCall)
6370 ComputePtrOff();
6371
6372 // All aggregates smaller than 8 bytes must be passed right-justified.
6373 if (Size==1 || Size==2 || Size==4) {
6374 EVT VT = (Size==1) ? MVT::i8 : ((Size==2) ? MVT::i16 : MVT::i32);
6375 if (GPR_idx != NumGPRs) {
6376 SDValue Load = DAG.getExtLoad(ExtType: ISD::EXTLOAD, dl, VT: PtrVT, Chain, Ptr: Arg,
6377 PtrInfo: MachinePointerInfo(), MemVT: VT);
6378 MemOpChains.push_back(Elt: Load.getValue(R: 1));
6379 RegsToPass.push_back(Elt: std::make_pair(x: GPR[GPR_idx++], y&: Load));
6380
6381 ArgOffset += PtrByteSize;
6382 continue;
6383 }
6384 }
6385
6386 if (GPR_idx == NumGPRs && Size < 8) {
6387 SDValue AddPtr = PtrOff;
6388 if (!isLittleEndian) {
6389 SDValue Const = DAG.getConstant(Val: PtrByteSize - Size, DL: dl,
6390 VT: PtrOff.getValueType());
6391 AddPtr = DAG.getNode(Opcode: ISD::ADD, DL: dl, VT: PtrVT, N1: PtrOff, N2: Const);
6392 }
6393 Chain = CallSeqStart = createMemcpyOutsideCallSeq(Arg, PtrOff: AddPtr,
6394 CallSeqStart,
6395 Flags, DAG, dl);
6396 ArgOffset += PtrByteSize;
6397 continue;
6398 }
6399 // Copy the object to parameter save area if it can not be entirely passed
6400 // by registers.
6401 // FIXME: we only need to copy the parts which need to be passed in
6402 // parameter save area. For the parts passed by registers, we don't need
6403 // to copy them to the stack although we need to allocate space for them
6404 // in parameter save area.
6405 if ((NumGPRs - GPR_idx) * PtrByteSize < Size)
6406 Chain = CallSeqStart = createMemcpyOutsideCallSeq(Arg, PtrOff,
6407 CallSeqStart,
6408 Flags, DAG, dl);
6409
6410 // When a register is available, pass a small aggregate right-justified.
6411 if (Size < 8 && GPR_idx != NumGPRs) {
6412 // The easiest way to get this right-justified in a register
6413 // is to copy the structure into the rightmost portion of a
6414 // local variable slot, then load the whole slot into the
6415 // register.
6416 // FIXME: The memcpy seems to produce pretty awful code for
6417 // small aggregates, particularly for packed ones.
6418 // FIXME: It would be preferable to use the slot in the
6419 // parameter save area instead of a new local variable.
6420 SDValue AddPtr = PtrOff;
6421 if (!isLittleEndian) {
6422 SDValue Const = DAG.getConstant(Val: 8 - Size, DL: dl, VT: PtrOff.getValueType());
6423 AddPtr = DAG.getNode(Opcode: ISD::ADD, DL: dl, VT: PtrVT, N1: PtrOff, N2: Const);
6424 }
6425 Chain = CallSeqStart = createMemcpyOutsideCallSeq(Arg, PtrOff: AddPtr,
6426 CallSeqStart,
6427 Flags, DAG, dl);
6428
6429 // Load the slot into the register.
6430 SDValue Load =
6431 DAG.getLoad(VT: PtrVT, dl, Chain, Ptr: PtrOff, PtrInfo: MachinePointerInfo());
6432 MemOpChains.push_back(Elt: Load.getValue(R: 1));
6433 RegsToPass.push_back(Elt: std::make_pair(x: GPR[GPR_idx++], y&: Load));
6434
6435 // Done with this argument.
6436 ArgOffset += PtrByteSize;
6437 continue;
6438 }
6439
6440 // For aggregates larger than PtrByteSize, copy the pieces of the
6441 // object that fit into registers from the parameter save area.
6442 for (unsigned j=0; j<Size; j+=PtrByteSize) {
6443 SDValue Const = DAG.getConstant(Val: j, DL: dl, VT: PtrOff.getValueType());
6444 SDValue AddArg = DAG.getNode(Opcode: ISD::ADD, DL: dl, VT: PtrVT, N1: Arg, N2: Const);
6445 if (GPR_idx != NumGPRs) {
6446 unsigned LoadSizeInBits = std::min(a: PtrByteSize, b: (Size - j)) * 8;
6447 EVT ObjType = EVT::getIntegerVT(Context&: *DAG.getContext(), BitWidth: LoadSizeInBits);
6448 SDValue Load = DAG.getExtLoad(ExtType: ISD::EXTLOAD, dl, VT: PtrVT, Chain, Ptr: AddArg,
6449 PtrInfo: MachinePointerInfo(), MemVT: ObjType);
6450
6451 MemOpChains.push_back(Elt: Load.getValue(R: 1));
6452 RegsToPass.push_back(Elt: std::make_pair(x: GPR[GPR_idx++], y&: Load));
6453 ArgOffset += PtrByteSize;
6454 } else {
6455 ArgOffset += ((Size - j + PtrByteSize-1)/PtrByteSize)*PtrByteSize;
6456 break;
6457 }
6458 }
6459 continue;
6460 }
6461
6462 switch (Arg.getSimpleValueType().SimpleTy) {
6463 default: llvm_unreachable("Unexpected ValueType for argument!");
6464 case MVT::i1:
6465 case MVT::i32:
6466 case MVT::i64:
6467 if (Flags.isNest()) {
6468 // The 'nest' parameter, if any, is passed in R11.
6469 RegsToPass.push_back(Elt: std::make_pair(x: PPC::X11, y&: Arg));
6470 break;
6471 }
6472
6473 // These can be scalar arguments or elements of an integer array type
6474 // passed directly. Clang may use those instead of "byval" aggregate
6475 // types to avoid forcing arguments to memory unnecessarily.
6476 if (GPR_idx != NumGPRs) {
6477 RegsToPass.push_back(Elt: std::make_pair(x: GPR[GPR_idx++], y&: Arg));
6478 } else {
6479 if (IsFastCall)
6480 ComputePtrOff();
6481
6482 assert(HasParameterArea &&
6483 "Parameter area must exist to pass an argument in memory.");
6484 LowerMemOpCallTo(DAG, MF, Chain, Arg, PtrOff, SPDiff, ArgOffset,
6485 isPPC64: true, isTailCall: CFlags.IsTailCall, isVector: false, MemOpChains,
6486 TailCallArguments, dl);
6487 if (IsFastCall)
6488 ArgOffset += PtrByteSize;
6489 }
6490 if (!IsFastCall)
6491 ArgOffset += PtrByteSize;
6492 break;
6493 case MVT::f32:
6494 case MVT::f64: {
6495 // These can be scalar arguments or elements of a float array type
6496 // passed directly. The latter are used to implement ELFv2 homogenous
6497 // float aggregates.
6498
6499 // Named arguments go into FPRs first, and once they overflow, the
6500 // remaining arguments go into GPRs and then the parameter save area.
6501 // Unnamed arguments for vararg functions always go to GPRs and
6502 // then the parameter save area. For now, put all arguments to vararg
6503 // routines always in both locations (FPR *and* GPR or stack slot).
6504 bool NeedGPROrStack = CFlags.IsVarArg || FPR_idx == NumFPRs;
6505 bool NeededLoad = false;
6506
6507 // First load the argument into the next available FPR.
6508 if (FPR_idx != NumFPRs)
6509 RegsToPass.push_back(Elt: std::make_pair(x: FPR[FPR_idx++], y&: Arg));
6510
6511 // Next, load the argument into GPR or stack slot if needed.
6512 if (!NeedGPROrStack)
6513 ;
6514 else if (GPR_idx != NumGPRs && !IsFastCall) {
6515 // FIXME: We may want to re-enable this for CallingConv::Fast on the P8
6516 // once we support fp <-> gpr moves.
6517
6518 // In the non-vararg case, this can only ever happen in the
6519 // presence of f32 array types, since otherwise we never run
6520 // out of FPRs before running out of GPRs.
6521 SDValue ArgVal;
6522
6523 // Double values are always passed in a single GPR.
6524 if (Arg.getValueType() != MVT::f32) {
6525 ArgVal = DAG.getNode(Opcode: ISD::BITCAST, DL: dl, VT: MVT::i64, Operand: Arg);
6526
6527 // Non-array float values are extended and passed in a GPR.
6528 } else if (!Flags.isInConsecutiveRegs()) {
6529 ArgVal = DAG.getNode(Opcode: ISD::BITCAST, DL: dl, VT: MVT::i32, Operand: Arg);
6530 ArgVal = DAG.getNode(Opcode: ISD::ANY_EXTEND, DL: dl, VT: MVT::i64, Operand: ArgVal);
6531
6532 // If we have an array of floats, we collect every odd element
6533 // together with its predecessor into one GPR.
6534 } else if (ArgOffset % PtrByteSize != 0) {
6535 SDValue Lo, Hi;
6536 Lo = DAG.getNode(Opcode: ISD::BITCAST, DL: dl, VT: MVT::i32, Operand: OutVals[i - 1]);
6537 Hi = DAG.getNode(Opcode: ISD::BITCAST, DL: dl, VT: MVT::i32, Operand: Arg);
6538 if (!isLittleEndian)
6539 std::swap(a&: Lo, b&: Hi);
6540 ArgVal = DAG.getNode(Opcode: ISD::BUILD_PAIR, DL: dl, VT: MVT::i64, N1: Lo, N2: Hi);
6541
6542 // The final element, if even, goes into the first half of a GPR.
6543 } else if (Flags.isInConsecutiveRegsLast()) {
6544 ArgVal = DAG.getNode(Opcode: ISD::BITCAST, DL: dl, VT: MVT::i32, Operand: Arg);
6545 ArgVal = DAG.getNode(Opcode: ISD::ANY_EXTEND, DL: dl, VT: MVT::i64, Operand: ArgVal);
6546 if (!isLittleEndian)
6547 ArgVal = DAG.getNode(Opcode: ISD::SHL, DL: dl, VT: MVT::i64, N1: ArgVal,
6548 N2: DAG.getConstant(Val: 32, DL: dl, VT: MVT::i32));
6549
          // Non-final even elements are skipped; they will be handled
          // together with the subsequent argument on the next go-around.
6552 } else
6553 ArgVal = SDValue();
6554
6555 if (ArgVal.getNode())
6556 RegsToPass.push_back(Elt: std::make_pair(x: GPR[GPR_idx++], y&: ArgVal));
6557 } else {
6558 if (IsFastCall)
6559 ComputePtrOff();
6560
6561 // Single-precision floating-point values are mapped to the
6562 // second (rightmost) word of the stack doubleword.
6563 if (Arg.getValueType() == MVT::f32 &&
6564 !isLittleEndian && !Flags.isInConsecutiveRegs()) {
6565 SDValue ConstFour = DAG.getConstant(Val: 4, DL: dl, VT: PtrOff.getValueType());
6566 PtrOff = DAG.getNode(Opcode: ISD::ADD, DL: dl, VT: PtrVT, N1: PtrOff, N2: ConstFour);
6567 }
6568
6569 assert(HasParameterArea &&
6570 "Parameter area must exist to pass an argument in memory.");
6571 LowerMemOpCallTo(DAG, MF, Chain, Arg, PtrOff, SPDiff, ArgOffset,
6572 isPPC64: true, isTailCall: CFlags.IsTailCall, isVector: false, MemOpChains,
6573 TailCallArguments, dl);
6574
6575 NeededLoad = true;
6576 }
6577 // When passing an array of floats, the array occupies consecutive
6578 // space in the argument area; only round up to the next doubleword
6579 // at the end of the array. Otherwise, each float takes 8 bytes.
6580 if (!IsFastCall || NeededLoad) {
6581 ArgOffset += (Arg.getValueType() == MVT::f32 &&
6582 Flags.isInConsecutiveRegs()) ? 4 : 8;
6583 if (Flags.isInConsecutiveRegsLast())
6584 ArgOffset = ((ArgOffset + PtrByteSize - 1)/PtrByteSize) * PtrByteSize;
6585 }
6586 break;
6587 }
6588 case MVT::v4f32:
6589 case MVT::v4i32:
6590 case MVT::v8i16:
6591 case MVT::v16i8:
6592 case MVT::v2f64:
6593 case MVT::v2i64:
6594 case MVT::v1i128:
6595 case MVT::f128:
6596 // These can be scalar arguments or elements of a vector array type
6597 // passed directly. The latter are used to implement ELFv2 homogenous
6598 // vector aggregates.
6599
6600 // For a varargs call, named arguments go into VRs or on the stack as
6601 // usual; unnamed arguments always go to the stack or the corresponding
6602 // GPRs when within range. For now, we always put the value in both
6603 // locations (or even all three).
6604 if (CFlags.IsVarArg) {
6605 assert(HasParameterArea &&
6606 "Parameter area must exist if we have a varargs call.");
6607 // We could elide this store in the case where the object fits
6608 // entirely in R registers. Maybe later.
6609 SDValue Store =
6610 DAG.getStore(Chain, dl, Val: Arg, Ptr: PtrOff, PtrInfo: MachinePointerInfo());
6611 MemOpChains.push_back(Elt: Store);
6612 if (VR_idx != NumVRs) {
6613 SDValue Load =
6614 DAG.getLoad(VT: MVT::v4f32, dl, Chain: Store, Ptr: PtrOff, PtrInfo: MachinePointerInfo());
6615 MemOpChains.push_back(Elt: Load.getValue(R: 1));
6616 RegsToPass.push_back(Elt: std::make_pair(x: VR[VR_idx++], y&: Load));
6617 }
6618 ArgOffset += 16;
6619 for (unsigned i=0; i<16; i+=PtrByteSize) {
6620 if (GPR_idx == NumGPRs)
6621 break;
6622 SDValue Ix = DAG.getNode(Opcode: ISD::ADD, DL: dl, VT: PtrVT, N1: PtrOff,
6623 N2: DAG.getConstant(Val: i, DL: dl, VT: PtrVT));
6624 SDValue Load =
6625 DAG.getLoad(VT: PtrVT, dl, Chain: Store, Ptr: Ix, PtrInfo: MachinePointerInfo());
6626 MemOpChains.push_back(Elt: Load.getValue(R: 1));
6627 RegsToPass.push_back(Elt: std::make_pair(x: GPR[GPR_idx++], y&: Load));
6628 }
6629 break;
6630 }
6631
6632 // Non-varargs Altivec params go into VRs or on the stack.
6633 if (VR_idx != NumVRs) {
6634 RegsToPass.push_back(Elt: std::make_pair(x: VR[VR_idx++], y&: Arg));
6635 } else {
6636 if (IsFastCall)
6637 ComputePtrOff();
6638
6639 assert(HasParameterArea &&
6640 "Parameter area must exist to pass an argument in memory.");
6641 LowerMemOpCallTo(DAG, MF, Chain, Arg, PtrOff, SPDiff, ArgOffset,
6642 isPPC64: true, isTailCall: CFlags.IsTailCall, isVector: true, MemOpChains,
6643 TailCallArguments, dl);
6644 if (IsFastCall)
6645 ArgOffset += 16;
6646 }
6647
6648 if (!IsFastCall)
6649 ArgOffset += 16;
6650 break;
6651 }
6652 }
6653
6654 assert((!HasParameterArea || NumBytesActuallyUsed == ArgOffset) &&
6655 "mismatch in size of parameter area");
6656 (void)NumBytesActuallyUsed;
6657
6658 if (!MemOpChains.empty())
6659 Chain = DAG.getNode(Opcode: ISD::TokenFactor, DL: dl, VT: MVT::Other, Ops: MemOpChains);
6660
6661 // Check if this is an indirect call (MTCTR/BCTRL).
6662 // See prepareDescriptorIndirectCall and buildCallOperands for more
6663 // information about calls through function pointers in the 64-bit SVR4 ABI.
6664 if (CFlags.IsIndirect) {
6665 // For 64-bit ELFv2 ABI with PCRel, do not save the TOC of the
6666 // caller in the TOC save area.
6667 if (isTOCSaveRestoreRequired(Subtarget)) {
6668 assert(!CFlags.IsTailCall && "Indirect tails calls not supported");
6669 // Load r2 into a virtual register and store it to the TOC save area.
6670 setUsesTOCBasePtr(DAG);
6671 SDValue Val = DAG.getCopyFromReg(Chain, dl, Reg: PPC::X2, VT: MVT::i64);
6672 // TOC save area offset.
6673 unsigned TOCSaveOffset = Subtarget.getFrameLowering()->getTOCSaveOffset();
6674 SDValue PtrOff = DAG.getIntPtrConstant(Val: TOCSaveOffset, DL: dl);
6675 SDValue AddPtr = DAG.getNode(Opcode: ISD::ADD, DL: dl, VT: PtrVT, N1: StackPtr, N2: PtrOff);
6676 Chain = DAG.getStore(Chain: Val.getValue(R: 1), dl, Val, Ptr: AddPtr,
6677 PtrInfo: MachinePointerInfo::getStack(
6678 MF&: DAG.getMachineFunction(), Offset: TOCSaveOffset));
6679 }
6680 // In the ELFv2 ABI, R12 must contain the address of an indirect callee.
6681 // This does not mean the MTCTR instruction must use R12; it's easier
6682 // to model this as an extra parameter, so do that.
6683 if (isELFv2ABI && !CFlags.IsPatchPoint)
6684 RegsToPass.push_back(Elt: std::make_pair(x: (unsigned)PPC::X12, y&: Callee));
6685 }
6686
6687 // Build a sequence of copy-to-reg nodes chained together with token chain
6688 // and flag operands which copy the outgoing args into the appropriate regs.
6689 SDValue InGlue;
6690 for (const auto &[Reg, N] : RegsToPass) {
6691 Chain = DAG.getCopyToReg(Chain, dl, Reg, N, Glue: InGlue);
6692 InGlue = Chain.getValue(R: 1);
6693 }
6694
6695 if (CFlags.IsTailCall && !IsSibCall)
6696 PrepareTailCall(DAG, InGlue, Chain, dl, SPDiff, NumBytes, LROp, FPOp,
6697 TailCallArguments);
6698
6699 return FinishCall(CFlags, dl, DAG, RegsToPass, Glue: InGlue, Chain, CallSeqStart,
6700 Callee, SPDiff, NumBytes, Ins, InVals, CB);
6701}
6702
6703// Returns true when the shadow of a general purpose argument register
6704// in the parameter save area is aligned to at least 'RequiredAlign'.
6705static bool isGPRShadowAligned(MCPhysReg Reg, Align RequiredAlign) {
6706 assert(RequiredAlign.value() <= 16 &&
6707 "Required alignment greater than stack alignment.");
6708 switch (Reg) {
6709 default:
6710 report_fatal_error(reason: "called on invalid register.");
6711 case PPC::R5:
6712 case PPC::R9:
6713 case PPC::X3:
6714 case PPC::X5:
6715 case PPC::X7:
6716 case PPC::X9:
6717 // These registers are 16 byte aligned which is the most strict aligment
6718 // we can support.
6719 return true;
6720 case PPC::R3:
6721 case PPC::R7:
6722 case PPC::X4:
6723 case PPC::X6:
6724 case PPC::X8:
6725 case PPC::X10:
6726 // The shadow of these registers in the PSA is 8 byte aligned.
6727 return RequiredAlign <= 8;
6728 case PPC::R4:
6729 case PPC::R6:
6730 case PPC::R8:
6731 case PPC::R10:
6732 return RequiredAlign <= 4;
6733 }
6734}
6735
// CC_AIX - Custom calling-convention function implementing the AIX ABI for
// both 32-bit and 64-bit PowerPC. It assigns the argument described by
// ValNo/ValVT/LocVT to registers, to a parameter save area (PSA) stack
// offset, or to both (recorded as "custom" locations the lowering code
// handles specially), accumulating each decision in State. Per the
// CCCustomFn convention, returning false means the argument was handled;
// true is returned only for value types this function cannot assign.
// NOTE(review): OrigTy appears unused here; presumably it exists to match
// the required CCCustomFn signature — confirm against the declaration site.
static bool CC_AIX(unsigned ValNo, MVT ValVT, MVT LocVT,
                   CCValAssign::LocInfo LocInfo, ISD::ArgFlagsTy ArgFlags,
                   Type *OrigTy, CCState &State) {
  const PPCSubtarget &Subtarget = static_cast<const PPCSubtarget &>(
      State.getMachineFunction().getSubtarget());
  const bool IsPPC64 = Subtarget.isPPC64();
  const unsigned PtrSize = IsPPC64 ? 8 : 4;
  const Align PtrAlign(PtrSize);
  // 16 bytes is the maximum alignment this convention supports (see the
  // byval check below and the assertion in isGPRShadowAligned).
  const Align StackAlign(16);
  const MVT RegVT = Subtarget.getScalarIntVT();

  if (ValVT == MVT::f128)
    report_fatal_error(reason: "f128 is unimplemented on AIX.");

  static const MCPhysReg GPR_32[] = {// 32-bit registers.
                                     PPC::R3, PPC::R4, PPC::R5, PPC::R6,
                                     PPC::R7, PPC::R8, PPC::R9, PPC::R10};
  static const MCPhysReg GPR_64[] = {// 64-bit registers.
                                     PPC::X3, PPC::X4, PPC::X5, PPC::X6,
                                     PPC::X7, PPC::X8, PPC::X9, PPC::X10};

  static const MCPhysReg VR[] = {// Vector registers.
                                 PPC::V2, PPC::V3, PPC::V4, PPC::V5,
                                 PPC::V6, PPC::V7, PPC::V8, PPC::V9,
                                 PPC::V10, PPC::V11, PPC::V12, PPC::V13};

  const ArrayRef<MCPhysReg> GPRs = IsPPC64 ? GPR_64 : GPR_32;

  // The 'nest' parameter, if present, always travels in R11/X11 and does
  // not consume a PSA slot; only one such argument may exist per call.
  if (ArgFlags.isNest()) {
    MCRegister EnvReg = State.AllocateReg(Reg: IsPPC64 ? PPC::X11 : PPC::R11);
    if (!EnvReg)
      report_fatal_error(reason: "More then one nest argument.");
    State.addLoc(V: CCValAssign::getReg(ValNo, ValVT, Reg: EnvReg, LocVT: RegVT, HTP: LocInfo));
    return false;
  }

  if (ArgFlags.isByVal()) {
    const Align ByValAlign(ArgFlags.getNonZeroByValAlign());
    if (ByValAlign > StackAlign)
      report_fatal_error(reason: "Pass-by-value arguments with alignment greater than "
                         "16 are not supported.");

    const unsigned ByValSize = ArgFlags.getByValSize();
    // The object is placed at the larger of its own alignment and pointer
    // alignment, since registers shadow pointer-sized PSA slots.
    const Align ObjAlign = ByValAlign > PtrAlign ? ByValAlign : PtrAlign;

    // An empty aggregate parameter takes up no storage and no registers,
    // but needs a MemLoc for a stack slot for the formal arguments side.
    if (ByValSize == 0) {
      State.addLoc(V: CCValAssign::getMem(ValNo, ValVT: MVT::INVALID_SIMPLE_VALUE_TYPE,
                                        Offset: State.getStackSize(), LocVT: RegVT, HTP: LocInfo));
      return false;
    }

    // Shadow allocate any registers that are not properly aligned.
    unsigned NextReg = State.getFirstUnallocated(Regs: GPRs);
    while (NextReg != GPRs.size() &&
           !isGPRShadowAligned(Reg: GPRs[NextReg], RequiredAlign: ObjAlign)) {
      // Shadow allocate the next register, since its shadow's alignment is
      // not strict enough for this object.
      MCRegister Reg = State.AllocateReg(Regs: GPRs);
      // Allocate the stack space shadowed by said register.
      State.AllocateStack(Size: PtrSize, Alignment: PtrAlign);
      assert(Reg && "Alocating register unexpectedly failed.");
      (void)Reg;
      NextReg = State.getFirstUnallocated(Regs: GPRs);
    }

    // Walk the rounded-up object one pointer-width at a time: each piece
    // goes in a GPR while they last; the first piece that misses a register
    // gets a single MemLoc covering the remainder, then we stop.
    const unsigned StackSize = alignTo(Size: ByValSize, A: ObjAlign);
    unsigned Offset = State.AllocateStack(Size: StackSize, Alignment: ObjAlign);
    for (const unsigned E = Offset + StackSize; Offset < E; Offset += PtrSize) {
      if (MCRegister Reg = State.AllocateReg(Regs: GPRs))
        State.addLoc(V: CCValAssign::getReg(ValNo, ValVT, Reg, LocVT: RegVT, HTP: LocInfo));
      else {
        State.addLoc(V: CCValAssign::getMem(ValNo, ValVT: MVT::INVALID_SIMPLE_VALUE_TYPE,
                                          Offset, LocVT: MVT::INVALID_SIMPLE_VALUE_TYPE,
                                          HTP: LocInfo));
        break;
      }
    }
    return false;
  }

  // Arguments always reserve parameter save area.
  switch (ValVT.SimpleTy) {
  default:
    report_fatal_error(reason: "Unhandled value type for argument.");
  case MVT::i64:
    // i64 arguments should have been split to i32 for PPC32.
    assert(IsPPC64 && "PPC32 should have split i64 values.");
    [[fallthrough]];
  case MVT::i1:
  case MVT::i32: {
    const unsigned Offset = State.AllocateStack(Size: PtrSize, Alignment: PtrAlign);
    // AIX integer arguments are always passed in register width, so
    // narrower values are extended per their sext/zext argument flag.
    if (ValVT.getFixedSizeInBits() < RegVT.getFixedSizeInBits())
      LocInfo = ArgFlags.isSExt() ? CCValAssign::LocInfo::SExt
                                  : CCValAssign::LocInfo::ZExt;
    if (MCRegister Reg = State.AllocateReg(Regs: GPRs))
      State.addLoc(V: CCValAssign::getReg(ValNo, ValVT, Reg, LocVT: RegVT, HTP: LocInfo));
    else
      State.addLoc(V: CCValAssign::getMem(ValNo, ValVT, Offset, LocVT: RegVT, HTP: LocInfo));

    return false;
  }
  case MVT::f32:
  case MVT::f64: {
    // Parameter save area (PSA) is reserved even if the float passes in fpr.
    const unsigned StoreSize = LocVT.getStoreSize();
    // Floats are always 4-byte aligned in the PSA on AIX.
    // This includes f64 in 64-bit mode for ABI compatibility.
    const unsigned Offset =
        State.AllocateStack(Size: IsPPC64 ? 8 : StoreSize, Alignment: Align(4));
    MCRegister FReg = State.AllocateReg(Regs: FPR);
    if (FReg)
      State.addLoc(V: CCValAssign::getReg(ValNo, ValVT, Reg: FReg, LocVT, HTP: LocInfo));

    // Reserve and initialize GPRs or initialize the PSA as required.
    for (unsigned I = 0; I < StoreSize; I += PtrSize) {
      if (MCRegister Reg = State.AllocateReg(Regs: GPRs)) {
        assert(FReg && "An FPR should be available when a GPR is reserved.");
        if (State.isVarArg()) {
          // Successfully reserved GPRs are only initialized for vararg calls.
          // Custom handling is required for:
          //   f64 in PPC32 needs to be split into 2 GPRs.
          //   f32 in PPC64 needs to occupy only lower 32 bits of 64-bit GPR.
          State.addLoc(
              V: CCValAssign::getCustomReg(ValNo, ValVT, Reg, LocVT: RegVT, HTP: LocInfo));
        }
      } else {
        // If there are insufficient GPRs, the PSA needs to be initialized.
        // Initialization occurs even if an FPR was initialized for
        // compatibility with the AIX XL compiler. The full memory for the
        // argument will be initialized even if a prior word is saved in GPR.
        // A custom memLoc is used when the argument also passes in FPR so
        // that the callee handling can skip over it easily.
        State.addLoc(
            V: FReg ? CCValAssign::getCustomMem(ValNo, ValVT, Offset, LocVT,
                                              HTP: LocInfo)
                  : CCValAssign::getMem(ValNo, ValVT, Offset, LocVT, HTP: LocInfo));
        break;
      }
    }

    return false;
  }
  case MVT::v4f32:
  case MVT::v4i32:
  case MVT::v8i16:
  case MVT::v16i8:
  case MVT::v2i64:
  case MVT::v2f64:
  case MVT::v1i128: {
    const unsigned VecSize = 16;
    const Align VecAlign(VecSize);

    if (!State.isVarArg()) {
      // If there are vector registers remaining we don't consume any stack
      // space.
      if (MCRegister VReg = State.AllocateReg(Regs: VR)) {
        State.addLoc(V: CCValAssign::getReg(ValNo, ValVT, Reg: VReg, LocVT, HTP: LocInfo));
        return false;
      }
      // Vectors passed on the stack do not shadow GPRs or FPRs even though they
      // might be allocated in the portion of the PSA that is shadowed by the
      // GPRs.
      const unsigned Offset = State.AllocateStack(Size: VecSize, Alignment: VecAlign);
      State.addLoc(V: CCValAssign::getMem(ValNo, ValVT, Offset, LocVT, HTP: LocInfo));
      return false;
    }

    unsigned NextRegIndex = State.getFirstUnallocated(Regs: GPRs);
    // Burn any underaligned registers and their shadowed stack space until
    // we reach the required alignment.
    while (NextRegIndex != GPRs.size() &&
           !isGPRShadowAligned(Reg: GPRs[NextRegIndex], RequiredAlign: VecAlign)) {
      // Shadow allocate register and its stack shadow.
      MCRegister Reg = State.AllocateReg(Regs: GPRs);
      State.AllocateStack(Size: PtrSize, Alignment: PtrAlign);
      assert(Reg && "Allocating register unexpectedly failed.");
      (void)Reg;
      NextRegIndex = State.getFirstUnallocated(Regs: GPRs);
    }

    // Vectors that are passed as fixed arguments are handled differently.
    // They are passed in VRs if any are available (unlike arguments passed
    // through ellipses) and shadow GPRs (unlike arguments to non-vaarg
    // functions).
    if (!ArgFlags.isVarArg()) {
      if (MCRegister VReg = State.AllocateReg(Regs: VR)) {
        State.addLoc(V: CCValAssign::getReg(ValNo, ValVT, Reg: VReg, LocVT, HTP: LocInfo));
        // Shadow allocate GPRs and stack space even though we pass in a VR.
        for (unsigned I = 0; I != VecSize; I += PtrSize)
          State.AllocateReg(Regs: GPRs);
        State.AllocateStack(Size: VecSize, Alignment: VecAlign);
        return false;
      }
      // No vector registers remain so pass on the stack.
      const unsigned Offset = State.AllocateStack(Size: VecSize, Alignment: VecAlign);
      State.addLoc(V: CCValAssign::getMem(ValNo, ValVT, Offset, LocVT, HTP: LocInfo));
      return false;
    }

    // If all GPRS are consumed then we pass the argument fully on the stack.
    if (NextRegIndex == GPRs.size()) {
      const unsigned Offset = State.AllocateStack(Size: VecSize, Alignment: VecAlign);
      State.addLoc(V: CCValAssign::getMem(ValNo, ValVT, Offset, LocVT, HTP: LocInfo));
      return false;
    }

    // Corner case for 32-bit codegen. We have 2 registers to pass the first
    // half of the argument, and then need to pass the remaining half on the
    // stack.
    if (GPRs[NextRegIndex] == PPC::R9) {
      const unsigned Offset = State.AllocateStack(Size: VecSize, Alignment: VecAlign);
      State.addLoc(
          V: CCValAssign::getCustomMem(ValNo, ValVT, Offset, LocVT, HTP: LocInfo));

      const MCRegister FirstReg = State.AllocateReg(Reg: PPC::R9);
      const MCRegister SecondReg = State.AllocateReg(Reg: PPC::R10);
      assert(FirstReg && SecondReg &&
             "Allocating R9 or R10 unexpectedly failed.");
      State.addLoc(
          V: CCValAssign::getCustomReg(ValNo, ValVT, Reg: FirstReg, LocVT: RegVT, HTP: LocInfo));
      State.addLoc(
          V: CCValAssign::getCustomReg(ValNo, ValVT, Reg: SecondReg, LocVT: RegVT, HTP: LocInfo));
      return false;
    }

    // We have enough GPRs to fully pass the vector argument, and we have
    // already consumed any underaligned registers. Start with the custom
    // MemLoc and then the custom RegLocs.
    const unsigned Offset = State.AllocateStack(Size: VecSize, Alignment: VecAlign);
    State.addLoc(
        V: CCValAssign::getCustomMem(ValNo, ValVT, Offset, LocVT, HTP: LocInfo));
    for (unsigned I = 0; I != VecSize; I += PtrSize) {
      const MCRegister Reg = State.AllocateReg(Regs: GPRs);
      assert(Reg && "Failed to allocated register for vararg vector argument");
      State.addLoc(
          V: CCValAssign::getCustomReg(ValNo, ValVT, Reg, LocVT: RegVT, HTP: LocInfo));
    }
    return false;
  }
  }
  return true;
}
6980
6981// So far, this function is only used by LowerFormalArguments_AIX()
6982static const TargetRegisterClass *getRegClassForSVT(MVT::SimpleValueType SVT,
6983 bool IsPPC64,
6984 bool HasP8Vector,
6985 bool HasVSX) {
6986 assert((IsPPC64 || SVT != MVT::i64) &&
6987 "i64 should have been split for 32-bit codegen.");
6988
6989 switch (SVT) {
6990 default:
6991 report_fatal_error(reason: "Unexpected value type for formal argument");
6992 case MVT::i1:
6993 case MVT::i32:
6994 case MVT::i64:
6995 return IsPPC64 ? &PPC::G8RCRegClass : &PPC::GPRCRegClass;
6996 case MVT::f32:
6997 return HasP8Vector ? &PPC::VSSRCRegClass : &PPC::F4RCRegClass;
6998 case MVT::f64:
6999 return HasVSX ? &PPC::VSFRCRegClass : &PPC::F8RCRegClass;
7000 case MVT::v4f32:
7001 case MVT::v4i32:
7002 case MVT::v8i16:
7003 case MVT::v16i8:
7004 case MVT::v2i64:
7005 case MVT::v2f64:
7006 case MVT::v1i128:
7007 return &PPC::VRRCRegClass;
7008 }
7009}
7010
7011static SDValue truncateScalarIntegerArg(ISD::ArgFlagsTy Flags, EVT ValVT,
7012 SelectionDAG &DAG, SDValue ArgValue,
7013 MVT LocVT, const SDLoc &dl) {
7014 assert(ValVT.isScalarInteger() && LocVT.isScalarInteger());
7015 assert(ValVT.getFixedSizeInBits() < LocVT.getFixedSizeInBits());
7016
7017 if (Flags.isSExt())
7018 ArgValue = DAG.getNode(Opcode: ISD::AssertSext, DL: dl, VT: LocVT, N1: ArgValue,
7019 N2: DAG.getValueType(ValVT));
7020 else if (Flags.isZExt())
7021 ArgValue = DAG.getNode(Opcode: ISD::AssertZext, DL: dl, VT: LocVT, N1: ArgValue,
7022 N2: DAG.getValueType(ValVT));
7023
7024 return DAG.getNode(Opcode: ISD::TRUNCATE, DL: dl, VT: ValVT, Operand: ArgValue);
7025}
7026
7027static unsigned mapArgRegToOffsetAIX(unsigned Reg, const PPCFrameLowering *FL) {
7028 const unsigned LASize = FL->getLinkageSize();
7029
7030 if (PPC::GPRCRegClass.contains(Reg)) {
7031 assert(Reg >= PPC::R3 && Reg <= PPC::R10 &&
7032 "Reg must be a valid argument register!");
7033 return LASize + 4 * (Reg - PPC::R3);
7034 }
7035
7036 if (PPC::G8RCRegClass.contains(Reg)) {
7037 assert(Reg >= PPC::X3 && Reg <= PPC::X10 &&
7038 "Reg must be a valid argument register!");
7039 return LASize + 8 * (Reg - PPC::X3);
7040 }
7041
7042 llvm_unreachable("Only general purpose registers expected.");
7043}
7044
7045// AIX ABI Stack Frame Layout:
7046//
7047// Low Memory +--------------------------------------------+
7048// SP +---> | Back chain | ---+
7049// | +--------------------------------------------+ |
7050// | | Saved Condition Register | |
7051// | +--------------------------------------------+ |
7052// | | Saved Linkage Register | |
7053// | +--------------------------------------------+ | Linkage Area
7054// | | Reserved for compilers | |
7055// | +--------------------------------------------+ |
7056// | | Reserved for binders | |
7057// | +--------------------------------------------+ |
7058// | | Saved TOC pointer | ---+
7059// | +--------------------------------------------+
7060// | | Parameter save area |
7061// | +--------------------------------------------+
7062// | | Alloca space |
7063// | +--------------------------------------------+
7064// | | Local variable space |
7065// | +--------------------------------------------+
7066// | | Float/int conversion temporary |
7067// | +--------------------------------------------+
7068// | | Save area for AltiVec registers |
7069// | +--------------------------------------------+
7070// | | AltiVec alignment padding |
7071// | +--------------------------------------------+
7072// | | Save area for VRSAVE register |
7073// | +--------------------------------------------+
7074// | | Save area for General Purpose registers |
7075// | +--------------------------------------------+
7076// | | Save area for Floating Point registers |
7077// | +--------------------------------------------+
7078// +---- | Back chain |
7079// High Memory +--------------------------------------------+
7080//
7081// Specifications:
7082// AIX 7.2 Assembler Language Reference
7083// Subroutine linkage convention
7084
// Lowers the incoming (formal) arguments of a function under the AIX ABI.
// CC_AIX assigns each argument a register and/or a slot in the caller's
// parameter save area; this routine copies register arguments out of their
// entry registers, loads stack arguments from fixed frame objects, appends
// the resulting SDValues to InVals in argument order, records parameter type
// classes in PPCFunctionInfo, and sets up the frame index used by va_start
// for variadic functions.
SDValue PPCTargetLowering::LowerFormalArguments_AIX(
    SDValue Chain, CallingConv::ID CallConv, bool isVarArg,
    const SmallVectorImpl<ISD::InputArg> &Ins, const SDLoc &dl,
    SelectionDAG &DAG, SmallVectorImpl<SDValue> &InVals) const {

  assert((CallConv == CallingConv::C || CallConv == CallingConv::Cold ||
          CallConv == CallingConv::Fast) &&
         "Unexpected calling convention!");

  if (getTargetMachine().Options.GuaranteedTailCallOpt)
    report_fatal_error(reason: "Tail call support is unimplemented on AIX.");

  if (useSoftFloat())
    report_fatal_error(reason: "Soft float support is unimplemented on AIX.");

  const PPCSubtarget &Subtarget = DAG.getSubtarget<PPCSubtarget>();

  const bool IsPPC64 = Subtarget.isPPC64();
  const unsigned PtrByteSize = IsPPC64 ? 8 : 4;

  // Assign locations to all of the incoming arguments.
  SmallVector<CCValAssign, 16> ArgLocs;
  MachineFunction &MF = DAG.getMachineFunction();
  MachineFrameInfo &MFI = MF.getFrameInfo();
  PPCFunctionInfo *FuncInfo = MF.getInfo<PPCFunctionInfo>();
  CCState CCInfo(CallConv, isVarArg, MF, ArgLocs, *DAG.getContext());

  const EVT PtrVT = getPointerTy(DL: MF.getDataLayout());
  // Reserve space for the linkage area on the stack.
  const unsigned LinkageSize = Subtarget.getFrameLowering()->getLinkageSize();
  CCInfo.AllocateStack(Size: LinkageSize, Alignment: Align(PtrByteSize));
  // Running offset (just past the linkage area) at which register arguments
  // are spilled when the "save-reg-params" function attribute is present.
  uint64_t SaveStackPos = CCInfo.getStackSize();
  bool SaveParams = MF.getFunction().hasFnAttribute(Kind: "save-reg-params");
  CCInfo.AnalyzeFormalArguments(Ins, Fn: CC_AIX);

  // Chains of all stores emitted below; token-factored into Chain at the end.
  SmallVector<SDValue, 8> MemOps;

  for (size_t I = 0, End = ArgLocs.size(); I != End; /* No increment here */) {
    CCValAssign &VA = ArgLocs[I++];
    MVT LocVT = VA.getLocVT();
    MVT ValVT = VA.getValVT();
    ISD::ArgFlagsTy Flags = Ins[VA.getValNo()].Flags;

    EVT ArgVT = Ins[VA.getValNo()].ArgVT;
    bool ArgSignExt = Ins[VA.getValNo()].Flags.isSExt();
    // For compatibility with the AIX XL compiler, the float args in the
    // parameter save area are initialized even if the argument is available
    // in register. The caller is required to initialize both the register
    // and memory, however, the callee can choose to expect it in either.
    // The memloc is dismissed here because the argument is retrieved from
    // the register.
    if (VA.isMemLoc() && VA.needsCustom() && ValVT.isFloatingPoint())
      continue;

    // Spill register arguments back to the parameter save area so that a
    // debugger can observe them in memory ("save-reg-params").
    if (SaveParams && VA.isRegLoc() && !Flags.isByVal() && !VA.needsCustom()) {
      const TargetRegisterClass *RegClass = getRegClassForSVT(
          SVT: LocVT.SimpleTy, IsPPC64, HasP8Vector: Subtarget.hasP8Vector(), HasVSX: Subtarget.hasVSX());
      // On PPC64, debugger assumes extended 8-byte values are stored from GPR.
      MVT SaveVT = RegClass == &PPC::G8RCRegClass ? MVT::i64 : LocVT;
      const Register VReg = MF.addLiveIn(PReg: VA.getLocReg(), RC: RegClass);
      SDValue Parm = DAG.getCopyFromReg(Chain, dl, Reg: VReg, VT: SaveVT);
      int FI = MFI.CreateFixedObject(Size: SaveVT.getStoreSize(), SPOffset: SaveStackPos, IsImmutable: true);
      SDValue FIN = DAG.getFrameIndex(FI, VT: PtrVT);
      SDValue StoreReg = DAG.getStore(Chain, dl, Val: Parm, Ptr: FIN,
                                      PtrInfo: MachinePointerInfo(), Alignment: Align(PtrByteSize));
      SaveStackPos = alignTo(Value: SaveStackPos + SaveVT.getStoreSize(), Align: PtrByteSize);
      MemOps.push_back(Elt: StoreReg);
    }

    // Arguments already in memory only advance the spill cursor; no store is
    // needed because the caller initialized their stack slots.
    if (SaveParams && (VA.isMemLoc() || Flags.isByVal()) && !VA.needsCustom()) {
      unsigned StoreSize =
          Flags.isByVal() ? Flags.getByValSize() : LocVT.getStoreSize();
      SaveStackPos = alignTo(Value: SaveStackPos + StoreSize, Align: PtrByteSize);
    }

    // Loads an argument that was passed in the parameter save area from its
    // fixed stack slot and appends the result to InVals.
    auto HandleMemLoc = [&]() {
      const unsigned LocSize = LocVT.getStoreSize();
      const unsigned ValSize = ValVT.getStoreSize();
      assert((ValSize <= LocSize) &&
             "Object size is larger than size of MemLoc");
      int CurArgOffset = VA.getLocMemOffset();
      // Objects are right-justified because AIX is big-endian.
      if (LocSize > ValSize)
        CurArgOffset += LocSize - ValSize;
      // Potential tail calls could cause overwriting of argument stack slots.
      const bool IsImmutable =
          !(getTargetMachine().Options.GuaranteedTailCallOpt &&
            (CallConv == CallingConv::Fast));
      int FI = MFI.CreateFixedObject(Size: ValSize, SPOffset: CurArgOffset, IsImmutable);
      SDValue FIN = DAG.getFrameIndex(FI, VT: PtrVT);
      SDValue ArgValue =
          DAG.getLoad(VT: ValVT, dl, Chain, Ptr: FIN, PtrInfo: MachinePointerInfo());

      // While the ABI specifies the argument type is (sign or zero) extended
      // out to register width, not all code is compliant. We truncate and
      // re-extend to be more forgiving of these callers when the argument type
      // is smaller than register width.
      if (!ArgVT.isVector() && !ValVT.isVector() && ArgVT.isInteger() &&
          ValVT.isInteger() &&
          ArgVT.getScalarSizeInBits() < ValVT.getScalarSizeInBits()) {
        // It is possible to have either real integer values
        // or integers that were not originally integers.
        // In the latter case, these could have came from structs,
        // and these integers would not have an extend on the parameter.
        // Since these types of integers do not have an extend specified
        // in the first place, the type of extend that we do should not matter.
        EVT TruncatedArgVT = ArgVT.isSimple() && ArgVT.getSimpleVT() == MVT::i1
                                 ? MVT::i8
                                 : ArgVT;
        SDValue ArgValueTrunc =
            DAG.getNode(Opcode: ISD::TRUNCATE, DL: dl, VT: TruncatedArgVT, Operand: ArgValue);
        SDValue ArgValueExt =
            ArgSignExt ? DAG.getSExtOrTrunc(Op: ArgValueTrunc, DL: dl, VT: ValVT)
                       : DAG.getZExtOrTrunc(Op: ArgValueTrunc, DL: dl, VT: ValVT);
        InVals.push_back(Elt: ArgValueExt);
      } else {
        InVals.push_back(Elt: ArgValue);
      }
    };

    // Vector arguments to VaArg functions are passed both on the stack, and
    // in any available GPRs. Load the value from the stack and add the GPRs
    // as live ins.
    if (VA.isMemLoc() && VA.needsCustom()) {
      assert(ValVT.isVector() && "Unexpected Custom MemLoc type.");
      assert(isVarArg && "Only use custom memloc for vararg.");
      // ValNo of the custom MemLoc, so we can compare it to the ValNo of the
      // matching custom RegLocs.
      const unsigned OriginalValNo = VA.getValNo();
      (void)OriginalValNo;

      // Consumes one custom RegLoc paired with the current MemLoc and marks
      // its register live-in; the value itself comes from HandleMemLoc().
      auto HandleCustomVecRegLoc = [&]() {
        assert(I != End && ArgLocs[I].isRegLoc() && ArgLocs[I].needsCustom() &&
               "Missing custom RegLoc.");
        VA = ArgLocs[I++];
        assert(VA.getValVT().isVector() &&
               "Unexpected Val type for custom RegLoc.");
        assert(VA.getValNo() == OriginalValNo &&
               "ValNo mismatch between custom MemLoc and RegLoc.");
        MVT::SimpleValueType SVT = VA.getLocVT().SimpleTy;
        MF.addLiveIn(PReg: VA.getLocReg(),
                     RC: getRegClassForSVT(SVT, IsPPC64, HasP8Vector: Subtarget.hasP8Vector(),
                                        HasVSX: Subtarget.hasVSX()));
      };

      HandleMemLoc();
      // In 64-bit there will be exactly 2 custom RegLocs that follow, and in
      // in 32-bit there will be 2 custom RegLocs if we are passing in R9 and
      // R10.
      HandleCustomVecRegLoc();
      HandleCustomVecRegLoc();

      // If we are targeting 32-bit, there might be 2 extra custom RegLocs if
      // we passed the vector in R5, R6, R7 and R8.
      if (I != End && ArgLocs[I].isRegLoc() && ArgLocs[I].needsCustom()) {
        assert(!IsPPC64 &&
               "Only 2 custom RegLocs expected for 64-bit codegen.");
        HandleCustomVecRegLoc();
        HandleCustomVecRegLoc();
      }

      continue;
    }

    // Record the parameter type class for this register argument in the
    // function info (consumed when emitting AIX parameter metadata).
    if (VA.isRegLoc()) {
      if (VA.getValVT().isScalarInteger())
        FuncInfo->appendParameterType(Type: PPCFunctionInfo::FixedType);
      else if (VA.getValVT().isFloatingPoint() && !VA.getValVT().isVector()) {
        switch (VA.getValVT().SimpleTy) {
        default:
          report_fatal_error(reason: "Unhandled value type for argument.");
        case MVT::f32:
          FuncInfo->appendParameterType(Type: PPCFunctionInfo::ShortFloatingPoint);
          break;
        case MVT::f64:
          FuncInfo->appendParameterType(Type: PPCFunctionInfo::LongFloatingPoint);
          break;
        }
      } else if (VA.getValVT().isVector()) {
        switch (VA.getValVT().SimpleTy) {
        default:
          report_fatal_error(reason: "Unhandled value type for argument.");
        case MVT::v16i8:
          FuncInfo->appendParameterType(Type: PPCFunctionInfo::VectorChar);
          break;
        case MVT::v8i16:
          FuncInfo->appendParameterType(Type: PPCFunctionInfo::VectorShort);
          break;
        case MVT::v4i32:
        case MVT::v2i64:
        case MVT::v1i128:
          FuncInfo->appendParameterType(Type: PPCFunctionInfo::VectorInt);
          break;
        case MVT::v4f32:
        case MVT::v2f64:
          FuncInfo->appendParameterType(Type: PPCFunctionInfo::VectorFloat);
          break;
        }
      }
    }

    // By-val aggregate passed entirely in memory: the InVal is simply the
    // address of its (aliased, mutable) fixed stack object.
    if (Flags.isByVal() && VA.isMemLoc()) {
      const unsigned Size =
          alignTo(Value: Flags.getByValSize() ? Flags.getByValSize() : PtrByteSize,
                  Align: PtrByteSize);
      const int FI = MF.getFrameInfo().CreateFixedObject(
          Size, SPOffset: VA.getLocMemOffset(), /* IsImmutable */ false,
          /* IsAliased */ isAliased: true);
      SDValue FIN = DAG.getFrameIndex(FI, VT: PtrVT);
      InVals.push_back(Elt: FIN);

      continue;
    }

    if (Flags.isByVal()) {
      assert(VA.isRegLoc() && "MemLocs should already be handled.");

      const MCPhysReg ArgReg = VA.getLocReg();
      const PPCFrameLowering *FL = Subtarget.getFrameLowering();

      const unsigned StackSize = alignTo(Value: Flags.getByValSize(), Align: PtrByteSize);
      const int FI = MF.getFrameInfo().CreateFixedObject(
          Size: StackSize, SPOffset: mapArgRegToOffsetAIX(Reg: ArgReg, FL), /* IsImmutable */ false,
          /* IsAliased */ isAliased: true);
      SDValue FIN = DAG.getFrameIndex(FI, VT: PtrVT);
      InVals.push_back(Elt: FIN);

      // Add live ins for all the RegLocs for the same ByVal.
      const TargetRegisterClass *RegClass =
          IsPPC64 ? &PPC::G8RCRegClass : &PPC::GPRCRegClass;

      auto HandleRegLoc = [&, RegClass, LocVT](const MCPhysReg PhysReg,
                                               unsigned Offset) {
        const Register VReg = MF.addLiveIn(PReg: PhysReg, RC: RegClass);
        // Since the callers side has left justified the aggregate in the
        // register, we can simply store the entire register into the stack
        // slot.
        SDValue CopyFrom = DAG.getCopyFromReg(Chain, dl, Reg: VReg, VT: LocVT);
        // The store to the fixedstack object is needed becuase accessing a
        // field of the ByVal will use a gep and load. Ideally we will optimize
        // to extracting the value from the register directly, and elide the
        // stores when the arguments address is not taken, but that will need to
        // be future work.
        SDValue Store = DAG.getStore(
            Chain: CopyFrom.getValue(R: 1), dl, Val: CopyFrom,
            Ptr: DAG.getObjectPtrOffset(SL: dl, Ptr: FIN, Offset: TypeSize::getFixed(ExactSize: Offset)),
            PtrInfo: MachinePointerInfo::getFixedStack(MF, FI, Offset));

        MemOps.push_back(Elt: Store);
      };

      unsigned Offset = 0;
      HandleRegLoc(VA.getLocReg(), Offset);
      Offset += PtrByteSize;
      // NOTE(review): ArgLocs[I] is read here without an explicit I != End
      // guard; this relies on CC_AIX always emitting enough locations to
      // cover the remaining by-val bytes -- confirm.
      for (; Offset != StackSize && ArgLocs[I].isRegLoc();
           Offset += PtrByteSize) {
        assert(ArgLocs[I].getValNo() == VA.getValNo() &&
               "RegLocs should be for ByVal argument.");

        const CCValAssign RL = ArgLocs[I++];
        HandleRegLoc(RL.getLocReg(), Offset);
        FuncInfo->appendParameterType(Type: PPCFunctionInfo::FixedType);
      }

      if (Offset != StackSize) {
        assert(ArgLocs[I].getValNo() == VA.getValNo() &&
               "Expected MemLoc for remaining bytes.");
        assert(ArgLocs[I].isMemLoc() && "Expected MemLoc for remaining bytes.");
        // Consume the MemLoc.The InVal has already been emitted, so nothing
        // more needs to be done.
        ++I;
      }

      continue;
    }

    // Plain register argument: copy it out of the entry register, narrowing
    // scalar integers that were extended to register width by the caller.
    if (VA.isRegLoc() && !VA.needsCustom()) {
      MVT::SimpleValueType SVT = ValVT.SimpleTy;
      Register VReg =
          MF.addLiveIn(PReg: VA.getLocReg(),
                       RC: getRegClassForSVT(SVT, IsPPC64, HasP8Vector: Subtarget.hasP8Vector(),
                                          HasVSX: Subtarget.hasVSX()));
      SDValue ArgValue = DAG.getCopyFromReg(Chain, dl, Reg: VReg, VT: LocVT);
      if (ValVT.isScalarInteger() &&
          (ValVT.getFixedSizeInBits() < LocVT.getFixedSizeInBits())) {
        ArgValue =
            truncateScalarIntegerArg(Flags, ValVT, DAG, ArgValue, LocVT, dl);
      }
      InVals.push_back(Elt: ArgValue);
      continue;
    }
    if (VA.isMemLoc()) {
      HandleMemLoc();
      continue;
    }
  }

  // On AIX a minimum of 8 words is saved to the parameter save area.
  const unsigned MinParameterSaveArea = 8 * PtrByteSize;
  // Area that is at least reserved in the caller of this function.
  unsigned CallerReservedArea = std::max<unsigned>(
      a: CCInfo.getStackSize(), b: LinkageSize + MinParameterSaveArea);

  // Set the size that is at least reserved in caller of this function. Tail
  // call optimized function's reserved stack space needs to be aligned so
  // that taking the difference between two stack areas will result in an
  // aligned stack.
  CallerReservedArea =
      EnsureStackAlignment(Lowering: Subtarget.getFrameLowering(), NumBytes: CallerReservedArea);
  FuncInfo->setMinReservedArea(CallerReservedArea);

  if (isVarArg) {
    int VAListIndex = 0;
    // If any of the optional arguments are passed in register then the fixed
    // stack object we spill into is not immutable. Create a fixed stack object
    // that overlaps the remainder of the parameter save area.
    if (CCInfo.getStackSize() < (LinkageSize + MinParameterSaveArea)) {
      unsigned FixedStackSize =
          LinkageSize + MinParameterSaveArea - CCInfo.getStackSize();
      VAListIndex =
          MFI.CreateFixedObject(Size: FixedStackSize, SPOffset: CCInfo.getStackSize(),
                                /* IsImmutable */ false, /* IsAliased */ isAliased: true);
    } else {
      // All the arguments passed through ellipses are on the stack. Create a
      // dummy fixed stack object the same size as a pointer since we don't
      // know the actual size.
      VAListIndex =
          MFI.CreateFixedObject(Size: PtrByteSize, SPOffset: CCInfo.getStackSize(),
                                /* IsImmutable */ true, /* IsAliased */ isAliased: true);
    }

    FuncInfo->setVarArgsFrameIndex(VAListIndex);
    SDValue FIN = DAG.getFrameIndex(FI: VAListIndex, VT: PtrVT);

    static const MCPhysReg GPR_32[] = {PPC::R3, PPC::R4, PPC::R5, PPC::R6,
                                       PPC::R7, PPC::R8, PPC::R9, PPC::R10};

    static const MCPhysReg GPR_64[] = {PPC::X3, PPC::X4, PPC::X5, PPC::X6,
                                       PPC::X7, PPC::X8, PPC::X9, PPC::X10};
    const unsigned NumGPArgRegs = std::size(IsPPC64 ? GPR_64 : GPR_32);

    // The fixed integer arguments of a variadic function are stored to the
    // VarArgsFrameIndex on the stack so that they may be loaded by
    // dereferencing the result of va_next.
    for (unsigned
             GPRIndex = (CCInfo.getStackSize() - LinkageSize) / PtrByteSize,
             Offset = 0;
         GPRIndex < NumGPArgRegs; ++GPRIndex, Offset += PtrByteSize) {

      const Register VReg =
          IsPPC64 ? MF.addLiveIn(PReg: GPR_64[GPRIndex], RC: &PPC::G8RCRegClass)
                  : MF.addLiveIn(PReg: GPR_32[GPRIndex], RC: &PPC::GPRCRegClass);

      SDValue Val = DAG.getCopyFromReg(Chain, dl, Reg: VReg, VT: PtrVT);
      MachinePointerInfo MPI =
          MachinePointerInfo::getFixedStack(MF, FI: VAListIndex, Offset);
      SDValue Store = DAG.getStore(Chain: Val.getValue(R: 1), dl, Val, Ptr: FIN, PtrInfo: MPI);
      MemOps.push_back(Elt: Store);
      // Increment the address for the next argument to store.
      SDValue PtrOff = DAG.getConstant(Val: PtrByteSize, DL: dl, VT: PtrVT);
      FIN = DAG.getNode(Opcode: ISD::ADD, DL: dl, VT: PtrOff.getValueType(), N1: FIN, N2: PtrOff);
    }
  }

  if (!MemOps.empty())
    Chain = DAG.getNode(Opcode: ISD::TokenFactor, DL: dl, VT: MVT::Other, Ops: MemOps);

  return Chain;
}
7454
7455SDValue PPCTargetLowering::LowerCall_AIX(
7456 SDValue Chain, SDValue Callee, CallFlags CFlags,
7457 const SmallVectorImpl<ISD::OutputArg> &Outs,
7458 const SmallVectorImpl<SDValue> &OutVals,
7459 const SmallVectorImpl<ISD::InputArg> &Ins, const SDLoc &dl,
7460 SelectionDAG &DAG, SmallVectorImpl<SDValue> &InVals,
7461 const CallBase *CB) const {
7462 // See PPCTargetLowering::LowerFormalArguments_AIX() for a description of the
7463 // AIX ABI stack frame layout.
7464
7465 assert((CFlags.CallConv == CallingConv::C ||
7466 CFlags.CallConv == CallingConv::Cold ||
7467 CFlags.CallConv == CallingConv::Fast) &&
7468 "Unexpected calling convention!");
7469
7470 if (CFlags.IsPatchPoint)
7471 report_fatal_error(reason: "This call type is unimplemented on AIX.");
7472
7473 const PPCSubtarget &Subtarget = DAG.getSubtarget<PPCSubtarget>();
7474
7475 MachineFunction &MF = DAG.getMachineFunction();
7476 SmallVector<CCValAssign, 16> ArgLocs;
7477 CCState CCInfo(CFlags.CallConv, CFlags.IsVarArg, MF, ArgLocs,
7478 *DAG.getContext());
7479
7480 // Reserve space for the linkage save area (LSA) on the stack.
7481 // In both PPC32 and PPC64 there are 6 reserved slots in the LSA:
7482 // [SP][CR][LR][2 x reserved][TOC].
7483 // The LSA is 24 bytes (6x4) in PPC32 and 48 bytes (6x8) in PPC64.
7484 const unsigned LinkageSize = Subtarget.getFrameLowering()->getLinkageSize();
7485 const bool IsPPC64 = Subtarget.isPPC64();
7486 const EVT PtrVT = getPointerTy(DL: DAG.getDataLayout());
7487 const unsigned PtrByteSize = IsPPC64 ? 8 : 4;
7488 CCInfo.AllocateStack(Size: LinkageSize, Alignment: Align(PtrByteSize));
7489 CCInfo.AnalyzeCallOperands(Outs, Fn: CC_AIX);
7490
7491 // The prolog code of the callee may store up to 8 GPR argument registers to
7492 // the stack, allowing va_start to index over them in memory if the callee
7493 // is variadic.
7494 // Because we cannot tell if this is needed on the caller side, we have to
7495 // conservatively assume that it is needed. As such, make sure we have at
7496 // least enough stack space for the caller to store the 8 GPRs.
7497 const unsigned MinParameterSaveAreaSize = 8 * PtrByteSize;
7498 const unsigned NumBytes = std::max<unsigned>(
7499 a: LinkageSize + MinParameterSaveAreaSize, b: CCInfo.getStackSize());
7500
7501 // Adjust the stack pointer for the new arguments...
7502 // These operations are automatically eliminated by the prolog/epilog pass.
7503 Chain = DAG.getCALLSEQ_START(Chain, InSize: NumBytes, OutSize: 0, DL: dl);
7504 SDValue CallSeqStart = Chain;
7505
7506 SmallVector<std::pair<unsigned, SDValue>, 8> RegsToPass;
7507 SmallVector<SDValue, 8> MemOpChains;
7508
7509 // Set up a copy of the stack pointer for loading and storing any
7510 // arguments that may not fit in the registers available for argument
7511 // passing.
7512 const SDValue StackPtr = IsPPC64 ? DAG.getRegister(Reg: PPC::X1, VT: MVT::i64)
7513 : DAG.getRegister(Reg: PPC::R1, VT: MVT::i32);
7514
7515 for (unsigned I = 0, E = ArgLocs.size(); I != E;) {
7516 const unsigned ValNo = ArgLocs[I].getValNo();
7517 SDValue Arg = OutVals[ValNo];
7518 ISD::ArgFlagsTy Flags = Outs[ValNo].Flags;
7519
7520 if (Flags.isByVal()) {
7521 const unsigned ByValSize = Flags.getByValSize();
7522
7523 // Nothing to do for zero-sized ByVals on the caller side.
7524 if (!ByValSize) {
7525 ++I;
7526 continue;
7527 }
7528
7529 auto GetLoad = [&](EVT VT, unsigned LoadOffset) {
7530 return DAG.getExtLoad(ExtType: ISD::ZEXTLOAD, dl, VT: PtrVT, Chain,
7531 Ptr: (LoadOffset != 0)
7532 ? DAG.getObjectPtrOffset(
7533 SL: dl, Ptr: Arg, Offset: TypeSize::getFixed(ExactSize: LoadOffset))
7534 : Arg,
7535 PtrInfo: MachinePointerInfo(), MemVT: VT);
7536 };
7537
7538 unsigned LoadOffset = 0;
7539
7540 // Initialize registers, which are fully occupied by the by-val argument.
7541 while (LoadOffset + PtrByteSize <= ByValSize && ArgLocs[I].isRegLoc()) {
7542 SDValue Load = GetLoad(PtrVT, LoadOffset);
7543 MemOpChains.push_back(Elt: Load.getValue(R: 1));
7544 LoadOffset += PtrByteSize;
7545 const CCValAssign &ByValVA = ArgLocs[I++];
7546 assert(ByValVA.getValNo() == ValNo &&
7547 "Unexpected location for pass-by-value argument.");
7548 RegsToPass.push_back(Elt: std::make_pair(x: ByValVA.getLocReg(), y&: Load));
7549 }
7550
7551 if (LoadOffset == ByValSize)
7552 continue;
7553
7554 // There must be one more loc to handle the remainder.
7555 assert(ArgLocs[I].getValNo() == ValNo &&
7556 "Expected additional location for by-value argument.");
7557
7558 if (ArgLocs[I].isMemLoc()) {
7559 assert(LoadOffset < ByValSize && "Unexpected memloc for by-val arg.");
7560 const CCValAssign &ByValVA = ArgLocs[I++];
7561 ISD::ArgFlagsTy MemcpyFlags = Flags;
7562 // Only memcpy the bytes that don't pass in register.
7563 MemcpyFlags.setByValSize(ByValSize - LoadOffset);
7564 Chain = CallSeqStart = createMemcpyOutsideCallSeq(
7565 Arg: (LoadOffset != 0) ? DAG.getObjectPtrOffset(
7566 SL: dl, Ptr: Arg, Offset: TypeSize::getFixed(ExactSize: LoadOffset))
7567 : Arg,
7568 PtrOff: DAG.getObjectPtrOffset(
7569 SL: dl, Ptr: StackPtr, Offset: TypeSize::getFixed(ExactSize: ByValVA.getLocMemOffset())),
7570 CallSeqStart, Flags: MemcpyFlags, DAG, dl);
7571 continue;
7572 }
7573
7574 // Initialize the final register residue.
7575 // Any residue that occupies the final by-val arg register must be
7576 // left-justified on AIX. Loads must be a power-of-2 size and cannot be
7577 // larger than the ByValSize. For example: a 7 byte by-val arg requires 4,
7578 // 2 and 1 byte loads.
7579 const unsigned ResidueBytes = ByValSize % PtrByteSize;
7580 assert(ResidueBytes != 0 && LoadOffset + PtrByteSize > ByValSize &&
7581 "Unexpected register residue for by-value argument.");
7582 SDValue ResidueVal;
7583 for (unsigned Bytes = 0; Bytes != ResidueBytes;) {
7584 const unsigned N = llvm::bit_floor(Value: ResidueBytes - Bytes);
7585 const MVT VT =
7586 N == 1 ? MVT::i8
7587 : ((N == 2) ? MVT::i16 : (N == 4 ? MVT::i32 : MVT::i64));
7588 SDValue Load = GetLoad(VT, LoadOffset);
7589 MemOpChains.push_back(Elt: Load.getValue(R: 1));
7590 LoadOffset += N;
7591 Bytes += N;
7592
7593 // By-val arguments are passed left-justfied in register.
7594 // Every load here needs to be shifted, otherwise a full register load
7595 // should have been used.
7596 assert(PtrVT.getSimpleVT().getSizeInBits() > (Bytes * 8) &&
7597 "Unexpected load emitted during handling of pass-by-value "
7598 "argument.");
7599 unsigned NumSHLBits = PtrVT.getSimpleVT().getSizeInBits() - (Bytes * 8);
7600 EVT ShiftAmountTy =
7601 getShiftAmountTy(LHSTy: Load->getValueType(ResNo: 0), DL: DAG.getDataLayout());
7602 SDValue SHLAmt = DAG.getConstant(Val: NumSHLBits, DL: dl, VT: ShiftAmountTy);
7603 SDValue ShiftedLoad =
7604 DAG.getNode(Opcode: ISD::SHL, DL: dl, VT: Load.getValueType(), N1: Load, N2: SHLAmt);
7605 ResidueVal = ResidueVal ? DAG.getNode(Opcode: ISD::OR, DL: dl, VT: PtrVT, N1: ResidueVal,
7606 N2: ShiftedLoad)
7607 : ShiftedLoad;
7608 }
7609
7610 const CCValAssign &ByValVA = ArgLocs[I++];
7611 RegsToPass.push_back(Elt: std::make_pair(x: ByValVA.getLocReg(), y&: ResidueVal));
7612 continue;
7613 }
7614
7615 CCValAssign &VA = ArgLocs[I++];
7616 const MVT LocVT = VA.getLocVT();
7617 const MVT ValVT = VA.getValVT();
7618
7619 switch (VA.getLocInfo()) {
7620 default:
7621 report_fatal_error(reason: "Unexpected argument extension type.");
7622 case CCValAssign::Full:
7623 break;
7624 case CCValAssign::ZExt:
7625 Arg = DAG.getNode(Opcode: ISD::ZERO_EXTEND, DL: dl, VT: VA.getLocVT(), Operand: Arg);
7626 break;
7627 case CCValAssign::SExt:
7628 Arg = DAG.getNode(Opcode: ISD::SIGN_EXTEND, DL: dl, VT: VA.getLocVT(), Operand: Arg);
7629 break;
7630 }
7631
7632 if (VA.isRegLoc() && !VA.needsCustom()) {
7633 RegsToPass.push_back(Elt: std::make_pair(x: VA.getLocReg(), y&: Arg));
7634 continue;
7635 }
7636
7637 // Vector arguments passed to VarArg functions need custom handling when
7638 // they are passed (at least partially) in GPRs.
7639 if (VA.isMemLoc() && VA.needsCustom() && ValVT.isVector()) {
7640 assert(CFlags.IsVarArg && "Custom MemLocs only used for Vector args.");
7641 // Store value to its stack slot.
7642 SDValue PtrOff =
7643 DAG.getConstant(Val: VA.getLocMemOffset(), DL: dl, VT: StackPtr.getValueType());
7644 PtrOff = DAG.getNode(Opcode: ISD::ADD, DL: dl, VT: PtrVT, N1: StackPtr, N2: PtrOff);
7645 SDValue Store =
7646 DAG.getStore(Chain, dl, Val: Arg, Ptr: PtrOff, PtrInfo: MachinePointerInfo());
7647 MemOpChains.push_back(Elt: Store);
7648 const unsigned OriginalValNo = VA.getValNo();
7649 // Then load the GPRs from the stack
7650 unsigned LoadOffset = 0;
7651 auto HandleCustomVecRegLoc = [&]() {
7652 assert(I != E && "Unexpected end of CCvalAssigns.");
7653 assert(ArgLocs[I].isRegLoc() && ArgLocs[I].needsCustom() &&
7654 "Expected custom RegLoc.");
7655 CCValAssign RegVA = ArgLocs[I++];
7656 assert(RegVA.getValNo() == OriginalValNo &&
7657 "Custom MemLoc ValNo and custom RegLoc ValNo must match.");
7658 SDValue Add = DAG.getNode(Opcode: ISD::ADD, DL: dl, VT: PtrVT, N1: PtrOff,
7659 N2: DAG.getConstant(Val: LoadOffset, DL: dl, VT: PtrVT));
7660 SDValue Load = DAG.getLoad(VT: PtrVT, dl, Chain: Store, Ptr: Add, PtrInfo: MachinePointerInfo());
7661 MemOpChains.push_back(Elt: Load.getValue(R: 1));
7662 RegsToPass.push_back(Elt: std::make_pair(x: RegVA.getLocReg(), y&: Load));
7663 LoadOffset += PtrByteSize;
7664 };
7665
7666 // In 64-bit there will be exactly 2 custom RegLocs that follow, and in
7667 // in 32-bit there will be 2 custom RegLocs if we are passing in R9 and
7668 // R10.
7669 HandleCustomVecRegLoc();
7670 HandleCustomVecRegLoc();
7671
7672 if (I != E && ArgLocs[I].isRegLoc() && ArgLocs[I].needsCustom() &&
7673 ArgLocs[I].getValNo() == OriginalValNo) {
7674 assert(!IsPPC64 &&
7675 "Only 2 custom RegLocs expected for 64-bit codegen.");
7676 HandleCustomVecRegLoc();
7677 HandleCustomVecRegLoc();
7678 }
7679
7680 continue;
7681 }
7682
7683 if (VA.isMemLoc()) {
7684 SDValue PtrOff =
7685 DAG.getConstant(Val: VA.getLocMemOffset(), DL: dl, VT: StackPtr.getValueType());
7686 PtrOff = DAG.getNode(Opcode: ISD::ADD, DL: dl, VT: PtrVT, N1: StackPtr, N2: PtrOff);
7687 MemOpChains.push_back(
7688 Elt: DAG.getStore(Chain, dl, Val: Arg, Ptr: PtrOff,
7689 PtrInfo: MachinePointerInfo::getStack(MF, Offset: VA.getLocMemOffset()),
7690 Alignment: Subtarget.getFrameLowering()->getStackAlign()));
7691
7692 continue;
7693 }
7694
7695 if (!ValVT.isFloatingPoint())
7696 report_fatal_error(
7697 reason: "Unexpected register handling for calling convention.");
7698
7699 // Custom handling is used for GPR initializations for vararg float
7700 // arguments.
7701 assert(VA.isRegLoc() && VA.needsCustom() && CFlags.IsVarArg &&
7702 LocVT.isInteger() &&
7703 "Custom register handling only expected for VarArg.");
7704
7705 SDValue ArgAsInt =
7706 DAG.getBitcast(VT: MVT::getIntegerVT(BitWidth: ValVT.getSizeInBits()), V: Arg);
7707
7708 if (Arg.getValueType().getStoreSize() == LocVT.getStoreSize())
7709 // f32 in 32-bit GPR
7710 // f64 in 64-bit GPR
7711 RegsToPass.push_back(Elt: std::make_pair(x: VA.getLocReg(), y&: ArgAsInt));
7712 else if (Arg.getValueType().getFixedSizeInBits() <
7713 LocVT.getFixedSizeInBits())
7714 // f32 in 64-bit GPR.
7715 RegsToPass.push_back(Elt: std::make_pair(
7716 x: VA.getLocReg(), y: DAG.getZExtOrTrunc(Op: ArgAsInt, DL: dl, VT: LocVT)));
7717 else {
7718 // f64 in two 32-bit GPRs
7719 // The 2 GPRs are marked custom and expected to be adjacent in ArgLocs.
7720 assert(Arg.getValueType() == MVT::f64 && CFlags.IsVarArg && !IsPPC64 &&
7721 "Unexpected custom register for argument!");
7722 CCValAssign &GPR1 = VA;
7723 SDValue MSWAsI64 = DAG.getNode(Opcode: ISD::SRL, DL: dl, VT: MVT::i64, N1: ArgAsInt,
7724 N2: DAG.getConstant(Val: 32, DL: dl, VT: MVT::i8));
7725 RegsToPass.push_back(Elt: std::make_pair(
7726 x: GPR1.getLocReg(), y: DAG.getZExtOrTrunc(Op: MSWAsI64, DL: dl, VT: MVT::i32)));
7727
7728 if (I != E) {
7729 // If only 1 GPR was available, there will only be one custom GPR and
7730 // the argument will also pass in memory.
7731 CCValAssign &PeekArg = ArgLocs[I];
7732 if (PeekArg.isRegLoc() && PeekArg.getValNo() == PeekArg.getValNo()) {
7733 assert(PeekArg.needsCustom() && "A second custom GPR is expected.");
7734 CCValAssign &GPR2 = ArgLocs[I++];
7735 RegsToPass.push_back(Elt: std::make_pair(
7736 x: GPR2.getLocReg(), y: DAG.getZExtOrTrunc(Op: ArgAsInt, DL: dl, VT: MVT::i32)));
7737 }
7738 }
7739 }
7740 }
7741
7742 if (!MemOpChains.empty())
7743 Chain = DAG.getNode(Opcode: ISD::TokenFactor, DL: dl, VT: MVT::Other, Ops: MemOpChains);
7744
7745 // For indirect calls, we need to save the TOC base to the stack for
7746 // restoration after the call.
7747 if (CFlags.IsIndirect) {
7748 assert(!CFlags.IsTailCall && "Indirect tail-calls not supported.");
7749 const MCRegister TOCBaseReg = Subtarget.getTOCPointerRegister();
7750 const MCRegister StackPtrReg = Subtarget.getStackPointerRegister();
7751 const MVT PtrVT = Subtarget.getScalarIntVT();
7752 const unsigned TOCSaveOffset =
7753 Subtarget.getFrameLowering()->getTOCSaveOffset();
7754
7755 setUsesTOCBasePtr(DAG);
7756 SDValue Val = DAG.getCopyFromReg(Chain, dl, Reg: TOCBaseReg, VT: PtrVT);
7757 SDValue PtrOff = DAG.getIntPtrConstant(Val: TOCSaveOffset, DL: dl);
7758 SDValue StackPtr = DAG.getRegister(Reg: StackPtrReg, VT: PtrVT);
7759 SDValue AddPtr = DAG.getNode(Opcode: ISD::ADD, DL: dl, VT: PtrVT, N1: StackPtr, N2: PtrOff);
7760 Chain = DAG.getStore(
7761 Chain: Val.getValue(R: 1), dl, Val, Ptr: AddPtr,
7762 PtrInfo: MachinePointerInfo::getStack(MF&: DAG.getMachineFunction(), Offset: TOCSaveOffset));
7763 }
7764
7765 // Build a sequence of copy-to-reg nodes chained together with token chain
7766 // and flag operands which copy the outgoing args into the appropriate regs.
7767 SDValue InGlue;
7768 for (auto Reg : RegsToPass) {
7769 Chain = DAG.getCopyToReg(Chain, dl, Reg: Reg.first, N: Reg.second, Glue: InGlue);
7770 InGlue = Chain.getValue(R: 1);
7771 }
7772
7773 const int SPDiff = 0;
7774 return FinishCall(CFlags, dl, DAG, RegsToPass, Glue: InGlue, Chain, CallSeqStart,
7775 Callee, SPDiff, NumBytes, Ins, InVals, CB);
7776}
7777
7778bool
7779PPCTargetLowering::CanLowerReturn(CallingConv::ID CallConv,
7780 MachineFunction &MF, bool isVarArg,
7781 const SmallVectorImpl<ISD::OutputArg> &Outs,
7782 LLVMContext &Context,
7783 const Type *RetTy) const {
7784 SmallVector<CCValAssign, 16> RVLocs;
7785 CCState CCInfo(CallConv, isVarArg, MF, RVLocs, Context);
7786 return CCInfo.CheckReturn(
7787 Outs, Fn: (Subtarget.isSVR4ABI() && CallConv == CallingConv::Cold)
7788 ? RetCC_PPC_Cold
7789 : RetCC_PPC);
7790}
7791
// Lowers an outgoing return: copies each return value into the physical
// register assigned by RetCC_PPC (or RetCC_PPC_Cold for the SVR4 cold
// calling convention) and emits the terminating PPCISD::RET_GLUE node.
SDValue
PPCTargetLowering::LowerReturn(SDValue Chain, CallingConv::ID CallConv,
                               bool isVarArg,
                               const SmallVectorImpl<ISD::OutputArg> &Outs,
                               const SmallVectorImpl<SDValue> &OutVals,
                               const SDLoc &dl, SelectionDAG &DAG) const {
  SmallVector<CCValAssign, 16> RVLocs;
  CCState CCInfo(CallConv, isVarArg, DAG.getMachineFunction(), RVLocs,
                 *DAG.getContext());
  CCInfo.AnalyzeReturn(Outs,
                       Fn: (Subtarget.isSVR4ABI() && CallConv == CallingConv::Cold)
                           ? RetCC_PPC_Cold
                           : RetCC_PPC);

  SDValue Glue;
  SmallVector<SDValue, 4> RetOps(1, Chain);

  // Copy the result values into the output registers.
  // i indexes RVLocs while RealResIdx indexes OutVals; they diverge when a
  // single value occupies two return locations (the SPE f64 case below
  // advances i an extra step).
  for (unsigned i = 0, RealResIdx = 0; i != RVLocs.size(); ++i, ++RealResIdx) {
    CCValAssign &VA = RVLocs[i];
    assert(VA.isRegLoc() && "Can only return in registers!");

    SDValue Arg = OutVals[RealResIdx];

    // Extend the value to its register-width location type if required.
    switch (VA.getLocInfo()) {
    default: llvm_unreachable("Unknown loc info!");
    case CCValAssign::Full: break;
    case CCValAssign::AExt:
      Arg = DAG.getNode(Opcode: ISD::ANY_EXTEND, DL: dl, VT: VA.getLocVT(), Operand: Arg);
      break;
    case CCValAssign::ZExt:
      Arg = DAG.getNode(Opcode: ISD::ZERO_EXTEND, DL: dl, VT: VA.getLocVT(), Operand: Arg);
      break;
    case CCValAssign::SExt:
      Arg = DAG.getNode(Opcode: ISD::SIGN_EXTEND, DL: dl, VT: VA.getLocVT(), Operand: Arg);
      break;
    }
    if (Subtarget.hasSPE() && VA.getLocVT() == MVT::f64) {
      bool isLittleEndian = Subtarget.isLittleEndian();
      // Legalize ret f64 -> ret 2 x i32.
      SDValue SVal =
          DAG.getNode(Opcode: PPCISD::EXTRACT_SPE, DL: dl, VT: MVT::i32, N1: Arg,
                      N2: DAG.getIntPtrConstant(Val: isLittleEndian ? 0 : 1, DL: dl));
      Chain = DAG.getCopyToReg(Chain, dl, Reg: VA.getLocReg(), N: SVal, Glue);
      RetOps.push_back(Elt: DAG.getRegister(Reg: VA.getLocReg(), VT: VA.getLocVT()));
      SVal = DAG.getNode(Opcode: PPCISD::EXTRACT_SPE, DL: dl, VT: MVT::i32, N1: Arg,
                         N2: DAG.getIntPtrConstant(Val: isLittleEndian ? 1 : 0, DL: dl));
      Glue = Chain.getValue(R: 1);
      VA = RVLocs[++i]; // skip ahead to next loc
      Chain = DAG.getCopyToReg(Chain, dl, Reg: VA.getLocReg(), N: SVal, Glue);
    } else
      Chain = DAG.getCopyToReg(Chain, dl, Reg: VA.getLocReg(), N: Arg, Glue);
    Glue = Chain.getValue(R: 1);
    RetOps.push_back(Elt: DAG.getRegister(Reg: VA.getLocReg(), VT: VA.getLocVT()));
  }

  RetOps[0] = Chain; // Update chain.

  // Add the glue if we have it.
  if (Glue.getNode())
    RetOps.push_back(Elt: Glue);

  return DAG.getNode(Opcode: PPCISD::RET_GLUE, DL: dl, VT: MVT::Other, Ops: RetOps);
}
7856
/// Lower ISD::GET_DYNAMIC_AREA_OFFSET to a PPCISD::DYNAREAOFFSET node.
/// The node carries the chain and the frame-pointer save-slot frame index;
/// the actual offset of the dynamic-allocation area is computed later, during
/// frame lowering, when the final stack layout is known.
SDValue
PPCTargetLowering::LowerGET_DYNAMIC_AREA_OFFSET(SDValue Op,
                                                SelectionDAG &DAG) const {
  SDLoc dl(Op);

  // Get the correct type for integers.
  EVT IntVT = Op.getValueType();

  // Get the inputs.
  SDValue Chain = Op.getOperand(i: 0);
  SDValue FPSIdx = getFramePointerFrameIndex(DAG);
  // Build a DYNAREAOFFSET node.
  SDValue Ops[2] = {Chain, FPSIdx};
  SDVTList VTs = DAG.getVTList(VT: IntVT);
  return DAG.getNode(Opcode: PPCISD::DYNAREAOFFSET, DL: dl, VTList: VTs, Ops);
}
7873
/// Lower ISD::STACKRESTORE. Besides moving the saved value back into the
/// stack pointer, the PPC ABI keeps a back-chain link at 0(SP), so we first
/// load the old link from the current SP and store it through the restored
/// SP to keep the back-chain intact.
SDValue PPCTargetLowering::LowerSTACKRESTORE(SDValue Op,
                                             SelectionDAG &DAG) const {
  // When we pop the dynamic allocation we need to restore the SP link.
  SDLoc dl(Op);

  // Get the correct type for pointers.
  EVT PtrVT = getPointerTy(DL: DAG.getDataLayout());

  // Construct the stack pointer operand (R1 or X1 depending on mode).
  bool isPPC64 = Subtarget.isPPC64();
  unsigned SP = isPPC64 ? PPC::X1 : PPC::R1;
  SDValue StackPtr = DAG.getRegister(Reg: SP, VT: PtrVT);

  // Get the operands for the STACKRESTORE.
  SDValue Chain = Op.getOperand(i: 0);
  SDValue SaveSP = Op.getOperand(i: 1);

  // Load the old link SP (the back-chain word at 0(SP)).
  SDValue LoadLinkSP =
      DAG.getLoad(VT: PtrVT, dl, Chain, Ptr: StackPtr, PtrInfo: MachinePointerInfo());

  // Restore the stack pointer. Chaining on the load's output chain keeps the
  // back-chain read ordered before the SP update.
  Chain = DAG.getCopyToReg(Chain: LoadLinkSP.getValue(R: 1), dl, Reg: SP, N: SaveSP);

  // Store the old link SP through the (now restored) stack pointer.
  return DAG.getStore(Chain, dl, Val: LoadLinkSP, Ptr: StackPtr, PtrInfo: MachinePointerInfo());
}
7901
/// Return a frame index for the return-address (LR) save slot, creating the
/// fixed stack object lazily on first use and caching it in PPCFunctionInfo.
SDValue PPCTargetLowering::getReturnAddrFrameIndex(SelectionDAG &DAG) const {
  MachineFunction &MF = DAG.getMachineFunction();
  bool isPPC64 = Subtarget.isPPC64();
  EVT PtrVT = getPointerTy(DL: MF.getDataLayout());

  // Get the current return-address save index, if one has already been
  // allocated for this function.
  PPCFunctionInfo *FI = MF.getInfo<PPCFunctionInfo>();
  int RASI = FI->getReturnAddrSaveIndex();

  // If the return-address save index hasn't been defined yet.
  if (!RASI) {
    // Find out the fixed offset of the LR save area from the frame lowering.
    int LROffset = Subtarget.getFrameLowering()->getReturnSaveOffset();
    // Allocate the frame index for the return-address save area. The slot is
    // pointer-sized; it is not marked immutable because the prologue/epilogue
    // write to it.
    RASI = MF.getFrameInfo().CreateFixedObject(Size: isPPC64? 8 : 4, SPOffset: LROffset, IsImmutable: false);
    // Save the result.
    FI->setReturnAddrSaveIndex(RASI);
  }
  return DAG.getFrameIndex(FI: RASI, VT: PtrVT);
}
7923
/// Return a frame index for the frame-pointer save slot, creating the fixed
/// stack object lazily on first use and caching it in PPCFunctionInfo.
SDValue
PPCTargetLowering::getFramePointerFrameIndex(SelectionDAG & DAG) const {
  MachineFunction &MF = DAG.getMachineFunction();
  bool isPPC64 = Subtarget.isPPC64();
  EVT PtrVT = getPointerTy(DL: MF.getDataLayout());

  // Get current frame pointer save index. The users of this index will be
  // primarily DYNALLOC instructions.
  PPCFunctionInfo *FI = MF.getInfo<PPCFunctionInfo>();
  int FPSI = FI->getFramePointerSaveIndex();

  // If the frame pointer save index hasn't been defined yet.
  if (!FPSI) {
    // Find out the fixed offset of the frame-pointer save area.
    int FPOffset = Subtarget.getFrameLowering()->getFramePointerSaveOffset();
    // Allocate the frame index for frame pointer save area. Unlike the
    // return-address slot, this one is marked immutable.
    FPSI = MF.getFrameInfo().CreateFixedObject(Size: isPPC64? 8 : 4, SPOffset: FPOffset, IsImmutable: true);
    // Save the result.
    FI->setFramePointerSaveIndex(FPSI);
  }
  return DAG.getFrameIndex(FI: FPSI, VT: PtrVT);
}
7946
/// Lower ISD::DYNAMIC_STACKALLOC to a PPC-specific DYNALLOC (or
/// PROBED_ALLOCA when inline stack probing is required) node. The size is
/// negated because the stack grows downward, and the frame-pointer save index
/// is carried along so frame lowering can maintain the back-chain.
SDValue PPCTargetLowering::LowerDYNAMIC_STACKALLOC(SDValue Op,
                                                   SelectionDAG &DAG) const {
  MachineFunction &MF = DAG.getMachineFunction();
  // Get the inputs.
  SDValue Chain = Op.getOperand(i: 0);
  SDValue Size = Op.getOperand(i: 1);
  SDLoc dl(Op);

  // Get the correct type for pointers.
  EVT PtrVT = getPointerTy(DL: DAG.getDataLayout());
  // Negate the size (stack grows down, so we subtract from SP).
  SDValue NegSize = DAG.getNode(Opcode: ISD::SUB, DL: dl, VT: PtrVT,
                                N1: DAG.getConstant(Val: 0, DL: dl, VT: PtrVT), N2: Size);
  // Construct a node for the frame pointer save index.
  SDValue FPSIdx = getFramePointerFrameIndex(DAG);
  SDValue Ops[3] = { Chain, NegSize, FPSIdx };
  SDVTList VTs = DAG.getVTList(VT1: PtrVT, VT2: MVT::Other);
  // Use the probing variant when the target requires inline stack probes
  // (e.g. -fstack-clash-protection style allocation).
  if (hasInlineStackProbe(MF))
    return DAG.getNode(Opcode: PPCISD::PROBED_ALLOCA, DL: dl, VTList: VTs, Ops);
  return DAG.getNode(Opcode: PPCISD::DYNALLOC, DL: dl, VTList: VTs, Ops);
}
7968
/// Lower ISD::EH_DWARF_CFA by materializing a fixed stack object at SP
/// offset 0; its frame index stands for the canonical frame address used by
/// exception handling.
SDValue PPCTargetLowering::LowerEH_DWARF_CFA(SDValue Op,
                                             SelectionDAG &DAG) const {
  MachineFunction &MF = DAG.getMachineFunction();

  bool isPPC64 = Subtarget.isPPC64();
  EVT PtrVT = getPointerTy(DL: DAG.getDataLayout());

  // Pointer-sized object at offset 0 from the incoming SP.
  int FI = MF.getFrameInfo().CreateFixedObject(Size: isPPC64 ? 8 : 4, SPOffset: 0, IsImmutable: false);
  return DAG.getFrameIndex(FI, VT: PtrVT);
}
7979
/// Lower ISD::EH_SJLJ_SETJMP to the target node; it produces the i32 setjmp
/// result plus an output chain, taking the chain and the jump-buffer address.
SDValue PPCTargetLowering::lowerEH_SJLJ_SETJMP(SDValue Op,
                                               SelectionDAG &DAG) const {
  SDLoc DL(Op);
  return DAG.getNode(Opcode: PPCISD::EH_SJLJ_SETJMP, DL,
                     VTList: DAG.getVTList(VT1: MVT::i32, VT2: MVT::Other),
                     N1: Op.getOperand(i: 0), N2: Op.getOperand(i: 1));
}
7987
/// Lower ISD::EH_SJLJ_LONGJMP to the target node; it never returns, so the
/// only result is the chain. Operands are the chain and the jump-buffer
/// address.
SDValue PPCTargetLowering::lowerEH_SJLJ_LONGJMP(SDValue Op,
                                                SelectionDAG &DAG) const {
  SDLoc DL(Op);
  return DAG.getNode(Opcode: PPCISD::EH_SJLJ_LONGJMP, DL, VT: MVT::Other,
                     N1: Op.getOperand(i: 0), N2: Op.getOperand(i: 1));
}
7994
/// Custom-lower loads. Vector loads are dispatched to LowerVectorLoad; the
/// only scalar type custom-lowered here is i1, which is widened to an i8
/// any-extending load into a pointer-sized register and then truncated back
/// to i1 (there is no 1-bit memory access on PPC).
SDValue PPCTargetLowering::LowerLOAD(SDValue Op, SelectionDAG &DAG) const {
  if (Op.getValueType().isVector())
    return LowerVectorLoad(Op, DAG);

  assert(Op.getValueType() == MVT::i1 &&
         "Custom lowering only for i1 loads");

  // First, load 8 bits into 32 bits, then truncate to 1 bit.

  SDLoc dl(Op);
  LoadSDNode *LD = cast<LoadSDNode>(Val&: Op);

  SDValue Chain = LD->getChain();
  SDValue BasePtr = LD->getBasePtr();
  // Reuse the original memory operand so alias info/atomicity are preserved.
  MachineMemOperand *MMO = LD->getMemOperand();

  SDValue NewLD =
      DAG.getExtLoad(ExtType: ISD::EXTLOAD, dl, VT: getPointerTy(DL: DAG.getDataLayout()), Chain,
                     Ptr: BasePtr, MemVT: MVT::i8, MMO);
  SDValue Result = DAG.getNode(Opcode: ISD::TRUNCATE, DL: dl, VT: MVT::i1, Operand: NewLD);

  // Return both the value and the new load's output chain.
  SDValue Ops[] = { Result, SDValue(NewLD.getNode(), 1) };
  return DAG.getMergeValues(Ops, dl);
}
8019
/// Custom-lower stores. Vector stores are dispatched to LowerVectorStore; the
/// only scalar type custom-lowered here is i1, which is zero-extended to a
/// pointer-sized integer and stored with a truncating i8 store.
SDValue PPCTargetLowering::LowerSTORE(SDValue Op, SelectionDAG &DAG) const {
  if (Op.getOperand(i: 1).getValueType().isVector())
    return LowerVectorStore(Op, DAG);

  assert(Op.getOperand(1).getValueType() == MVT::i1 &&
         "Custom lowering only for i1 stores");

  // First, zero extend to 32 bits, then use a truncating store to 8 bits.

  SDLoc dl(Op);
  StoreSDNode *ST = cast<StoreSDNode>(Val&: Op);

  SDValue Chain = ST->getChain();
  SDValue BasePtr = ST->getBasePtr();
  SDValue Value = ST->getValue();
  // Reuse the original memory operand so alias info/atomicity are preserved.
  MachineMemOperand *MMO = ST->getMemOperand();

  Value = DAG.getNode(Opcode: ISD::ZERO_EXTEND, DL: dl, VT: getPointerTy(DL: DAG.getDataLayout()),
                      Operand: Value);
  return DAG.getTruncStore(Chain, dl, Val: Value, Ptr: BasePtr, SVT: MVT::i8, MMO);
}
8041
// FIXME: Remove this once the ANDI glue bug is fixed:
/// Custom-lower TRUNCATE to i1 via PPCISD::ANDI_rec_1_GT_BIT, which extracts
/// the low bit through a recording AND rather than a plain truncate.
SDValue PPCTargetLowering::LowerTRUNCATE(SDValue Op, SelectionDAG &DAG) const {
  assert(Op.getValueType() == MVT::i1 &&
         "Custom lowering only for i1 results");

  SDLoc DL(Op);
  return DAG.getNode(Opcode: PPCISD::ANDI_rec_1_GT_BIT, DL, VT: MVT::i1, Operand: Op.getOperand(i: 0));
}
8050
/// Custom-lower a sub-legal vector truncate as a single vector shuffle that
/// gathers the kept (low) bytes/halves of each source element.
SDValue PPCTargetLowering::LowerTRUNCATEVector(SDValue Op,
                                               SelectionDAG &DAG) const {

  // Implements a vector truncate that fits in a vector register as a shuffle.
  // We want to legalize vector truncates down to where the source fits in
  // a vector register (and target is therefore smaller than vector register
  // size). At that point legalization will try to custom lower the sub-legal
  // result and get here - where we can contain the truncate as a single target
  // operation.

  // For example a trunc <2 x i16> to <2 x i8> could be visualized as follows:
  //   <MSB1|LSB1, MSB2|LSB2> to <LSB1, LSB2>
  //
  // We will implement it for big-endian ordering as this (where x denotes
  // undefined):
  //   < MSB1|LSB1, MSB2|LSB2, uu, uu, uu, uu, uu, uu> to
  //   < LSB1, LSB2, u, u, u, u, u, u, u, u, u, u, u, u, u, u>
  //
  // The same operation in little-endian ordering will be:
  //   <uu, uu, uu, uu, uu, uu, LSB2|MSB2, LSB1|MSB1> to
  //   <u, u, u, u, u, u, u, u, u, u, u, u, u, u, LSB2, LSB1>

  EVT TrgVT = Op.getValueType();
  assert(TrgVT.isVector() && "Vector type expected.");
  unsigned TrgNumElts = TrgVT.getVectorNumElements();
  EVT EltVT = TrgVT.getVectorElementType();
  // Bail out unless this is exactly the sub-legal case described above:
  // custom-marked, fits in (at most) one vector register, with power-of-two
  // element counts and sizes.
  if (!isOperationCustom(Op: Op.getOpcode(), VT: TrgVT) ||
      TrgVT.getSizeInBits() > 128 || !isPowerOf2_32(Value: TrgNumElts) ||
      !llvm::has_single_bit<uint32_t>(Value: EltVT.getSizeInBits()))
    return SDValue();

  SDValue N1 = Op.getOperand(i: 0);
  EVT SrcVT = N1.getValueType();
  unsigned SrcSize = SrcVT.getSizeInBits();
  // Source must be at most two vector registers wide, again with
  // power-of-two element counts and sizes.
  if (SrcSize > 256 || !isPowerOf2_32(Value: SrcVT.getVectorNumElements()) ||
      !llvm::has_single_bit<uint32_t>(
          Value: SrcVT.getVectorElementType().getSizeInBits()))
    return SDValue();
  if (SrcSize == 256 && SrcVT.getVectorNumElements() < 2)
    return SDValue();

  unsigned WideNumElts = 128 / EltVT.getSizeInBits();
  EVT WideVT = EVT::getVectorVT(Context&: *DAG.getContext(), VT: EltVT, NumElements: WideNumElts);

  SDLoc DL(Op);
  SDValue Op1, Op2;
  if (SrcSize == 256) {
    // 256-bit source: split it into two 128-bit halves and shuffle across
    // both of them.
    EVT VecIdxTy = getVectorIdxTy(DL: DAG.getDataLayout());
    EVT SplitVT =
        N1.getValueType().getHalfNumVectorElementsVT(Context&: *DAG.getContext());
    unsigned SplitNumElts = SplitVT.getVectorNumElements();
    Op1 = DAG.getNode(Opcode: ISD::EXTRACT_SUBVECTOR, DL, VT: SplitVT, N1,
                      N2: DAG.getConstant(Val: 0, DL, VT: VecIdxTy));
    Op2 = DAG.getNode(Opcode: ISD::EXTRACT_SUBVECTOR, DL, VT: SplitVT, N1,
                      N2: DAG.getConstant(Val: SplitNumElts, DL, VT: VecIdxTy));
  }
  else {
    // <=128-bit source: widen to a full register if needed; the second
    // shuffle input is unused.
    Op1 = SrcSize == 128 ? N1 : widenVec(DAG, Vec: N1, dl: DL);
    Op2 = DAG.getUNDEF(VT: WideVT);
  }

  // First list the elements we want to keep. On little-endian the low
  // (kept) part of each source element comes first; on big-endian it is the
  // last sub-element, hence the `* SizeMult - 1` indexing.
  unsigned SizeMult = SrcSize / TrgVT.getSizeInBits();
  SmallVector<int, 16> ShuffV;
  if (Subtarget.isLittleEndian())
    for (unsigned i = 0; i < TrgNumElts; ++i)
      ShuffV.push_back(Elt: i * SizeMult);
  else
    for (unsigned i = 1; i <= TrgNumElts; ++i)
      ShuffV.push_back(Elt: i * SizeMult - 1);

  // Populate the remaining elements with don't-care lanes; any in-range mask
  // index works here since callers only read the first TrgNumElts elements.
  for (unsigned i = TrgNumElts; i < WideNumElts; ++i)
    // ShuffV.push_back(i + WideNumElts);
    ShuffV.push_back(Elt: WideNumElts + 1);

  Op1 = DAG.getNode(Opcode: ISD::BITCAST, DL, VT: WideVT, Operand: Op1);
  Op2 = DAG.getNode(Opcode: ISD::BITCAST, DL, VT: WideVT, Operand: Op2);
  return DAG.getVectorShuffle(VT: WideVT, dl: DL, N1: Op1, N2: Op2, Mask: ShuffV);
}
8131
/// LowerSELECT_CC - Lower floating point select_cc's into fsel instruction when
/// possible. fsel natively computes "A >= 0 ? B : C", so the comparisons below
/// are rewritten in terms of a (possibly negated) subtraction against zero.
SDValue PPCTargetLowering::LowerSELECT_CC(SDValue Op, SelectionDAG &DAG) const {
  ISD::CondCode CC = cast<CondCodeSDNode>(Val: Op.getOperand(i: 4))->get();
  EVT ResVT = Op.getValueType();
  EVT CmpVT = Op.getOperand(i: 0).getValueType();
  SDValue LHS = Op.getOperand(i: 0), RHS = Op.getOperand(i: 1);
  SDValue TV = Op.getOperand(i: 2), FV = Op.getOperand(i: 3);
  SDLoc dl(Op);

  // Without power9-vector, we don't have native instruction for f128 comparison.
  // Following transformation to libcall is needed for setcc:
  // select_cc lhs, rhs, tv, fv, cc -> select_cc (setcc cc, x, y), 0, tv, fv, NE
  if (!Subtarget.hasP9Vector() && CmpVT == MVT::f128) {
    SDValue Z = DAG.getSetCC(
        DL: dl, VT: getSetCCResultType(DL: DAG.getDataLayout(), C&: *DAG.getContext(), VT: CmpVT),
        LHS, RHS, Cond: CC);
    SDValue Zero = DAG.getConstant(Val: 0, DL: dl, VT: Z.getValueType());
    return DAG.getSelectCC(DL: dl, LHS: Z, RHS: Zero, True: TV, False: FV, Cond: ISD::SETNE);
  }

  // Not FP, or using SPE? Not a fsel.
  if (!CmpVT.isFloatingPoint() || !TV.getValueType().isFloatingPoint() ||
      Subtarget.hasSPE())
    return Op;

  SDNodeFlags Flags = Op.getNode()->getFlags();

  // We have xsmaxc[dq]p/xsminc[dq]p which are OK to emit even in the
  // presence of infinities.
  if (Subtarget.hasP9Vector() && LHS == TV && RHS == FV) {
    switch (CC) {
    default:
      break;
    case ISD::SETOGT:
    case ISD::SETGT:
      return DAG.getNode(Opcode: PPCISD::XSMAXC, DL: dl, VT: Op.getValueType(), N1: LHS, N2: RHS);
    case ISD::SETOLT:
    case ISD::SETLT:
      return DAG.getNode(Opcode: PPCISD::XSMINC, DL: dl, VT: Op.getValueType(), N1: LHS, N2: RHS);
    }
  }

  // We might be able to do better than this under some circumstances, but in
  // general, fsel-based lowering of select is a finite-math-only optimization.
  // For more information, see section F.3 of the 2.06 ISA specification.
  // With ISA 3.0
  if (!Flags.hasNoInfs() ||
      (!DAG.getTarget().Options.NoNaNsFPMath && !Flags.hasNoNaNs()) ||
      ResVT == MVT::f128)
    return Op;

  // If the RHS of the comparison is a 0.0, we don't need to do the
  // subtraction at all.
  SDValue Sel1;
  if (isFloatingPointZero(Op: RHS))
    switch (CC) {
    default: break;       // SETUO etc aren't handled by fsel.
    case ISD::SETNE:
      std::swap(a&: TV, b&: FV);
      [[fallthrough]];
    case ISD::SETEQ:
      if (LHS.getValueType() == MVT::f32)   // Comparison is always 64-bits
        LHS = DAG.getNode(Opcode: ISD::FP_EXTEND, DL: dl, VT: MVT::f64, Operand: LHS);
      // EQ is implemented as (LHS >= 0) && (-LHS >= 0) via two nested fsels.
      Sel1 = DAG.getNode(Opcode: PPCISD::FSEL, DL: dl, VT: ResVT, N1: LHS, N2: TV, N3: FV);
      if (Sel1.getValueType() == MVT::f32)   // Comparison is always 64-bits
        Sel1 = DAG.getNode(Opcode: ISD::FP_EXTEND, DL: dl, VT: MVT::f64, Operand: Sel1);
      return DAG.getNode(Opcode: PPCISD::FSEL, DL: dl, VT: ResVT,
                         N1: DAG.getNode(Opcode: ISD::FNEG, DL: dl, VT: MVT::f64, Operand: LHS), N2: Sel1, N3: FV);
    case ISD::SETULT:
    case ISD::SETLT:
      std::swap(a&: TV, b&: FV);  // fsel is natively setge, swap operands for setlt
      [[fallthrough]];
    case ISD::SETOGE:
    case ISD::SETGE:
      if (LHS.getValueType() == MVT::f32)   // Comparison is always 64-bits
        LHS = DAG.getNode(Opcode: ISD::FP_EXTEND, DL: dl, VT: MVT::f64, Operand: LHS);
      return DAG.getNode(Opcode: PPCISD::FSEL, DL: dl, VT: ResVT, N1: LHS, N2: TV, N3: FV);
    case ISD::SETUGT:
    case ISD::SETGT:
      std::swap(a&: TV, b&: FV);  // fsel is natively setge, swap operands for setgt
      [[fallthrough]];
    case ISD::SETOLE:
    case ISD::SETLE:
      if (LHS.getValueType() == MVT::f32)   // Comparison is always 64-bits
        LHS = DAG.getNode(Opcode: ISD::FP_EXTEND, DL: dl, VT: MVT::f64, Operand: LHS);
      // LE against zero is (-LHS >= 0).
      return DAG.getNode(Opcode: PPCISD::FSEL, DL: dl, VT: ResVT,
                         N1: DAG.getNode(Opcode: ISD::FNEG, DL: dl, VT: MVT::f64, Operand: LHS), N2: TV, N3: FV);
    }

  // General case: compare via a subtraction (valid only under the
  // no-NaNs/no-infs flags checked above).
  SDValue Cmp;
  switch (CC) {
  default: break;       // SETUO etc aren't handled by fsel.
  case ISD::SETNE:
    std::swap(a&: TV, b&: FV);
    [[fallthrough]];
  case ISD::SETEQ:
    Cmp = DAG.getNode(Opcode: ISD::FSUB, DL: dl, VT: CmpVT, N1: LHS, N2: RHS, Flags);
    if (Cmp.getValueType() == MVT::f32)   // Comparison is always 64-bits
      Cmp = DAG.getNode(Opcode: ISD::FP_EXTEND, DL: dl, VT: MVT::f64, Operand: Cmp);
    Sel1 = DAG.getNode(Opcode: PPCISD::FSEL, DL: dl, VT: ResVT, N1: Cmp, N2: TV, N3: FV);
    if (Sel1.getValueType() == MVT::f32)   // Comparison is always 64-bits
      Sel1 = DAG.getNode(Opcode: ISD::FP_EXTEND, DL: dl, VT: MVT::f64, Operand: Sel1);
    return DAG.getNode(Opcode: PPCISD::FSEL, DL: dl, VT: ResVT,
                       N1: DAG.getNode(Opcode: ISD::FNEG, DL: dl, VT: MVT::f64, Operand: Cmp), N2: Sel1, N3: FV);
  case ISD::SETULT:
  case ISD::SETLT:
    Cmp = DAG.getNode(Opcode: ISD::FSUB, DL: dl, VT: CmpVT, N1: LHS, N2: RHS, Flags);
    if (Cmp.getValueType() == MVT::f32)   // Comparison is always 64-bits
      Cmp = DAG.getNode(Opcode: ISD::FP_EXTEND, DL: dl, VT: MVT::f64, Operand: Cmp);
    return DAG.getNode(Opcode: PPCISD::FSEL, DL: dl, VT: ResVT, N1: Cmp, N2: FV, N3: TV);
  case ISD::SETOGE:
  case ISD::SETGE:
    Cmp = DAG.getNode(Opcode: ISD::FSUB, DL: dl, VT: CmpVT, N1: LHS, N2: RHS, Flags);
    if (Cmp.getValueType() == MVT::f32)   // Comparison is always 64-bits
      Cmp = DAG.getNode(Opcode: ISD::FP_EXTEND, DL: dl, VT: MVT::f64, Operand: Cmp);
    return DAG.getNode(Opcode: PPCISD::FSEL, DL: dl, VT: ResVT, N1: Cmp, N2: TV, N3: FV);
  case ISD::SETUGT:
  case ISD::SETGT:
    Cmp = DAG.getNode(Opcode: ISD::FSUB, DL: dl, VT: CmpVT, N1: RHS, N2: LHS, Flags);
    if (Cmp.getValueType() == MVT::f32)   // Comparison is always 64-bits
      Cmp = DAG.getNode(Opcode: ISD::FP_EXTEND, DL: dl, VT: MVT::f64, Operand: Cmp);
    return DAG.getNode(Opcode: PPCISD::FSEL, DL: dl, VT: ResVT, N1: Cmp, N2: FV, N3: TV);
  case ISD::SETOLE:
  case ISD::SETLE:
    Cmp = DAG.getNode(Opcode: ISD::FSUB, DL: dl, VT: CmpVT, N1: RHS, N2: LHS, Flags);
    if (Cmp.getValueType() == MVT::f32)   // Comparison is always 64-bits
      Cmp = DAG.getNode(Opcode: ISD::FP_EXTEND, DL: dl, VT: MVT::f64, Operand: Cmp);
    return DAG.getNode(Opcode: PPCISD::FSEL, DL: dl, VT: ResVT, N1: Cmp, N2: TV, N3: FV);
  }
  return Op;
}
8264
/// Map a non-strict PPC FP-conversion opcode to its strict (chained,
/// exception-aware) counterpart. Asserts on opcodes with no strict version.
static unsigned getPPCStrictOpcode(unsigned Opc) {
  switch (Opc) {
  default:
    llvm_unreachable("No strict version of this opcode!");
  case PPCISD::FCTIDZ:
    return PPCISD::STRICT_FCTIDZ;
  case PPCISD::FCTIWZ:
    return PPCISD::STRICT_FCTIWZ;
  case PPCISD::FCTIDUZ:
    return PPCISD::STRICT_FCTIDUZ;
  case PPCISD::FCTIWUZ:
    return PPCISD::STRICT_FCTIWUZ;
  case PPCISD::FCFID:
    return PPCISD::STRICT_FCFID;
  case PPCISD::FCFIDU:
    return PPCISD::STRICT_FCFIDU;
  case PPCISD::FCFIDS:
    return PPCISD::STRICT_FCFIDS;
  case PPCISD::FCFIDUS:
    return PPCISD::STRICT_FCFIDUS;
  }
}
8287
/// Emit the PPC fctiw/fctid-family node that converts an FP value to an
/// integer held in an FP register. Handles both strict and non-strict forms;
/// for strict nodes the result's second value is the output chain. Note that
/// the result is still in an FP/vector register — the caller is responsible
/// for moving it to a GPR or memory.
static SDValue convertFPToInt(SDValue Op, SelectionDAG &DAG,
                              const PPCSubtarget &Subtarget) {
  SDLoc dl(Op);
  bool IsStrict = Op->isStrictFPOpcode();
  bool IsSigned = Op.getOpcode() == ISD::FP_TO_SINT ||
                  Op.getOpcode() == ISD::STRICT_FP_TO_SINT;

  // TODO: Any other flags to propagate?
  SDNodeFlags Flags;
  Flags.setNoFPExcept(Op->getFlags().hasNoFPExcept());

  // For strict nodes, source is the second operand.
  SDValue Src = Op.getOperand(i: IsStrict ? 1 : 0);
  SDValue Chain = IsStrict ? Op.getOperand(i: 0) : SDValue();
  MVT DestTy = Op.getSimpleValueType();
  assert(Src.getValueType().isFloatingPoint() &&
         (DestTy == MVT::i8 || DestTy == MVT::i16 || DestTy == MVT::i32 ||
          DestTy == MVT::i64) &&
         "Invalid FP_TO_INT types");
  // The conversion instructions operate on f64; widen f32 sources first.
  if (Src.getValueType() == MVT::f32) {
    if (IsStrict) {
      Src =
          DAG.getNode(Opcode: ISD::STRICT_FP_EXTEND, DL: dl,
                      VTList: DAG.getVTList(VT1: MVT::f64, VT2: MVT::Other), Ops: {Chain, Src}, Flags);
      Chain = Src.getValue(R: 1);
    } else
      Src = DAG.getNode(Opcode: ISD::FP_EXTEND, DL: dl, VT: MVT::f64, Operand: Src);
  }
  // With P9 vector insns, narrow destinations are computed at the native
  // scalar integer width.
  if ((DestTy == MVT::i8 || DestTy == MVT::i16) && Subtarget.hasP9Vector())
    DestTy = Subtarget.getScalarIntVT();
  unsigned Opc = ISD::DELETED_NODE;
  switch (DestTy.SimpleTy) {
  default: llvm_unreachable("Unhandled FP_TO_INT type in custom expander!");
  case MVT::i32:
    // Without FPCVT there is no fctiwuz; use fctidz and let the truncation to
    // i32 discard the high bits.
    Opc = IsSigned ? PPCISD::FCTIWZ
                   : (Subtarget.hasFPCVT() ? PPCISD::FCTIWUZ : PPCISD::FCTIDZ);
    break;
  case MVT::i64:
    assert((IsSigned || Subtarget.hasFPCVT()) &&
           "i64 FP_TO_UINT is supported only with FPCVT");
    Opc = IsSigned ? PPCISD::FCTIDZ : PPCISD::FCTIDUZ;
  }
  EVT ConvTy = Src.getValueType() == MVT::f128 ? MVT::f128 : MVT::f64;
  SDValue Conv;
  if (IsStrict) {
    Opc = getPPCStrictOpcode(Opc);
    Conv = DAG.getNode(Opcode: Opc, DL: dl, VTList: DAG.getVTList(VT1: ConvTy, VT2: MVT::Other), Ops: {Chain, Src},
                       Flags);
  } else {
    Conv = DAG.getNode(Opcode: Opc, DL: dl, VT: ConvTy, Operand: Src);
  }
  return Conv;
}
8341
/// Convert FP to integer via a store/reload through a stack slot, filling in
/// RLI with the chain, pointer, pointer info, and alignment of the resulting
/// slot so callers can emit (and possibly reuse) the load themselves.
void PPCTargetLowering::LowerFP_TO_INTForReuse(SDValue Op, ReuseLoadInfo &RLI,
                                               SelectionDAG &DAG,
                                               const SDLoc &dl) const {
  SDValue Tmp = convertFPToInt(Op, DAG, Subtarget);
  bool IsSigned = Op.getOpcode() == ISD::FP_TO_SINT ||
                  Op.getOpcode() == ISD::STRICT_FP_TO_SINT;
  bool IsStrict = Op->isStrictFPOpcode();

  // Convert the FP value to an int value through memory. With STFIWX we can
  // store the low 32 bits of the FPR directly into an i32 slot.
  bool i32Stack = Op.getValueType() == MVT::i32 && Subtarget.hasSTFIWX() &&
                  (IsSigned || Subtarget.hasFPCVT());
  SDValue FIPtr = DAG.CreateStackTemporary(VT: i32Stack ? MVT::i32 : MVT::f64);
  int FI = cast<FrameIndexSDNode>(Val&: FIPtr)->getIndex();
  MachinePointerInfo MPI =
      MachinePointerInfo::getFixedStack(MF&: DAG.getMachineFunction(), FI);

  // Emit a store to the stack slot. For strict ops, chain behind the
  // conversion's output chain.
  SDValue Chain = IsStrict ? Tmp.getValue(R: 1) : DAG.getEntryNode();
  Align Alignment(DAG.getEVTAlign(MemoryVT: Tmp.getValueType()));
  if (i32Stack) {
    MachineFunction &MF = DAG.getMachineFunction();
    Alignment = Align(4);
    MachineMemOperand *MMO =
        MF.getMachineMemOperand(PtrInfo: MPI, F: MachineMemOperand::MOStore, Size: 4, BaseAlignment: Alignment);
    SDValue Ops[] = { Chain, Tmp, FIPtr };
    Chain = DAG.getMemIntrinsicNode(Opcode: PPCISD::STFIWX, dl,
              VTList: DAG.getVTList(VT: MVT::Other), Ops, MemVT: MVT::i32, MMO);
  } else
    Chain = DAG.getStore(Chain, dl, Val: Tmp, Ptr: FIPtr, PtrInfo: MPI, Alignment);

  // Result is a load from the stack slot. If loading 4 bytes, make sure to
  // add in a bias on big endian (the i32 lives in the high-address half of
  // the f64 slot).
  if (Op.getValueType() == MVT::i32 && !i32Stack &&
      !Subtarget.isLittleEndian()) {
    FIPtr = DAG.getNode(Opcode: ISD::ADD, DL: dl, VT: FIPtr.getValueType(), N1: FIPtr,
                        N2: DAG.getConstant(Val: 4, DL: dl, VT: FIPtr.getValueType()));
    MPI = MPI.getWithOffset(O: 4);
  }

  RLI.Chain = Chain;
  RLI.Ptr = FIPtr;
  RLI.MPI = MPI;
  RLI.Alignment = Alignment;
}
8386
/// Custom lowers floating point to integer conversions to use
/// the direct move instructions available in ISA 2.07 to avoid the
/// need for load/store combinations.
SDValue PPCTargetLowering::LowerFP_TO_INTDirectMove(SDValue Op,
                                                    SelectionDAG &DAG,
                                                    const SDLoc &dl) const {
  // Convert in an FP register, then move the bits to a GPR with mfvsr*.
  SDValue Conv = convertFPToInt(Op, DAG, Subtarget);
  SDValue Mov = DAG.getNode(Opcode: PPCISD::MFVSR, DL: dl, VT: Op.getValueType(), Operand: Conv);
  // Strict ops must also forward the conversion's output chain.
  if (Op->isStrictFPOpcode())
    return DAG.getMergeValues(Ops: {Mov, Conv.getValue(R: 1)}, dl);
  else
    return Mov;
}
8400
/// Custom-lower FP_TO_SINT/FP_TO_UINT (and their strict variants). f128 is
/// legal with P9 vector; ppcf128-to-i32 is expanded by hand; otherwise the
/// conversion goes through a direct move (ISA 2.07+) or a stack slot.
SDValue PPCTargetLowering::LowerFP_TO_INT(SDValue Op, SelectionDAG &DAG,
                                          const SDLoc &dl) const {
  bool IsStrict = Op->isStrictFPOpcode();
  bool IsSigned = Op.getOpcode() == ISD::FP_TO_SINT ||
                  Op.getOpcode() == ISD::STRICT_FP_TO_SINT;
  SDValue Src = Op.getOperand(i: IsStrict ? 1 : 0);
  EVT SrcVT = Src.getValueType();
  EVT DstVT = Op.getValueType();

  // FP to INT conversions are legal for f128.
  if (SrcVT == MVT::f128)
    return Subtarget.hasP9Vector() ? Op : SDValue();

  // Expand ppcf128 to i32 by hand for the benefit of llvm-gcc bootstrap on
  // PPC (the libcall is not available).
  if (SrcVT == MVT::ppcf128) {
    if (DstVT == MVT::i32) {
      // TODO: Conservatively pass only nofpexcept flag here. Need to check and
      // set other fast-math flags to FP operations in both strict and
      // non-strict cases. (FP_TO_SINT, FSUB)
      SDNodeFlags Flags;
      Flags.setNoFPExcept(Op->getFlags().hasNoFPExcept());

      if (IsSigned) {
        SDValue Lo, Hi;
        std::tie(args&: Lo, args&: Hi) = DAG.SplitScalar(N: Src, DL: dl, LoVT: MVT::f64, HiVT: MVT::f64);

        // Add the two halves of the long double in round-to-zero mode, and use
        // a smaller FP_TO_SINT.
        if (IsStrict) {
          SDValue Res = DAG.getNode(Opcode: PPCISD::STRICT_FADDRTZ, DL: dl,
                                    VTList: DAG.getVTList(VT1: MVT::f64, VT2: MVT::Other),
                                    Ops: {Op.getOperand(i: 0), Lo, Hi}, Flags);
          return DAG.getNode(Opcode: ISD::STRICT_FP_TO_SINT, DL: dl,
                             VTList: DAG.getVTList(VT1: MVT::i32, VT2: MVT::Other),
                             Ops: {Res.getValue(R: 1), Res}, Flags);
        } else {
          SDValue Res = DAG.getNode(Opcode: PPCISD::FADDRTZ, DL: dl, VT: MVT::f64, N1: Lo, N2: Hi);
          return DAG.getNode(Opcode: ISD::FP_TO_SINT, DL: dl, VT: MVT::i32, Operand: Res);
        }
      } else {
        // Unsigned: bias by 2^31 when the value is >= 2^31, then flip the
        // sign bit of the signed conversion result.
        const uint64_t TwoE31[] = {0x41e0000000000000LL, 0};
        APFloat APF = APFloat(APFloat::PPCDoubleDouble(), APInt(128, TwoE31));
        SDValue Cst = DAG.getConstantFP(Val: APF, DL: dl, VT: SrcVT);
        SDValue SignMask = DAG.getConstant(Val: 0x80000000, DL: dl, VT: DstVT);
        if (IsStrict) {
          // Sel = Src < 0x80000000
          // FltOfs = select Sel, 0.0, 0x80000000
          // IntOfs = select Sel, 0, 0x80000000
          // Result = fp_to_sint(Src - FltOfs) ^ IntOfs
          SDValue Chain = Op.getOperand(i: 0);
          EVT SetCCVT =
              getSetCCResultType(DL: DAG.getDataLayout(), C&: *DAG.getContext(), VT: SrcVT);
          EVT DstSetCCVT =
              getSetCCResultType(DL: DAG.getDataLayout(), C&: *DAG.getContext(), VT: DstVT);
          SDValue Sel = DAG.getSetCC(DL: dl, VT: SetCCVT, LHS: Src, RHS: Cst, Cond: ISD::SETLT,
                                     Chain, IsSignaling: true);
          Chain = Sel.getValue(R: 1);

          SDValue FltOfs = DAG.getSelect(
              DL: dl, VT: SrcVT, Cond: Sel, LHS: DAG.getConstantFP(Val: 0.0, DL: dl, VT: SrcVT), RHS: Cst);
          Sel = DAG.getBoolExtOrTrunc(Op: Sel, SL: dl, VT: DstSetCCVT, OpVT: DstVT);

          SDValue Val = DAG.getNode(Opcode: ISD::STRICT_FSUB, DL: dl,
                                    VTList: DAG.getVTList(VT1: SrcVT, VT2: MVT::Other),
                                    Ops: {Chain, Src, FltOfs}, Flags);
          Chain = Val.getValue(R: 1);
          SDValue SInt = DAG.getNode(Opcode: ISD::STRICT_FP_TO_SINT, DL: dl,
                                     VTList: DAG.getVTList(VT1: DstVT, VT2: MVT::Other),
                                     Ops: {Chain, Val}, Flags);
          Chain = SInt.getValue(R: 1);
          SDValue IntOfs = DAG.getSelect(
              DL: dl, VT: DstVT, Cond: Sel, LHS: DAG.getConstant(Val: 0, DL: dl, VT: DstVT), RHS: SignMask);
          SDValue Result = DAG.getNode(Opcode: ISD::XOR, DL: dl, VT: DstVT, N1: SInt, N2: IntOfs);
          return DAG.getMergeValues(Ops: {Result, Chain}, dl);
        } else {
          // X>=2^31 ? (int)(X-2^31)+0x80000000 : (int)X
          // FIXME: generated code sucks.
          SDValue True = DAG.getNode(Opcode: ISD::FSUB, DL: dl, VT: MVT::ppcf128, N1: Src, N2: Cst);
          True = DAG.getNode(Opcode: ISD::FP_TO_SINT, DL: dl, VT: MVT::i32, Operand: True);
          True = DAG.getNode(Opcode: ISD::ADD, DL: dl, VT: MVT::i32, N1: True, N2: SignMask);
          SDValue False = DAG.getNode(Opcode: ISD::FP_TO_SINT, DL: dl, VT: MVT::i32, Operand: Src);
          return DAG.getSelectCC(DL: dl, LHS: Src, RHS: Cst, True, False, Cond: ISD::SETGE);
        }
      }
    }

    // Other ppcf128 destinations are left to generic expansion.
    return SDValue();
  }

  if (Subtarget.hasDirectMove() && Subtarget.isPPC64())
    return LowerFP_TO_INTDirectMove(Op, DAG, dl);

  // Fallback: go through a stack slot, then load the integer result.
  ReuseLoadInfo RLI;
  LowerFP_TO_INTForReuse(Op, RLI, DAG, dl);

  return DAG.getLoad(VT: Op.getValueType(), dl, Chain: RLI.Chain, Ptr: RLI.Ptr, PtrInfo: RLI.MPI,
                     Alignment: RLI.Alignment, MMOFlags: RLI.MMOFlags(), AAInfo: RLI.AAInfo, Ranges: RLI.Ranges);
}
8500
// We're trying to insert a regular store, S, and then a load, L. If the
// incoming value, O, is a load, we might just be able to have our load use the
// address used by O. However, we don't know if anything else will store to
// that address before we can load from it. To prevent this situation, we need
// to insert our load, L, into the chain as a peer of O. To do this, we give L
// the same chain operand as O, we create a token factor from the chain results
// of O and L, and we replace all uses of O's chain result with that token
// factor (this last part is handled by makeEquivalentMemoryOrdering).
//
// Returns true and fills in RLI when the address can be reused.
bool PPCTargetLowering::canReuseLoadAddress(SDValue Op, EVT MemVT,
                                            ReuseLoadInfo &RLI,
                                            SelectionDAG &DAG,
                                            ISD::LoadExtType ET) const {
  // Conservatively skip reusing for constrained FP nodes.
  if (Op->isStrictFPOpcode())
    return false;

  SDLoc dl(Op);
  bool ValidFPToUint = Op.getOpcode() == ISD::FP_TO_UINT &&
                       (Subtarget.hasFPCVT() || Op.getValueType() == MVT::i32);
  // An FP-to-int conversion feeding us can itself be lowered through a stack
  // slot whose address we then reuse.
  if (ET == ISD::NON_EXTLOAD &&
      (ValidFPToUint || Op.getOpcode() == ISD::FP_TO_SINT) &&
      isOperationLegalOrCustom(Op: Op.getOpcode(),
                               VT: Op.getOperand(i: 0).getValueType())) {

    LowerFP_TO_INTForReuse(Op, RLI, DAG, dl);
    return true;
  }

  LoadSDNode *LD = dyn_cast<LoadSDNode>(Val&: Op);
  if (!LD || LD->getExtensionType() != ET || LD->isVolatile() ||
      LD->isNonTemporal())
    return false;
  if (LD->getMemoryVT() != MemVT)
    return false;

  // If the result of the load is an illegal type, then we can't build a
  // valid chain for reuse since the legalised loads and token factor node that
  // ties the legalised loads together uses a different output chain then the
  // illegal load.
  if (!isTypeLegal(VT: LD->getValueType(ResNo: 0)))
    return false;

  RLI.Ptr = LD->getBasePtr();
  if (LD->isIndexed() && !LD->getOffset().isUndef()) {
    assert(LD->getAddressingMode() == ISD::PRE_INC &&
           "Non-pre-inc AM on PPC?");
    // Fold the pre-increment offset into the reused address.
    RLI.Ptr = DAG.getNode(Opcode: ISD::ADD, DL: dl, VT: RLI.Ptr.getValueType(), N1: RLI.Ptr,
                          N2: LD->getOffset());
  }

  // Copy over everything the new load needs to be a faithful peer of LD.
  RLI.Chain = LD->getChain();
  RLI.MPI = LD->getPointerInfo();
  RLI.IsDereferenceable = LD->isDereferenceable();
  RLI.IsInvariant = LD->isInvariant();
  RLI.Alignment = LD->getAlign();
  RLI.AAInfo = LD->getAAInfo();
  RLI.Ranges = LD->getRanges();

  // Indexed loads have an extra result (the updated pointer), so the chain is
  // result 2 rather than 1.
  RLI.ResChain = SDValue(LD, LD->isIndexed() ? 2 : 1);
  return true;
}
8562
/// Analyze profitability of direct move
/// prefer float load to int load plus direct move
/// when there is no integer use of int load
bool PPCTargetLowering::directMoveIsProfitable(const SDValue &Op) const {
  SDNode *Origin = Op.getOperand(i: Op->isStrictFPOpcode() ? 1 : 0).getNode();
  // Non-load sources can't be re-loaded as FP; direct move is the only option.
  if (Origin->getOpcode() != ISD::LOAD)
    return true;

  // If there is no LXSIBZX/LXSIHZX, like Power8,
  // prefer direct move if the memory size is 1 or 2 bytes.
  MachineMemOperand *MMO = cast<LoadSDNode>(Val: Origin)->getMemOperand();
  if (!Subtarget.hasP9Vector() &&
      (!MMO->getSize().hasValue() || MMO->getSize().getValue() <= 2))
    return true;

  for (SDUse &Use : Origin->uses()) {

    // Only look at the users of the loaded value.
    if (Use.getResNo() != 0)
      continue;

    // Any non-conversion user means the integer value is needed in a GPR
    // anyway, so the direct move costs nothing extra.
    SDNode *User = Use.getUser();
    if (User->getOpcode() != ISD::SINT_TO_FP &&
        User->getOpcode() != ISD::UINT_TO_FP &&
        User->getOpcode() != ISD::STRICT_SINT_TO_FP &&
        User->getOpcode() != ISD::STRICT_UINT_TO_FP)
      return true;
  }

  // Every user converts to FP: better to load straight into an FP register.
  return false;
}
8594
/// Emit the fcfid-family node converting an integer (already resident in an
/// FP register as \p Src) to floating point. \p Op supplies the opcode,
/// result type and flags; \p Chain optionally overrides the strict-FP input
/// chain (defaulting to Op's operand 0).
static SDValue convertIntToFP(SDValue Op, SDValue Src, SelectionDAG &DAG,
                              const PPCSubtarget &Subtarget,
                              SDValue Chain = SDValue()) {
  bool IsSigned = Op.getOpcode() == ISD::SINT_TO_FP ||
                  Op.getOpcode() == ISD::STRICT_SINT_TO_FP;
  SDLoc dl(Op);

  // TODO: Any other flags to propagate?
  SDNodeFlags Flags;
  Flags.setNoFPExcept(Op->getFlags().hasNoFPExcept());

  // If we have FCFIDS, then use it when converting to single-precision.
  // Otherwise, convert to double-precision and then round.
  bool IsSingle = Op.getValueType() == MVT::f32 && Subtarget.hasFPCVT();
  unsigned ConvOpc = IsSingle ? (IsSigned ? PPCISD::FCFIDS : PPCISD::FCFIDUS)
                              : (IsSigned ? PPCISD::FCFID : PPCISD::FCFIDU);
  EVT ConvTy = IsSingle ? MVT::f32 : MVT::f64;
  if (Op->isStrictFPOpcode()) {
    if (!Chain)
      Chain = Op.getOperand(i: 0);
    return DAG.getNode(Opcode: getPPCStrictOpcode(Opc: ConvOpc), DL: dl,
                       VTList: DAG.getVTList(VT1: ConvTy, VT2: MVT::Other), Ops: {Chain, Src}, Flags);
  } else
    return DAG.getNode(Opcode: ConvOpc, DL: dl, VT: ConvTy, Operand: Src);
}
8620
8621/// Custom lowers integer to floating point conversions to use
8622/// the direct move instructions available in ISA 2.07 to avoid the
8623/// need for load/store combinations.
8624SDValue PPCTargetLowering::LowerINT_TO_FPDirectMove(SDValue Op,
8625 SelectionDAG &DAG,
8626 const SDLoc &dl) const {
8627 assert((Op.getValueType() == MVT::f32 ||
8628 Op.getValueType() == MVT::f64) &&
8629 "Invalid floating point type as target of conversion");
8630 assert(Subtarget.hasFPCVT() &&
8631 "Int to FP conversions with direct moves require FPCVT");
8632 SDValue Src = Op.getOperand(i: Op->isStrictFPOpcode() ? 1 : 0);
8633 bool WordInt = Src.getSimpleValueType().SimpleTy == MVT::i32;
8634 bool Signed = Op.getOpcode() == ISD::SINT_TO_FP ||
8635 Op.getOpcode() == ISD::STRICT_SINT_TO_FP;
8636 unsigned MovOpc = (WordInt && !Signed) ? PPCISD::MTVSRZ : PPCISD::MTVSRA;
8637 SDValue Mov = DAG.getNode(Opcode: MovOpc, DL: dl, VT: MVT::f64, Operand: Src);
8638 return convertIntToFP(Op, Src: Mov, DAG, Subtarget);
8639}
8640
8641static SDValue widenVec(SelectionDAG &DAG, SDValue Vec, const SDLoc &dl) {
8642
8643 EVT VecVT = Vec.getValueType();
8644 assert(VecVT.isVector() && "Expected a vector type.");
8645 assert(VecVT.getSizeInBits() < 128 && "Vector is already full width.");
8646
8647 EVT EltVT = VecVT.getVectorElementType();
8648 unsigned WideNumElts = 128 / EltVT.getSizeInBits();
8649 EVT WideVT = EVT::getVectorVT(Context&: *DAG.getContext(), VT: EltVT, NumElements: WideNumElts);
8650
8651 unsigned NumConcat = WideNumElts / VecVT.getVectorNumElements();
8652 SmallVector<SDValue, 16> Ops(NumConcat);
8653 Ops[0] = Vec;
8654 SDValue UndefVec = DAG.getUNDEF(VT: VecVT);
8655 for (unsigned i = 1; i < NumConcat; ++i)
8656 Ops[i] = UndefVec;
8657
8658 return DAG.getNode(Opcode: ISD::CONCAT_VECTORS, DL: dl, VT: WideVT, Ops);
8659}
8660
// Lower [STRICT_]{S,U}INT_TO_FP from a narrow integer vector to v2f64/v4f32:
// widen the source to 128 bits, shuffle the significant elements into the
// lane positions the conversion expects for this endianness, extend to the
// matching integer type (sign-extend-in-reg for signed, zero-fill via the
// shuffle for unsigned), then re-emit the conversion on the extended value.
SDValue PPCTargetLowering::LowerINT_TO_FPVector(SDValue Op, SelectionDAG &DAG,
                                                const SDLoc &dl) const {
  bool IsStrict = Op->isStrictFPOpcode();
  unsigned Opc = Op.getOpcode();
  // Strict opcodes carry the chain as operand 0; the value is operand 1.
  SDValue Src = Op.getOperand(i: IsStrict ? 1 : 0);
  assert((Opc == ISD::UINT_TO_FP || Opc == ISD::SINT_TO_FP ||
          Opc == ISD::STRICT_UINT_TO_FP || Opc == ISD::STRICT_SINT_TO_FP) &&
         "Unexpected conversion type");
  assert((Op.getValueType() == MVT::v2f64 || Op.getValueType() == MVT::v4f32) &&
         "Supports conversions to v2f64/v4f32 only.");

  // TODO: Any other flags to propagate?
  SDNodeFlags Flags;
  Flags.setNoFPExcept(Op->getFlags().hasNoFPExcept());

  bool SignedConv = Opc == ISD::SINT_TO_FP || Opc == ISD::STRICT_SINT_TO_FP;
  bool FourEltRes = Op.getValueType() == MVT::v4f32;

  SDValue Wide = widenVec(DAG, Vec: Src, dl);
  EVT WideVT = Wide.getValueType();
  unsigned WideNumElts = WideVT.getVectorNumElements();
  // Integer type whose element count matches the FP result type.
  MVT IntermediateVT = FourEltRes ? MVT::v4i32 : MVT::v2i64;

  // Start with a mask that selects only elements of the second shuffle
  // operand (undef for signed, zero for unsigned), then patch in the source
  // elements below.
  SmallVector<int, 16> ShuffV;
  for (unsigned i = 0; i < WideNumElts; ++i)
    ShuffV.push_back(Elt: i + WideNumElts);

  // Place each of the 4 (or 2) saved source elements once per Stride lanes:
  // at the low lane of each group for little-endian, the high lane for
  // big-endian, so the narrow value lands where the wider integer expects it.
  int Stride = FourEltRes ? WideNumElts / 4 : WideNumElts / 2;
  int SaveElts = FourEltRes ? 4 : 2;
  if (Subtarget.isLittleEndian())
    for (int i = 0; i < SaveElts; i++)
      ShuffV[i * Stride] = i;
  else
    for (int i = 1; i <= SaveElts; i++)
      ShuffV[i * Stride - 1] = i - 1;

  // For unsigned conversions the zero vector supplies the high bits, making
  // the shuffle itself perform the zero extension.
  SDValue ShuffleSrc2 =
      SignedConv ? DAG.getUNDEF(VT: WideVT) : DAG.getConstant(Val: 0, DL: dl, VT: WideVT);
  SDValue Arrange = DAG.getVectorShuffle(VT: WideVT, dl, N1: Wide, N2: ShuffleSrc2, Mask: ShuffV);

  SDValue Extend;
  if (SignedConv) {
    Arrange = DAG.getBitcast(VT: IntermediateVT, V: Arrange);
    EVT ExtVT = Src.getValueType();
    // With P9 Altivec the in-register extend can come from the widened
    // element type rather than the original narrow source type.
    if (Subtarget.hasP9Altivec())
      ExtVT = EVT::getVectorVT(Context&: *DAG.getContext(), VT: WideVT.getVectorElementType(),
                               NumElements: IntermediateVT.getVectorNumElements());

    Extend = DAG.getNode(Opcode: ISD::SIGN_EXTEND_INREG, DL: dl, VT: IntermediateVT, N1: Arrange,
                         N2: DAG.getValueType(ExtVT));
  } else
    Extend = DAG.getNode(Opcode: ISD::BITCAST, DL: dl, VT: IntermediateVT, Operand: Arrange);

  // Re-emit the (now legal) conversion on the extended vector, preserving
  // the chain for strict variants.
  if (IsStrict)
    return DAG.getNode(Opcode: Opc, DL: dl, VTList: DAG.getVTList(VT1: Op.getValueType(), VT2: MVT::Other),
                       Ops: {Op.getOperand(i: 0), Extend}, Flags);

  return DAG.getNode(Opcode: Opc, DL: dl, VT: Op.getValueType(), Operand: Extend);
}
8720
// Custom-lower scalar [STRICT_]{S,U}INT_TO_FP to f32/f64. Strategy, in
// order: vector sources go to LowerINT_TO_FPVector; f128/ppc_fp128 results
// are handled elsewhere; i1 becomes a select of FP constants; with direct
// moves (ISA 2.07 + FPCVT) use a GPR->VSR move; otherwise go through memory
// (reusing an existing load address when possible) and convert with
// FCFID-family nodes.
SDValue PPCTargetLowering::LowerINT_TO_FP(SDValue Op,
                                          SelectionDAG &DAG) const {
  SDLoc dl(Op);
  bool IsSigned = Op.getOpcode() == ISD::SINT_TO_FP ||
                  Op.getOpcode() == ISD::STRICT_SINT_TO_FP;
  bool IsStrict = Op->isStrictFPOpcode();
  // Strict opcodes carry the chain as operand 0; the integer is operand 1.
  SDValue Src = Op.getOperand(i: IsStrict ? 1 : 0);
  SDValue Chain = IsStrict ? Op.getOperand(i: 0) : DAG.getEntryNode();

  // TODO: Any other flags to propagate?
  SDNodeFlags Flags;
  Flags.setNoFPExcept(Op->getFlags().hasNoFPExcept());

  EVT InVT = Src.getValueType();
  EVT OutVT = Op.getValueType();
  // Vector conversions that were marked Custom for this input type.
  if (OutVT.isVector() && OutVT.isFloatingPoint() &&
      isOperationCustom(Op: Op.getOpcode(), VT: InVT))
    return LowerINT_TO_FPVector(Op, DAG, dl);

  // Conversions to f128 are legal.
  if (Op.getValueType() == MVT::f128)
    return Subtarget.hasP9Vector() ? Op : SDValue();

  // Don't handle ppc_fp128 here; let it be lowered to a libcall.
  if (Op.getValueType() != MVT::f32 && Op.getValueType() != MVT::f64)
    return SDValue();

  // i1 sources: select between 1.0 and 0.0 directly; no conversion needed.
  if (Src.getValueType() == MVT::i1) {
    SDValue Sel = DAG.getNode(Opcode: ISD::SELECT, DL: dl, VT: Op.getValueType(), N1: Src,
                              N2: DAG.getConstantFP(Val: 1.0, DL: dl, VT: Op.getValueType()),
                              N3: DAG.getConstantFP(Val: 0.0, DL: dl, VT: Op.getValueType()));
    if (IsStrict)
      return DAG.getMergeValues(Ops: {Sel, Chain}, dl);
    else
      return Sel;
  }

  // If we have direct moves, we can do all the conversion, skip the store/load
  // however, without FPCVT we can't do most conversions.
  if (Subtarget.hasDirectMove() && directMoveIsProfitable(Op) &&
      Subtarget.isPPC64() && Subtarget.hasFPCVT())
    return LowerINT_TO_FPDirectMove(Op, DAG, dl);

  assert((IsSigned || Subtarget.hasFPCVT()) &&
         "UINT_TO_FP is supported only with FPCVT");

  if (Src.getValueType() == MVT::i64) {
    SDValue SINT = Src;
    // When converting to single-precision, we actually need to convert
    // to double-precision first and then round to single-precision.
    // To avoid double-rounding effects during that operation, we have
    // to prepare the input operand. Bits that might be truncated when
    // converting to double-precision are replaced by a bit that won't
    // be lost at this stage, but is below the single-precision rounding
    // position.
    //
    // However, if afn is in effect, accept double
    // rounding to avoid the extra overhead.
    // FIXME: Currently INT_TO_FP can't support fast math flags because
    // of nneg flag, thus Op->getFlags().hasApproximateFuncs() is always
    // false.
    if (Op.getValueType() == MVT::f32 && !Subtarget.hasFPCVT() &&
        !Op->getFlags().hasApproximateFuncs()) {

      // Twiddle input to make sure the low 11 bits are zero. (If this
      // is the case, we are guaranteed the value will fit into the 53 bit
      // mantissa of an IEEE double-precision value without rounding.)
      // If any of those low 11 bits were not zero originally, make sure
      // bit 12 (value 2048) is set instead, so that the final rounding
      // to single-precision gets the correct result.
      SDValue Round = DAG.getNode(Opcode: ISD::AND, DL: dl, VT: MVT::i64,
                                  N1: SINT, N2: DAG.getConstant(Val: 2047, DL: dl, VT: MVT::i64));
      Round = DAG.getNode(Opcode: ISD::ADD, DL: dl, VT: MVT::i64,
                          N1: Round, N2: DAG.getConstant(Val: 2047, DL: dl, VT: MVT::i64));
      Round = DAG.getNode(Opcode: ISD::OR, DL: dl, VT: MVT::i64, N1: Round, N2: SINT);
      Round = DAG.getNode(Opcode: ISD::AND, DL: dl, VT: MVT::i64, N1: Round,
                          N2: DAG.getSignedConstant(Val: -2048, DL: dl, VT: MVT::i64));

      // However, we cannot use that value unconditionally: if the magnitude
      // of the input value is small, the bit-twiddling we did above might
      // end up visibly changing the output. Fortunately, in that case, we
      // don't need to twiddle bits since the original input will convert
      // exactly to double-precision floating-point already. Therefore,
      // construct a conditional to use the original value if the top 11
      // bits are all sign-bit copies, and use the rounded value computed
      // above otherwise.
      SDValue Cond = DAG.getNode(Opcode: ISD::SRA, DL: dl, VT: MVT::i64,
                                 N1: SINT, N2: DAG.getConstant(Val: 53, DL: dl, VT: MVT::i32));
      Cond = DAG.getNode(Opcode: ISD::ADD, DL: dl, VT: MVT::i64,
                         N1: Cond, N2: DAG.getConstant(Val: 1, DL: dl, VT: MVT::i64));
      Cond = DAG.getSetCC(
          DL: dl,
          VT: getSetCCResultType(DL: DAG.getDataLayout(), C&: *DAG.getContext(), VT: MVT::i64),
          LHS: Cond, RHS: DAG.getConstant(Val: 1, DL: dl, VT: MVT::i64), Cond: ISD::SETUGT);

      SINT = DAG.getNode(Opcode: ISD::SELECT, DL: dl, VT: MVT::i64, N1: Cond, N2: Round, N3: SINT);
    }

    ReuseLoadInfo RLI;
    SDValue Bits;

    MachineFunction &MF = DAG.getMachineFunction();
    // Prefer re-reading an existing memory location over a GPR store/reload:
    // a full i64 load, then LFIWAX/LFIWZX over extending i32 loads.
    if (canReuseLoadAddress(Op: SINT, MemVT: MVT::i64, RLI, DAG)) {
      Bits = DAG.getLoad(VT: MVT::f64, dl, Chain: RLI.Chain, Ptr: RLI.Ptr, PtrInfo: RLI.MPI,
                         Alignment: RLI.Alignment, MMOFlags: RLI.MMOFlags(), AAInfo: RLI.AAInfo, Ranges: RLI.Ranges);
      if (RLI.ResChain)
        DAG.makeEquivalentMemoryOrdering(OldChain: RLI.ResChain, NewMemOpChain: Bits.getValue(R: 1));
    } else if (Subtarget.hasLFIWAX() &&
               canReuseLoadAddress(Op: SINT, MemVT: MVT::i32, RLI, DAG, ET: ISD::SEXTLOAD)) {
      MachineMemOperand *MMO =
          MF.getMachineMemOperand(PtrInfo: RLI.MPI, F: MachineMemOperand::MOLoad, Size: 4,
                                  BaseAlignment: RLI.Alignment, AAInfo: RLI.AAInfo, Ranges: RLI.Ranges);
      SDValue Ops[] = { RLI.Chain, RLI.Ptr };
      Bits = DAG.getMemIntrinsicNode(Opcode: PPCISD::LFIWAX, dl,
                                     VTList: DAG.getVTList(VT1: MVT::f64, VT2: MVT::Other),
                                     Ops, MemVT: MVT::i32, MMO);
      if (RLI.ResChain)
        DAG.makeEquivalentMemoryOrdering(OldChain: RLI.ResChain, NewMemOpChain: Bits.getValue(R: 1));
    } else if (Subtarget.hasFPCVT() &&
               canReuseLoadAddress(Op: SINT, MemVT: MVT::i32, RLI, DAG, ET: ISD::ZEXTLOAD)) {
      MachineMemOperand *MMO =
          MF.getMachineMemOperand(PtrInfo: RLI.MPI, F: MachineMemOperand::MOLoad, Size: 4,
                                  BaseAlignment: RLI.Alignment, AAInfo: RLI.AAInfo, Ranges: RLI.Ranges);
      SDValue Ops[] = { RLI.Chain, RLI.Ptr };
      Bits = DAG.getMemIntrinsicNode(Opcode: PPCISD::LFIWZX, dl,
                                     VTList: DAG.getVTList(VT1: MVT::f64, VT2: MVT::Other),
                                     Ops, MemVT: MVT::i32, MMO);
      if (RLI.ResChain)
        DAG.makeEquivalentMemoryOrdering(OldChain: RLI.ResChain, NewMemOpChain: Bits.getValue(R: 1));
    } else if (((Subtarget.hasLFIWAX() &&
                 SINT.getOpcode() == ISD::SIGN_EXTEND) ||
                (Subtarget.hasFPCVT() &&
                 SINT.getOpcode() == ISD::ZERO_EXTEND)) &&
               SINT.getOperand(i: 0).getValueType() == MVT::i32) {
      // The i64 value is an extended i32: spill just the 32-bit part to a
      // stack slot and reload it with the matching extending FP load.
      MachineFrameInfo &MFI = MF.getFrameInfo();
      EVT PtrVT = getPointerTy(DL: DAG.getDataLayout());

      int FrameIdx = MFI.CreateStackObject(Size: 4, Alignment: Align(4), isSpillSlot: false);
      SDValue FIdx = DAG.getFrameIndex(FI: FrameIdx, VT: PtrVT);

      SDValue Store = DAG.getStore(Chain, dl, Val: SINT.getOperand(i: 0), Ptr: FIdx,
                                   PtrInfo: MachinePointerInfo::getFixedStack(
                                       MF&: DAG.getMachineFunction(), FI: FrameIdx));
      Chain = Store;

      assert(cast<StoreSDNode>(Store)->getMemoryVT() == MVT::i32 &&
             "Expected an i32 store");

      RLI.Ptr = FIdx;
      RLI.Chain = Chain;
      RLI.MPI =
          MachinePointerInfo::getFixedStack(MF&: DAG.getMachineFunction(), FI: FrameIdx);
      RLI.Alignment = Align(4);

      MachineMemOperand *MMO =
          MF.getMachineMemOperand(PtrInfo: RLI.MPI, F: MachineMemOperand::MOLoad, Size: 4,
                                  BaseAlignment: RLI.Alignment, AAInfo: RLI.AAInfo, Ranges: RLI.Ranges);
      SDValue Ops[] = { RLI.Chain, RLI.Ptr };
      Bits = DAG.getMemIntrinsicNode(Opcode: SINT.getOpcode() == ISD::ZERO_EXTEND ?
                                     PPCISD::LFIWZX : PPCISD::LFIWAX,
                                     dl, VTList: DAG.getVTList(VT1: MVT::f64, VT2: MVT::Other),
                                     Ops, MemVT: MVT::i32, MMO);
      Chain = Bits.getValue(R: 1);
    } else
      // Fallback: move the i64 bits into an FP register via a bitcast.
      Bits = DAG.getNode(Opcode: ISD::BITCAST, DL: dl, VT: MVT::f64, Operand: SINT);

    SDValue FP = convertIntToFP(Op, Src: Bits, DAG, Subtarget, Chain);
    if (IsStrict)
      Chain = FP.getValue(R: 1);

    // Without FCFIDS, round the f64 result down to f32 explicitly.
    if (Op.getValueType() == MVT::f32 && !Subtarget.hasFPCVT()) {
      if (IsStrict)
        FP = DAG.getNode(
            Opcode: ISD::STRICT_FP_ROUND, DL: dl, VTList: DAG.getVTList(VT1: MVT::f32, VT2: MVT::Other),
            Ops: {Chain, FP, DAG.getIntPtrConstant(Val: 0, DL: dl, /*isTarget=*/true)},
            Flags);
      else
        FP = DAG.getNode(Opcode: ISD::FP_ROUND, DL: dl, VT: MVT::f32, N1: FP,
                         N2: DAG.getIntPtrConstant(Val: 0, DL: dl, /*isTarget=*/true));
    }
    return FP;
  }

  assert(Src.getValueType() == MVT::i32 &&
         "Unhandled INT_TO_FP type in custom expander!");
  // Since we only generate this in 64-bit mode, we can take advantage of
  // 64-bit registers. In particular, sign extend the input value into the
  // 64-bit register with extsw, store the WHOLE 64-bit value into the stack
  // then lfd it and fcfid it.
  MachineFunction &MF = DAG.getMachineFunction();
  MachineFrameInfo &MFI = MF.getFrameInfo();
  EVT PtrVT = getPointerTy(DL: MF.getDataLayout());

  SDValue Ld;
  if (Subtarget.hasLFIWAX() || Subtarget.hasFPCVT()) {
    // With an extending FP load available, spill only 4 bytes (or reuse an
    // existing i32 load address) and load with LFIWAX/LFIWZX.
    ReuseLoadInfo RLI;
    bool ReusingLoad;
    if (!(ReusingLoad = canReuseLoadAddress(Op: Src, MemVT: MVT::i32, RLI, DAG))) {
      int FrameIdx = MFI.CreateStackObject(Size: 4, Alignment: Align(4), isSpillSlot: false);
      SDValue FIdx = DAG.getFrameIndex(FI: FrameIdx, VT: PtrVT);

      SDValue Store = DAG.getStore(Chain, dl, Val: Src, Ptr: FIdx,
                                   PtrInfo: MachinePointerInfo::getFixedStack(
                                       MF&: DAG.getMachineFunction(), FI: FrameIdx));
      Chain = Store;

      assert(cast<StoreSDNode>(Store)->getMemoryVT() == MVT::i32 &&
             "Expected an i32 store");

      RLI.Ptr = FIdx;
      RLI.Chain = Chain;
      RLI.MPI =
          MachinePointerInfo::getFixedStack(MF&: DAG.getMachineFunction(), FI: FrameIdx);
      RLI.Alignment = Align(4);
    }

    MachineMemOperand *MMO =
        MF.getMachineMemOperand(PtrInfo: RLI.MPI, F: MachineMemOperand::MOLoad, Size: 4,
                                BaseAlignment: RLI.Alignment, AAInfo: RLI.AAInfo, Ranges: RLI.Ranges);
    SDValue Ops[] = { RLI.Chain, RLI.Ptr };
    Ld = DAG.getMemIntrinsicNode(Opcode: IsSigned ? PPCISD::LFIWAX : PPCISD::LFIWZX, dl,
                                 VTList: DAG.getVTList(VT1: MVT::f64, VT2: MVT::Other), Ops,
                                 MemVT: MVT::i32, MMO);
    Chain = Ld.getValue(R: 1);
    if (ReusingLoad && RLI.ResChain) {
      DAG.makeEquivalentMemoryOrdering(OldChain: RLI.ResChain, NewMemOpChain: Ld.getValue(R: 1));
    }
  } else {
    assert(Subtarget.isPPC64() &&
           "i32->FP without LFIWAX supported only on PPC64");

    int FrameIdx = MFI.CreateStackObject(Size: 8, Alignment: Align(8), isSpillSlot: false);
    SDValue FIdx = DAG.getFrameIndex(FI: FrameIdx, VT: PtrVT);

    SDValue Ext64 = DAG.getNode(Opcode: ISD::SIGN_EXTEND, DL: dl, VT: MVT::i64, Operand: Src);

    // STD the extended value into the stack slot.
    SDValue Store = DAG.getStore(
        Chain, dl, Val: Ext64, Ptr: FIdx,
        PtrInfo: MachinePointerInfo::getFixedStack(MF&: DAG.getMachineFunction(), FI: FrameIdx));
    Chain = Store;

    // Load the value as a double.
    Ld = DAG.getLoad(
        VT: MVT::f64, dl, Chain, Ptr: FIdx,
        PtrInfo: MachinePointerInfo::getFixedStack(MF&: DAG.getMachineFunction(), FI: FrameIdx));
    Chain = Ld.getValue(R: 1);
  }

  // FCFID it and return it.
  SDValue FP = convertIntToFP(Op, Src: Ld, DAG, Subtarget, Chain);
  if (IsStrict)
    Chain = FP.getValue(R: 1);
  // Without FCFIDS, round the f64 result down to f32 explicitly.
  if (Op.getValueType() == MVT::f32 && !Subtarget.hasFPCVT()) {
    if (IsStrict)
      FP = DAG.getNode(
          Opcode: ISD::STRICT_FP_ROUND, DL: dl, VTList: DAG.getVTList(VT1: MVT::f32, VT2: MVT::Other),
          Ops: {Chain, FP, DAG.getIntPtrConstant(Val: 0, DL: dl, /*isTarget=*/true)}, Flags);
    else
      FP = DAG.getNode(Opcode: ISD::FP_ROUND, DL: dl, VT: MVT::f32, N1: FP,
                       N2: DAG.getIntPtrConstant(Val: 0, DL: dl, /*isTarget=*/true));
  }
  return FP;
}
8985
// Lower SET_ROUNDING: update the FPSCR rounding-mode field (RN, bits 30:31).
// The LLVM rounding mode (operand 1) is translated to the Power encoding
// with x ^ (~(x >> 1) & 1). Constant modes use MFFSCRNI (ISA 3.0) or a pair
// of MTFSB0/MTFSB1; variable modes build the new FPSCR value and commit it
// with MFFSCRN (ISA 3.0) or MTFSF.
SDValue PPCTargetLowering::LowerSET_ROUNDING(SDValue Op,
                                             SelectionDAG &DAG) const {
  SDLoc Dl(Op);
  MachineFunction &MF = DAG.getMachineFunction();
  EVT PtrVT = getPointerTy(DL: MF.getDataLayout());
  SDValue Chain = Op.getOperand(i: 0);

  // If requested mode is constant, just use simpler mtfsb/mffscrni
  if (auto *CVal = dyn_cast<ConstantSDNode>(Val: Op.getOperand(i: 1))) {
    uint64_t Mode = CVal->getZExtValue();
    assert(Mode < 4 && "Unsupported rounding mode!");
    // Translate the LLVM mode to the Power RN encoding.
    unsigned InternalRnd = Mode ^ (~(Mode >> 1) & 1);
    if (Subtarget.isISA3_0())
      return SDValue(
          DAG.getMachineNode(
              Opcode: PPC::MFFSCRNI, dl: Dl, ResultTys: {MVT::f64, MVT::Other},
              Ops: {DAG.getConstant(Val: InternalRnd, DL: Dl, VT: MVT::i32, isTarget: true), Chain}),
          1);
    // Pre-ISA 3.0: set/clear FPSCR bits 30 and 31 individually; the second
    // MTFSB is chained after the first to keep them ordered.
    SDNode *SetHi = DAG.getMachineNode(
        Opcode: (InternalRnd & 2) ? PPC::MTFSB1 : PPC::MTFSB0, dl: Dl, VT: MVT::Other,
        Ops: {DAG.getConstant(Val: 30, DL: Dl, VT: MVT::i32, isTarget: true), Chain});
    SDNode *SetLo = DAG.getMachineNode(
        Opcode: (InternalRnd & 1) ? PPC::MTFSB1 : PPC::MTFSB0, dl: Dl, VT: MVT::Other,
        Ops: {DAG.getConstant(Val: 31, DL: Dl, VT: MVT::i32, isTarget: true), SDValue(SetHi, 0)});
    return SDValue(SetLo, 0);
  }

  // Use x ^ (~(x >> 1) & 1) to transform LLVM rounding mode to Power format.
  SDValue One = DAG.getConstant(Val: 1, DL: Dl, VT: MVT::i32);
  SDValue SrcFlag = DAG.getNode(Opcode: ISD::AND, DL: Dl, VT: MVT::i32, N1: Op.getOperand(i: 1),
                                N2: DAG.getConstant(Val: 3, DL: Dl, VT: MVT::i32));
  SDValue DstFlag = DAG.getNode(
      Opcode: ISD::XOR, DL: Dl, VT: MVT::i32, N1: SrcFlag,
      N2: DAG.getNode(Opcode: ISD::AND, DL: Dl, VT: MVT::i32,
                  N1: DAG.getNOT(DL: Dl,
                             Val: DAG.getNode(Opcode: ISD::SRL, DL: Dl, VT: MVT::i32, N1: SrcFlag, N2: One),
                             VT: MVT::i32),
                  N2: One));
  // For Power9, there's faster mffscrn, and we don't need to read FPSCR
  SDValue MFFS;
  if (!Subtarget.isISA3_0()) {
    MFFS = DAG.getNode(Opcode: PPCISD::MFFS, DL: Dl, ResultTys: {MVT::f64, MVT::Other}, Ops: Chain);
    Chain = MFFS.getValue(R: 1);
  }
  SDValue NewFPSCR;
  if (Subtarget.isPPC64()) {
    if (Subtarget.isISA3_0()) {
      // MFFSCRN only reads the RN field, so the rest of NewFPSCR is don't-care.
      NewFPSCR = DAG.getAnyExtOrTrunc(Op: DstFlag, DL: Dl, VT: MVT::i64);
    } else {
      // Set the last two bits (rounding mode) of bitcasted FPSCR.
      SDNode *InsertRN = DAG.getMachineNode(
          Opcode: PPC::RLDIMI, dl: Dl, VT: MVT::i64,
          Ops: {DAG.getNode(Opcode: ISD::BITCAST, DL: Dl, VT: MVT::i64, Operand: MFFS),
               DAG.getNode(Opcode: ISD::ZERO_EXTEND, DL: Dl, VT: MVT::i64, Operand: DstFlag),
               DAG.getTargetConstant(Val: 0, DL: Dl, VT: MVT::i32),
               DAG.getTargetConstant(Val: 62, DL: Dl, VT: MVT::i32)});
      NewFPSCR = SDValue(InsertRN, 0);
    }
    NewFPSCR = DAG.getNode(Opcode: ISD::BITCAST, DL: Dl, VT: MVT::f64, Operand: NewFPSCR);
  } else {
    // In 32-bit mode, store f64, load and update the lower half.
    int SSFI = MF.getFrameInfo().CreateStackObject(Size: 8, Alignment: Align(8), isSpillSlot: false);
    SDValue StackSlot = DAG.getFrameIndex(FI: SSFI, VT: PtrVT);
    // Address of the 32-bit word holding the low (RN-containing) half of the
    // f64 image: offset 0 on little-endian, offset 4 on big-endian.
    SDValue Addr = Subtarget.isLittleEndian()
                       ? StackSlot
                       : DAG.getNode(Opcode: ISD::ADD, DL: Dl, VT: PtrVT, N1: StackSlot,
                                     N2: DAG.getConstant(Val: 4, DL: Dl, VT: PtrVT));
    if (Subtarget.isISA3_0()) {
      Chain = DAG.getStore(Chain, dl: Dl, Val: DstFlag, Ptr: Addr, PtrInfo: MachinePointerInfo());
    } else {
      // Spill the current FPSCR image, patch its low word's last two bits
      // with RLWIMI, and store the patched word back.
      Chain = DAG.getStore(Chain, dl: Dl, Val: MFFS, Ptr: StackSlot, PtrInfo: MachinePointerInfo());
      SDValue Tmp =
          DAG.getLoad(VT: MVT::i32, dl: Dl, Chain, Ptr: Addr, PtrInfo: MachinePointerInfo());
      Chain = Tmp.getValue(R: 1);
      Tmp = SDValue(DAG.getMachineNode(
                        Opcode: PPC::RLWIMI, dl: Dl, VT: MVT::i32,
                        Ops: {Tmp, DstFlag, DAG.getTargetConstant(Val: 0, DL: Dl, VT: MVT::i32),
                             DAG.getTargetConstant(Val: 30, DL: Dl, VT: MVT::i32),
                             DAG.getTargetConstant(Val: 31, DL: Dl, VT: MVT::i32)}),
                    0);
      Chain = DAG.getStore(Chain, dl: Dl, Val: Tmp, Ptr: Addr, PtrInfo: MachinePointerInfo());
    }
    NewFPSCR =
        DAG.getLoad(VT: MVT::f64, dl: Dl, Chain, Ptr: StackSlot, PtrInfo: MachinePointerInfo());
    Chain = NewFPSCR.getValue(R: 1);
  }
  // Commit the new value: MFFSCRN on ISA 3.0, otherwise MTFSF over all
  // eight FPSCR fields (mask 255).
  if (Subtarget.isISA3_0())
    return SDValue(DAG.getMachineNode(Opcode: PPC::MFFSCRN, dl: Dl, ResultTys: {MVT::f64, MVT::Other},
                                      Ops: {NewFPSCR, Chain}),
                   1);
  SDValue Zero = DAG.getConstant(Val: 0, DL: Dl, VT: MVT::i32, isTarget: true);
  SDNode *MTFSF = DAG.getMachineNode(
      Opcode: PPC::MTFSF, dl: Dl, VT: MVT::Other,
      Ops: {DAG.getConstant(Val: 255, DL: Dl, VT: MVT::i32, isTarget: true), NewFPSCR, Zero, Zero, Chain});
  return SDValue(MTFSF, 0);
}
9082
// Lower GET_ROUNDING: read the FPSCR with MFFS and translate the Power RN
// field (bits 30:31) to the LLVM GET_ROUNDING encoding.
SDValue PPCTargetLowering::LowerGET_ROUNDING(SDValue Op,
                                             SelectionDAG &DAG) const {
  SDLoc dl(Op);
  /*
   The rounding mode is in bits 30:31 of FPSR, and has the following
   settings:
     00 Round to nearest
     01 Round to 0
     10 Round to +inf
     11 Round to -inf

  GET_ROUNDING, on the other hand, expects the following:
    -1 Undefined
     0 Round to 0
     1 Round to nearest
     2 Round to +inf
     3 Round to -inf

  To perform the conversion, we do:
    ((FPSCR & 0x3) ^ ((~FPSCR & 0x3) >> 1))
  */

  MachineFunction &MF = DAG.getMachineFunction();
  EVT VT = Op.getValueType();
  EVT PtrVT = getPointerTy(DL: MF.getDataLayout());

  // Save FP Control Word to register
  SDValue Chain = Op.getOperand(i: 0);
  SDValue MFFS = DAG.getNode(Opcode: PPCISD::MFFS, DL: dl, ResultTys: {MVT::f64, MVT::Other}, Ops: Chain);
  Chain = MFFS.getValue(R: 1);

  SDValue CWD;
  if (isTypeLegal(VT: MVT::i64)) {
    // 64-bit integers are legal: the low 32 bits of the bitcast FPSCR image
    // contain the control word.
    CWD = DAG.getNode(Opcode: ISD::TRUNCATE, DL: dl, VT: MVT::i32,
                      Operand: DAG.getNode(Opcode: ISD::BITCAST, DL: dl, VT: MVT::i64, Operand: MFFS));
  } else {
    // Save FP register to stack slot
    int SSFI = MF.getFrameInfo().CreateStackObject(Size: 8, Alignment: Align(8), isSpillSlot: false);
    SDValue StackSlot = DAG.getFrameIndex(FI: SSFI, VT: PtrVT);
    Chain = DAG.getStore(Chain, dl, Val: MFFS, Ptr: StackSlot, PtrInfo: MachinePointerInfo());

    // Load FP Control Word from low 32 bits of stack slot.
    assert(hasBigEndianPartOrdering(MVT::i64, MF.getDataLayout()) &&
           "Stack slot adjustment is valid only on big endian subtargets!");
    SDValue Four = DAG.getConstant(Val: 4, DL: dl, VT: PtrVT);
    SDValue Addr = DAG.getNode(Opcode: ISD::ADD, DL: dl, VT: PtrVT, N1: StackSlot, N2: Four);
    CWD = DAG.getLoad(VT: MVT::i32, dl, Chain, Ptr: Addr, PtrInfo: MachinePointerInfo());
    Chain = CWD.getValue(R: 1);
  }

  // Transform as necessary: RetVal = (CWD & 3) ^ (((CWD ^ 3) & 3) >> 1),
  // which is the formula above with ~CWD & 3 written as (CWD ^ 3) & 3.
  SDValue CWD1 =
    DAG.getNode(Opcode: ISD::AND, DL: dl, VT: MVT::i32,
                N1: CWD, N2: DAG.getConstant(Val: 3, DL: dl, VT: MVT::i32));
  SDValue CWD2 =
    DAG.getNode(Opcode: ISD::SRL, DL: dl, VT: MVT::i32,
                N1: DAG.getNode(Opcode: ISD::AND, DL: dl, VT: MVT::i32,
                            N1: DAG.getNode(Opcode: ISD::XOR, DL: dl, VT: MVT::i32,
                                        N1: CWD, N2: DAG.getConstant(Val: 3, DL: dl, VT: MVT::i32)),
                            N2: DAG.getConstant(Val: 3, DL: dl, VT: MVT::i32)),
                N2: DAG.getConstant(Val: 1, DL: dl, VT: MVT::i32));

  SDValue RetVal =
    DAG.getNode(Opcode: ISD::XOR, DL: dl, VT: MVT::i32, N1: CWD1, N2: CWD2);

  // Adjust to the requested result width.
  RetVal =
      DAG.getNode(Opcode: (VT.getSizeInBits() < 16 ? ISD::TRUNCATE : ISD::ZERO_EXTEND),
                  DL: dl, VT, Operand: RetVal);

  return DAG.getMergeValues(Ops: {RetVal, Chain}, dl);
}
9154
9155SDValue PPCTargetLowering::LowerSHL_PARTS(SDValue Op, SelectionDAG &DAG) const {
9156 EVT VT = Op.getValueType();
9157 uint64_t BitWidth = VT.getSizeInBits();
9158 SDLoc dl(Op);
9159 assert(Op.getNumOperands() == 3 &&
9160 VT == Op.getOperand(1).getValueType() &&
9161 "Unexpected SHL!");
9162
9163 // Expand into a bunch of logical ops. Note that these ops
9164 // depend on the PPC behavior for oversized shift amounts.
9165 SDValue Lo = Op.getOperand(i: 0);
9166 SDValue Hi = Op.getOperand(i: 1);
9167 SDValue Amt = Op.getOperand(i: 2);
9168 EVT AmtVT = Amt.getValueType();
9169
9170 SDValue Tmp1 = DAG.getNode(Opcode: ISD::SUB, DL: dl, VT: AmtVT,
9171 N1: DAG.getConstant(Val: BitWidth, DL: dl, VT: AmtVT), N2: Amt);
9172 SDValue Tmp2 = DAG.getNode(Opcode: PPCISD::SHL, DL: dl, VT, N1: Hi, N2: Amt);
9173 SDValue Tmp3 = DAG.getNode(Opcode: PPCISD::SRL, DL: dl, VT, N1: Lo, N2: Tmp1);
9174 SDValue Tmp4 = DAG.getNode(Opcode: ISD::OR , DL: dl, VT, N1: Tmp2, N2: Tmp3);
9175 SDValue Tmp5 = DAG.getNode(Opcode: ISD::ADD, DL: dl, VT: AmtVT, N1: Amt,
9176 N2: DAG.getSignedConstant(Val: -BitWidth, DL: dl, VT: AmtVT));
9177 SDValue Tmp6 = DAG.getNode(Opcode: PPCISD::SHL, DL: dl, VT, N1: Lo, N2: Tmp5);
9178 SDValue OutHi = DAG.getNode(Opcode: ISD::OR, DL: dl, VT, N1: Tmp4, N2: Tmp6);
9179 SDValue OutLo = DAG.getNode(Opcode: PPCISD::SHL, DL: dl, VT, N1: Lo, N2: Amt);
9180 SDValue OutOps[] = { OutLo, OutHi };
9181 return DAG.getMergeValues(Ops: OutOps, dl);
9182}
9183
9184SDValue PPCTargetLowering::LowerSRL_PARTS(SDValue Op, SelectionDAG &DAG) const {
9185 EVT VT = Op.getValueType();
9186 SDLoc dl(Op);
9187 uint64_t BitWidth = VT.getSizeInBits();
9188 assert(Op.getNumOperands() == 3 &&
9189 VT == Op.getOperand(1).getValueType() &&
9190 "Unexpected SRL!");
9191
9192 // Expand into a bunch of logical ops. Note that these ops
9193 // depend on the PPC behavior for oversized shift amounts.
9194 SDValue Lo = Op.getOperand(i: 0);
9195 SDValue Hi = Op.getOperand(i: 1);
9196 SDValue Amt = Op.getOperand(i: 2);
9197 EVT AmtVT = Amt.getValueType();
9198
9199 SDValue Tmp1 = DAG.getNode(Opcode: ISD::SUB, DL: dl, VT: AmtVT,
9200 N1: DAG.getConstant(Val: BitWidth, DL: dl, VT: AmtVT), N2: Amt);
9201 SDValue Tmp2 = DAG.getNode(Opcode: PPCISD::SRL, DL: dl, VT, N1: Lo, N2: Amt);
9202 SDValue Tmp3 = DAG.getNode(Opcode: PPCISD::SHL, DL: dl, VT, N1: Hi, N2: Tmp1);
9203 SDValue Tmp4 = DAG.getNode(Opcode: ISD::OR, DL: dl, VT, N1: Tmp2, N2: Tmp3);
9204 SDValue Tmp5 = DAG.getNode(Opcode: ISD::ADD, DL: dl, VT: AmtVT, N1: Amt,
9205 N2: DAG.getSignedConstant(Val: -BitWidth, DL: dl, VT: AmtVT));
9206 SDValue Tmp6 = DAG.getNode(Opcode: PPCISD::SRL, DL: dl, VT, N1: Hi, N2: Tmp5);
9207 SDValue OutLo = DAG.getNode(Opcode: ISD::OR, DL: dl, VT, N1: Tmp4, N2: Tmp6);
9208 SDValue OutHi = DAG.getNode(Opcode: PPCISD::SRL, DL: dl, VT, N1: Hi, N2: Amt);
9209 SDValue OutOps[] = { OutLo, OutHi };
9210 return DAG.getMergeValues(Ops: OutOps, dl);
9211}
9212
9213SDValue PPCTargetLowering::LowerSRA_PARTS(SDValue Op, SelectionDAG &DAG) const {
9214 SDLoc dl(Op);
9215 EVT VT = Op.getValueType();
9216 uint64_t BitWidth = VT.getSizeInBits();
9217 assert(Op.getNumOperands() == 3 &&
9218 VT == Op.getOperand(1).getValueType() &&
9219 "Unexpected SRA!");
9220
9221 // Expand into a bunch of logical ops, followed by a select_cc.
9222 SDValue Lo = Op.getOperand(i: 0);
9223 SDValue Hi = Op.getOperand(i: 1);
9224 SDValue Amt = Op.getOperand(i: 2);
9225 EVT AmtVT = Amt.getValueType();
9226
9227 SDValue Tmp1 = DAG.getNode(Opcode: ISD::SUB, DL: dl, VT: AmtVT,
9228 N1: DAG.getConstant(Val: BitWidth, DL: dl, VT: AmtVT), N2: Amt);
9229 SDValue Tmp2 = DAG.getNode(Opcode: PPCISD::SRL, DL: dl, VT, N1: Lo, N2: Amt);
9230 SDValue Tmp3 = DAG.getNode(Opcode: PPCISD::SHL, DL: dl, VT, N1: Hi, N2: Tmp1);
9231 SDValue Tmp4 = DAG.getNode(Opcode: ISD::OR, DL: dl, VT, N1: Tmp2, N2: Tmp3);
9232 SDValue Tmp5 = DAG.getNode(Opcode: ISD::ADD, DL: dl, VT: AmtVT, N1: Amt,
9233 N2: DAG.getSignedConstant(Val: -BitWidth, DL: dl, VT: AmtVT));
9234 SDValue Tmp6 = DAG.getNode(Opcode: PPCISD::SRA, DL: dl, VT, N1: Hi, N2: Tmp5);
9235 SDValue OutHi = DAG.getNode(Opcode: PPCISD::SRA, DL: dl, VT, N1: Hi, N2: Amt);
9236 SDValue OutLo = DAG.getSelectCC(DL: dl, LHS: Tmp5, RHS: DAG.getConstant(Val: 0, DL: dl, VT: AmtVT),
9237 True: Tmp4, False: Tmp6, Cond: ISD::SETLE);
9238 SDValue OutOps[] = { OutLo, OutHi };
9239 return DAG.getMergeValues(Ops: OutOps, dl);
9240}
9241
9242SDValue PPCTargetLowering::LowerFunnelShift(SDValue Op,
9243 SelectionDAG &DAG) const {
9244 SDLoc dl(Op);
9245 EVT VT = Op.getValueType();
9246 unsigned BitWidth = VT.getSizeInBits();
9247
9248 bool IsFSHL = Op.getOpcode() == ISD::FSHL;
9249 SDValue X = Op.getOperand(i: 0);
9250 SDValue Y = Op.getOperand(i: 1);
9251 SDValue Z = Op.getOperand(i: 2);
9252 EVT AmtVT = Z.getValueType();
9253
9254 // fshl: (X << (Z % BW)) | (Y >> (BW - (Z % BW)))
9255 // fshr: (X << (BW - (Z % BW))) | (Y >> (Z % BW))
9256 // This is simpler than TargetLowering::expandFunnelShift because we can rely
9257 // on PowerPC shift by BW being well defined.
9258 Z = DAG.getNode(Opcode: ISD::AND, DL: dl, VT: AmtVT, N1: Z,
9259 N2: DAG.getConstant(Val: BitWidth - 1, DL: dl, VT: AmtVT));
9260 SDValue SubZ =
9261 DAG.getNode(Opcode: ISD::SUB, DL: dl, VT: AmtVT, N1: DAG.getConstant(Val: BitWidth, DL: dl, VT: AmtVT), N2: Z);
9262 X = DAG.getNode(Opcode: PPCISD::SHL, DL: dl, VT, N1: X, N2: IsFSHL ? Z : SubZ);
9263 Y = DAG.getNode(Opcode: PPCISD::SRL, DL: dl, VT, N1: Y, N2: IsFSHL ? SubZ : Z);
9264 return DAG.getNode(Opcode: ISD::OR, DL: dl, VT, N1: X, N2: Y);
9265}
9266
9267//===----------------------------------------------------------------------===//
9268// Vector related lowering.
9269//
9270
9271/// getCanonicalConstSplat - Build a canonical splat immediate of Val with an
9272/// element size of SplatSize. Cast the result to VT.
9273static SDValue getCanonicalConstSplat(uint64_t Val, unsigned SplatSize, EVT VT,
9274 SelectionDAG &DAG, const SDLoc &dl) {
9275 static const MVT VTys[] = { // canonical VT to use for each size.
9276 MVT::v16i8, MVT::v8i16, MVT::Other, MVT::v4i32
9277 };
9278
9279 EVT ReqVT = VT != MVT::Other ? VT : VTys[SplatSize-1];
9280
9281 // For a splat with all ones, turn it to vspltisb 0xFF to canonicalize.
9282 if (Val == ((1LLU << (SplatSize * 8)) - 1)) {
9283 SplatSize = 1;
9284 Val = 0xFF;
9285 }
9286
9287 EVT CanonicalVT = VTys[SplatSize-1];
9288
9289 // Build a canonical splat for this value.
9290 // Explicitly truncate APInt here, as this API is used with a mix of
9291 // signed and unsigned values.
9292 return DAG.getBitcast(
9293 VT: ReqVT,
9294 V: DAG.getConstant(Val: APInt(64, Val).trunc(width: SplatSize * 8), DL: dl, VT: CanonicalVT));
9295}
9296
9297/// BuildIntrinsicOp - Return a unary operator intrinsic node with the
9298/// specified intrinsic ID.
9299static SDValue BuildIntrinsicOp(unsigned IID, SDValue Op, SelectionDAG &DAG,
9300 const SDLoc &dl, EVT DestVT = MVT::Other) {
9301 if (DestVT == MVT::Other) DestVT = Op.getValueType();
9302 return DAG.getNode(Opcode: ISD::INTRINSIC_WO_CHAIN, DL: dl, VT: DestVT,
9303 N1: DAG.getConstant(Val: IID, DL: dl, VT: MVT::i32), N2: Op);
9304}
9305
9306/// BuildIntrinsicOp - Return a binary operator intrinsic node with the
9307/// specified intrinsic ID.
9308static SDValue BuildIntrinsicOp(unsigned IID, SDValue LHS, SDValue RHS,
9309 SelectionDAG &DAG, const SDLoc &dl,
9310 EVT DestVT = MVT::Other) {
9311 if (DestVT == MVT::Other) DestVT = LHS.getValueType();
9312 return DAG.getNode(Opcode: ISD::INTRINSIC_WO_CHAIN, DL: dl, VT: DestVT,
9313 N1: DAG.getConstant(Val: IID, DL: dl, VT: MVT::i32), N2: LHS, N3: RHS);
9314}
9315
9316/// BuildIntrinsicOp - Return a ternary operator intrinsic node with the
9317/// specified intrinsic ID.
9318static SDValue BuildIntrinsicOp(unsigned IID, SDValue Op0, SDValue Op1,
9319 SDValue Op2, SelectionDAG &DAG, const SDLoc &dl,
9320 EVT DestVT = MVT::Other) {
9321 if (DestVT == MVT::Other) DestVT = Op0.getValueType();
9322 return DAG.getNode(Opcode: ISD::INTRINSIC_WO_CHAIN, DL: dl, VT: DestVT,
9323 N1: DAG.getConstant(Val: IID, DL: dl, VT: MVT::i32), N2: Op0, N3: Op1, N4: Op2);
9324}
9325
9326/// BuildVSLDOI - Return a VECTOR_SHUFFLE that is a vsldoi of the specified
9327/// amount. The result has the specified value type.
9328static SDValue BuildVSLDOI(SDValue LHS, SDValue RHS, unsigned Amt, EVT VT,
9329 SelectionDAG &DAG, const SDLoc &dl) {
9330 // Force LHS/RHS to be the right type.
9331 LHS = DAG.getNode(Opcode: ISD::BITCAST, DL: dl, VT: MVT::v16i8, Operand: LHS);
9332 RHS = DAG.getNode(Opcode: ISD::BITCAST, DL: dl, VT: MVT::v16i8, Operand: RHS);
9333
9334 int Ops[16];
9335 for (unsigned i = 0; i != 16; ++i)
9336 Ops[i] = i + Amt;
9337 SDValue T = DAG.getVectorShuffle(VT: MVT::v16i8, dl, N1: LHS, N2: RHS, Mask: Ops);
9338 return DAG.getNode(Opcode: ISD::BITCAST, DL: dl, VT, Operand: T);
9339}
9340
9341/// Do we have an efficient pattern in a .td file for this node?
9342///
9343/// \param V - pointer to the BuildVectorSDNode being matched
9344/// \param HasDirectMove - does this subtarget have VSR <-> GPR direct moves?
9345///
9346/// There are some patterns where it is beneficial to keep a BUILD_VECTOR
9347/// node as a BUILD_VECTOR node rather than expanding it. The patterns where
9348/// the opposite is true (expansion is beneficial) are:
9349/// - The node builds a vector out of integers that are not 32 or 64-bits
9350/// - The node builds a vector out of constants
9351/// - The node is a "load-and-splat"
9352/// In all other cases, we will choose to keep the BUILD_VECTOR.
9353static bool haveEfficientBuildVectorPattern(BuildVectorSDNode *V,
9354 bool HasDirectMove,
9355 bool HasP8Vector) {
9356 EVT VecVT = V->getValueType(ResNo: 0);
9357 bool RightType = VecVT == MVT::v2f64 ||
9358 (HasP8Vector && VecVT == MVT::v4f32) ||
9359 (HasDirectMove && (VecVT == MVT::v2i64 || VecVT == MVT::v4i32));
9360 if (!RightType)
9361 return false;
9362
9363 bool IsSplat = true;
9364 bool IsLoad = false;
9365 SDValue Op0 = V->getOperand(Num: 0);
9366
9367 // This function is called in a block that confirms the node is not a constant
9368 // splat. So a constant BUILD_VECTOR here means the vector is built out of
9369 // different constants.
9370 if (V->isConstant())
9371 return false;
9372 for (int i = 0, e = V->getNumOperands(); i < e; ++i) {
9373 if (V->getOperand(Num: i).isUndef())
9374 return false;
9375 // We want to expand nodes that represent load-and-splat even if the
9376 // loaded value is a floating point truncation or conversion to int.
9377 if (V->getOperand(Num: i).getOpcode() == ISD::LOAD ||
9378 (V->getOperand(Num: i).getOpcode() == ISD::FP_ROUND &&
9379 V->getOperand(Num: i).getOperand(i: 0).getOpcode() == ISD::LOAD) ||
9380 (V->getOperand(Num: i).getOpcode() == ISD::FP_TO_SINT &&
9381 V->getOperand(Num: i).getOperand(i: 0).getOpcode() == ISD::LOAD) ||
9382 (V->getOperand(Num: i).getOpcode() == ISD::FP_TO_UINT &&
9383 V->getOperand(Num: i).getOperand(i: 0).getOpcode() == ISD::LOAD))
9384 IsLoad = true;
9385 // If the operands are different or the input is not a load and has more
9386 // uses than just this BV node, then it isn't a splat.
9387 if (V->getOperand(Num: i) != Op0 ||
9388 (!IsLoad && !V->isOnlyUserOf(N: V->getOperand(Num: i).getNode())))
9389 IsSplat = false;
9390 }
9391 return !(IsSplat && IsLoad);
9392}
9393
9394// Lower BITCAST(f128, (build_pair i64, i64)) to BUILD_FP128.
9395SDValue PPCTargetLowering::LowerBITCAST(SDValue Op, SelectionDAG &DAG) const {
9396
9397 SDLoc dl(Op);
9398 SDValue Op0 = Op->getOperand(Num: 0);
9399
9400 if (!Subtarget.isPPC64() || (Op0.getOpcode() != ISD::BUILD_PAIR) ||
9401 (Op.getValueType() != MVT::f128))
9402 return SDValue();
9403
9404 SDValue Lo = Op0.getOperand(i: 0);
9405 SDValue Hi = Op0.getOperand(i: 1);
9406 if ((Lo.getValueType() != MVT::i64) || (Hi.getValueType() != MVT::i64))
9407 return SDValue();
9408
9409 if (!Subtarget.isLittleEndian())
9410 std::swap(a&: Lo, b&: Hi);
9411
9412 return DAG.getNode(Opcode: PPCISD::BUILD_FP128, DL: dl, VT: MVT::f128, N1: Lo, N2: Hi);
9413}
9414
9415static const SDValue *getNormalLoadInput(const SDValue &Op, bool &IsPermuted) {
9416 const SDValue *InputLoad = &Op;
9417 while (InputLoad->getOpcode() == ISD::BITCAST)
9418 InputLoad = &InputLoad->getOperand(i: 0);
9419 if (InputLoad->getOpcode() == ISD::SCALAR_TO_VECTOR ||
9420 InputLoad->getOpcode() == PPCISD::SCALAR_TO_VECTOR_PERMUTED) {
9421 IsPermuted = InputLoad->getOpcode() == PPCISD::SCALAR_TO_VECTOR_PERMUTED;
9422 InputLoad = &InputLoad->getOperand(i: 0);
9423 }
9424 if (InputLoad->getOpcode() != ISD::LOAD)
9425 return nullptr;
9426 LoadSDNode *LD = cast<LoadSDNode>(Val: *InputLoad);
9427 return ISD::isNormalLoad(N: LD) ? InputLoad : nullptr;
9428}
9429
9430// Convert the argument APFloat to a single precision APFloat if there is no
9431// loss in information during the conversion to single precision APFloat and the
9432// resulting number is not a denormal number. Return true if successful.
9433bool llvm::convertToNonDenormSingle(APFloat &ArgAPFloat) {
9434 APFloat APFloatToConvert = ArgAPFloat;
9435 bool LosesInfo = true;
9436 APFloatToConvert.convert(ToSemantics: APFloat::IEEEsingle(), RM: APFloat::rmNearestTiesToEven,
9437 losesInfo: &LosesInfo);
9438 bool Success = (!LosesInfo && !APFloatToConvert.isDenormal());
9439 if (Success)
9440 ArgAPFloat = APFloatToConvert;
9441 return Success;
9442}
9443
9444// Bitcast the argument APInt to a double and convert it to a single precision
9445// APFloat, bitcast the APFloat to an APInt and assign it to the original
9446// argument if there is no loss in information during the conversion from
9447// double to single precision APFloat and the resulting number is not a denormal
9448// number. Return true if successful.
9449bool llvm::convertToNonDenormSingle(APInt &ArgAPInt) {
9450 double DpValue = ArgAPInt.bitsToDouble();
9451 APFloat APFloatDp(DpValue);
9452 bool Success = convertToNonDenormSingle(ArgAPFloat&: APFloatDp);
9453 if (Success)
9454 ArgAPInt = APFloatDp.bitcastToAPInt();
9455 return Success;
9456}
9457
9458// Nondestructive check for convertTonNonDenormSingle.
9459bool llvm::checkConvertToNonDenormSingle(APFloat &ArgAPFloat) {
9460 // Only convert if it loses info, since XXSPLTIDP should
9461 // handle the other case.
9462 APFloat APFloatToConvert = ArgAPFloat;
9463 bool LosesInfo = true;
9464 APFloatToConvert.convert(ToSemantics: APFloat::IEEEsingle(), RM: APFloat::rmNearestTiesToEven,
9465 losesInfo: &LosesInfo);
9466
9467 return (!LosesInfo && !APFloatToConvert.isDenormal());
9468}
9469
9470static bool isValidSplatLoad(const PPCSubtarget &Subtarget, const SDValue &Op,
9471 unsigned &Opcode) {
9472 LoadSDNode *InputNode = dyn_cast<LoadSDNode>(Val: Op.getOperand(i: 0));
9473 if (!InputNode || !Subtarget.hasVSX() || !ISD::isUNINDEXEDLoad(N: InputNode))
9474 return false;
9475
9476 EVT Ty = Op->getValueType(ResNo: 0);
9477 // For v2f64, v4f32 and v4i32 types, we require the load to be non-extending
9478 // as we cannot handle extending loads for these types.
9479 if ((Ty == MVT::v2f64 || Ty == MVT::v4f32 || Ty == MVT::v4i32) &&
9480 ISD::isNON_EXTLoad(N: InputNode))
9481 return true;
9482
9483 EVT MemVT = InputNode->getMemoryVT();
9484 // For v8i16 and v16i8 types, extending loads can be handled as long as the
9485 // memory VT is the same vector element VT type.
9486 // The loads feeding into the v8i16 and v16i8 types will be extending because
9487 // scalar i8/i16 are not legal types.
9488 if ((Ty == MVT::v8i16 || Ty == MVT::v16i8) && ISD::isEXTLoad(N: InputNode) &&
9489 (MemVT == Ty.getVectorElementType()))
9490 return true;
9491
9492 if (Ty == MVT::v2i64) {
9493 // Check the extend type, when the input type is i32, and the output vector
9494 // type is v2i64.
9495 if (MemVT == MVT::i32) {
9496 if (ISD::isZEXTLoad(N: InputNode))
9497 Opcode = PPCISD::ZEXT_LD_SPLAT;
9498 if (ISD::isSEXTLoad(N: InputNode))
9499 Opcode = PPCISD::SEXT_LD_SPLAT;
9500 }
9501 return true;
9502 }
9503 return false;
9504}
9505
// Check whether the constant BUILD_VECTOR BVN can be materialized with a
// single MTVSRBMI (Move To VSR Byte Mask Immediate) instruction: every byte
// of the 128-bit value must be either 0x00 or 0xFF. On success, BitMask has
// bit J set for each byte J whose value is 0xFF; on failure BitMask's
// contents are unspecified (it is cleared up front). Returns false if any
// element is not a ConstantSDNode or any byte is a non-mask value.
bool isValidMtVsrBmi(APInt &BitMask, BuildVectorSDNode &BVN,
                     bool IsLittleEndian) {
  assert(BVN.getNumOperands() > 0 && "Unexpected 0-size build vector");

  BitMask.clearAllBits();
  EVT VT = BVN.getValueType(ResNo: 0);
  unsigned VTSize = VT.getSizeInBits();
  // Accumulate the full vector constant as one wide APInt.
  APInt ConstValue(VTSize, 0);

  unsigned EltWidth = VT.getScalarSizeInBits();

  unsigned BitPos = 0;
  for (auto OpVal : BVN.op_values()) {
    auto *CN = dyn_cast<ConstantSDNode>(Val&: OpVal);

    // Only all-constant build vectors qualify.
    if (!CN)
      return false;
    // The elements in a vector register are ordered in reverse byte order
    // between little-endian and big-endian modes.
    ConstValue.insertBits(SubBits: CN->getAPIntValue().zextOrTrunc(width: EltWidth),
                          bitPosition: IsLittleEndian ? BitPos : VTSize - EltWidth - BitPos);
    BitPos += EltWidth;
  }

  // Scan the 16 bytes of the 128-bit constant: each must be all-zeros or
  // all-ones, and the all-ones bytes define the MTVSRBMI immediate mask.
  for (unsigned J = 0; J < 16; ++J) {
    APInt ExtractValue = ConstValue.extractBits(numBits: 8, bitPosition: J * 8);
    if (ExtractValue != 0x00 && ExtractValue != 0xFF)
      return false;
    if (ExtractValue == 0xFF)
      BitMask.setBit(J);
  }
  return true;
}
9539
// If this is a case we can't handle, return null and let the default
// expansion code take care of it. If we CAN select this case, and if it
// selects to a single instruction, return Op. Otherwise, if we can codegen
// this case more efficiently than a constant pool load, lower it to the
// sequence of ops that should be used.
SDValue PPCTargetLowering::LowerBUILD_VECTOR(SDValue Op,
                                             SelectionDAG &DAG) const {
  SDLoc dl(Op);
  BuildVectorSDNode *BVN = dyn_cast<BuildVectorSDNode>(Val: Op.getNode());
  assert(BVN && "Expected a BuildVectorSDNode in LowerBUILD_VECTOR");

  // On Power10, first try to emit a single MTVSRBMI for byte-mask constants,
  // then try the special-value BUILD_VECTOR patterns.
  if (Subtarget.hasP10Vector()) {
    APInt BitMask(32, 0);
    // If the value of the vector is all zeros or all ones,
    // we do not convert it to MTVSRBMI.
    // The xxleqv instruction sets a vector with all ones.
    // The xxlxor instruction sets a vector with all zeros.
    if (isValidMtVsrBmi(BitMask, BVN&: *BVN, IsLittleEndian: Subtarget.isLittleEndian()) &&
        BitMask != 0 && BitMask != 0xffff) {
      SDValue SDConstant = DAG.getTargetConstant(Val: BitMask, DL: dl, VT: MVT::i32);
      MachineSDNode *MSDNode =
          DAG.getMachineNode(Opcode: PPC::MTVSRBMI, dl, VT: MVT::v16i8, Op1: SDConstant);
      SDValue SDV = SDValue(MSDNode, 0);
      EVT DVT = BVN->getValueType(ResNo: 0);
      EVT SVT = SDV.getValueType();
      // MTVSRBMI produces v16i8; bitcast to the requested vector type if
      // they differ.
      if (SVT != DVT) {
        SDV = DAG.getNode(Opcode: ISD::BITCAST, DL: dl, VT: DVT, Operand: SDV);
      }
      return SDV;
    }
    // Recognize build vector patterns to emit VSX vector instructions
    // instead of loading value from memory.
    if (SDValue VecPat = combineBVLoadsSpecialValue(Operand: Op, DAG))
      return VecPat;
  }
  // Check if this is a splat of a constant value.
  APInt APSplatBits, APSplatUndef;
  unsigned SplatBitSize;
  bool HasAnyUndefs;
  bool BVNIsConstantSplat =
      BVN->isConstantSplat(SplatValue&: APSplatBits, SplatUndef&: APSplatUndef, SplatBitSize,
                           HasAnyUndefs, MinSplatBits: 0, isBigEndian: !Subtarget.isLittleEndian());

  // If it is a splat of a double, check if we can shrink it to a 32 bit
  // non-denormal float which when converted back to double gives us the same
  // double. This is to exploit the XXSPLTIDP instruction.
  // If we lose precision, we use XXSPLTI32DX.
  if (BVNIsConstantSplat && (SplatBitSize == 64) &&
      Subtarget.hasPrefixInstrs() && Subtarget.hasP10Vector()) {
    // Check the type first to short-circuit so we don't modify APSplatBits if
    // this block isn't executed.
    if ((Op->getValueType(ResNo: 0) == MVT::v2f64) &&
        convertToNonDenormSingle(ArgAPInt&: APSplatBits)) {
      SDValue SplatNode = DAG.getNode(
          Opcode: PPCISD::XXSPLTI_SP_TO_DP, DL: dl, VT: MVT::v2f64,
          Operand: DAG.getTargetConstant(Val: APSplatBits.getZExtValue(), DL: dl, VT: MVT::i32));
      return DAG.getBitcast(VT: Op.getValueType(), V: SplatNode);
    } else {
      // We may lose precision, so we have to use XXSPLTI32DX.

      uint32_t Hi = Hi_32(Value: APSplatBits.getZExtValue());
      uint32_t Lo = Lo_32(Value: APSplatBits.getZExtValue());
      SDValue SplatNode = DAG.getUNDEF(VT: MVT::v2i64);

      if (!Hi || !Lo)
        // If either load is 0, then we should generate XXLXOR to set to 0.
        SplatNode = DAG.getTargetConstant(Val: 0, DL: dl, VT: MVT::v2i64);

      // Insert the non-zero halves with XXSPLTI32DX (index 0 = high word,
      // index 1 = low word).
      if (Hi)
        SplatNode = DAG.getNode(
            Opcode: PPCISD::XXSPLTI32DX, DL: dl, VT: MVT::v2i64, N1: SplatNode,
            N2: DAG.getTargetConstant(Val: 0, DL: dl, VT: MVT::i32),
            N3: DAG.getTargetConstant(Val: Hi, DL: dl, VT: MVT::i32));

      if (Lo)
        SplatNode =
            DAG.getNode(Opcode: PPCISD::XXSPLTI32DX, DL: dl, VT: MVT::v2i64, N1: SplatNode,
                        N2: DAG.getTargetConstant(Val: 1, DL: dl, VT: MVT::i32),
                        N3: DAG.getTargetConstant(Val: Lo, DL: dl, VT: MVT::i32));

      return DAG.getBitcast(VT: Op.getValueType(), V: SplatNode);
    }
  }

  // IsSplat64: a 64-bit splat whose value fits the small immediate range of
  // the splat instructions available on this subtarget.
  bool IsSplat64 = false;
  uint64_t SplatBits = 0;
  int32_t SextVal = 0;
  if (BVNIsConstantSplat && SplatBitSize <= 64) {
    SplatBits = APSplatBits.getZExtValue();
    if (SplatBitSize <= 32) {
      SextVal = SignExtend32(X: SplatBits, B: SplatBitSize);
    } else if (SplatBitSize == 64 && Subtarget.hasP8Altivec()) {
      int64_t Splat64Val = static_cast<int64_t>(SplatBits);
      // P9 allows the byte-immediate range [-128,127]; pre-P9 only the
      // 5-bit vspltis* range [-16,15].
      bool P9Vector = Subtarget.hasP9Vector();
      int32_t Hi = P9Vector ? 127 : 15;
      int32_t Lo = P9Vector ? -128 : -16;
      IsSplat64 = Splat64Val >= Lo && Splat64Val <= Hi;
      SextVal = static_cast<int32_t>(SplatBits);
    }
  }

  // Non-constant-splat (or unhandled wide splat) path: try load-and-splat or
  // keep the BUILD_VECTOR when an efficient pattern exists.
  if (!BVNIsConstantSplat || (SplatBitSize > 32 && !IsSplat64)) {
    unsigned NewOpcode = PPCISD::LD_SPLAT;

    // Handle load-and-splat patterns as we have instructions that will do this
    // in one go.
    if (DAG.isSplatValue(V: Op, AllowUndefs: true) &&
        isValidSplatLoad(Subtarget, Op, Opcode&: NewOpcode)) {
      const SDValue *InputLoad = &Op.getOperand(i: 0);
      LoadSDNode *LD = cast<LoadSDNode>(Val: *InputLoad);

      // If the input load is an extending load, it will be an i32 -> i64
      // extending load and isValidSplatLoad() will update NewOpcode.
      unsigned MemorySize = LD->getMemoryVT().getScalarSizeInBits();
      unsigned ElementSize =
          MemorySize * ((NewOpcode == PPCISD::LD_SPLAT) ? 1 : 2);

      assert(((ElementSize == 2 * MemorySize)
                  ? (NewOpcode == PPCISD::ZEXT_LD_SPLAT ||
                     NewOpcode == PPCISD::SEXT_LD_SPLAT)
                  : (NewOpcode == PPCISD::LD_SPLAT)) &&
             "Unmatched element size and opcode!\n");

      // Checking for a single use of this load, we have to check for vector
      // width (128 bits) / ElementSize uses (since each operand of the
      // BUILD_VECTOR is a separate use of the value.
      unsigned NumUsesOfInputLD = 128 / ElementSize;
      for (SDValue BVInOp : Op->ops())
        if (BVInOp.isUndef())
          NumUsesOfInputLD--;

      // Exclude somes case where LD_SPLAT is worse than scalar_to_vector:
      // Below cases should also happen for "lfiwzx/lfiwax + LE target + index
      // 1" and "lxvrhx + BE target + index 7" and "lxvrbx + BE target + index
      // 15", but function IsValidSplatLoad() now will only return true when
      // the data at index 0 is not nullptr. So we will not get into trouble for
      // these cases.
      //
      // case 1 - lfiwzx/lfiwax
      // 1.1: load result is i32 and is sign/zero extend to i64;
      // 1.2: build a v2i64 vector type with above loaded value;
      // 1.3: the vector has only one value at index 0, others are all undef;
      // 1.4: on BE target, so that lfiwzx/lfiwax does not need any permute.
      if (NumUsesOfInputLD == 1 &&
          (Op->getValueType(ResNo: 0) == MVT::v2i64 && NewOpcode != PPCISD::LD_SPLAT &&
           !Subtarget.isLittleEndian() && Subtarget.hasVSX() &&
           Subtarget.hasLFIWAX()))
        return SDValue();

      // case 2 - lxvr[hb]x
      // 2.1: load result is at most i16;
      // 2.2: build a vector with above loaded value;
      // 2.3: the vector has only one value at index 0, others are all undef;
      // 2.4: on LE target, so that lxvr[hb]x does not need any permute.
      if (NumUsesOfInputLD == 1 && Subtarget.isLittleEndian() &&
          Subtarget.isISA3_1() && ElementSize <= 16)
        return SDValue();

      assert(NumUsesOfInputLD > 0 && "No uses of input LD of a build_vector?");
      if (InputLoad->getNode()->hasNUsesOfValue(NUses: NumUsesOfInputLD, Value: 0) &&
          Subtarget.hasVSX()) {
        SDValue Ops[] = {
          LD->getChain(),    // Chain
          LD->getBasePtr(),  // Ptr
          DAG.getValueType(Op.getValueType())  // VT
        };
        SDValue LdSplt = DAG.getMemIntrinsicNode(
            Opcode: NewOpcode, dl, VTList: DAG.getVTList(VT1: Op.getValueType(), VT2: MVT::Other), Ops,
            MemVT: LD->getMemoryVT(), MMO: LD->getMemOperand());
        // Replace all uses of the output chain of the original load with the
        // output chain of the new load.
        DAG.ReplaceAllUsesOfValueWith(From: InputLoad->getValue(R: 1),
                                      To: LdSplt.getValue(R: 1));
        return LdSplt;
      }
    }

    // In 64BIT mode BUILD_VECTOR nodes that are not constant splats of up to
    // 32-bits can be lowered to VSX instructions under certain conditions.
    // Without VSX, there is no pattern more efficient than expanding the node.
    if (Subtarget.hasVSX() && Subtarget.isPPC64() &&
        haveEfficientBuildVectorPattern(V: BVN, HasDirectMove: Subtarget.hasDirectMove(),
                                        HasP8Vector: Subtarget.hasP8Vector()))
      return Op;
    return SDValue();
  }

  uint64_t SplatUndef = APSplatUndef.getZExtValue();
  unsigned SplatSize = SplatBitSize / 8;

  // First, handle single instruction cases.

  // All zeros?
  if (SplatBits == 0) {
    // Canonicalize all zero vectors to be v4i32.
    if (Op.getValueType() != MVT::v4i32 || HasAnyUndefs) {
      SDValue Z = DAG.getConstant(Val: 0, DL: dl, VT: MVT::v4i32);
      Op = DAG.getNode(Opcode: ISD::BITCAST, DL: dl, VT: Op.getValueType(), Operand: Z);
    }
    return Op;
  }

  // We have XXSPLTIW for constant splats four bytes wide.
  // Given vector length is a multiple of 4, 2-byte splats can be replaced
  // with 4-byte splats. We replicate the SplatBits in case of 2-byte splat to
  // make a 4-byte splat element. For example: 2-byte splat of 0xABAB can be
  // turned into a 4-byte splat of 0xABABABAB.
  if (Subtarget.hasPrefixInstrs() && Subtarget.hasP10Vector() && SplatSize == 2)
    return getCanonicalConstSplat(Val: SplatBits | (SplatBits << 16), SplatSize: SplatSize * 2,
                                  VT: Op.getValueType(), DAG, dl);

  if (Subtarget.hasPrefixInstrs() && Subtarget.hasP10Vector() && SplatSize == 4)
    return getCanonicalConstSplat(Val: SplatBits, SplatSize, VT: Op.getValueType(), DAG,
                                  dl);

  // We have XXSPLTIB for constant splats one byte wide.
  if (Subtarget.hasP9Vector() && SplatSize == 1)
    return getCanonicalConstSplat(Val: SplatBits, SplatSize, VT: Op.getValueType(), DAG,
                                  dl);

  // If the sign extended value is in the range [-16,15], use VSPLTI[bhw].
  // Use VSPLTIW/VUPKLSW for v2i64 in range [-16,15].
  if (SextVal >= -16 && SextVal <= 15) {
    // SplatSize may be 1, 2, 4, or 8. Use size 4 instead of 8 for the splat to
    // generate a splat word with extend for size 8.
    unsigned UseSize = SplatSize == 8 ? 4 : SplatSize;
    SDValue Res =
        getCanonicalConstSplat(Val: SextVal, SplatSize: UseSize, VT: Op.getValueType(), DAG, dl);
    if (SplatSize != 8)
      return Res;
    // For the 64-bit case, sign-extend the splatted words to doublewords.
    return BuildIntrinsicOp(IID: Intrinsic::ppc_altivec_vupklsw, Op: Res, DAG, dl);
  }

  // Two instruction sequences.

  // On P9: splat the byte with XXSPLTIB, then widen with the appropriate
  // sign-extension intrinsic for the element size.
  if (Subtarget.hasP9Vector() && SextVal >= -128 && SextVal <= 127) {
    SDValue C = DAG.getConstant(Val: (unsigned char)SextVal, DL: dl, VT: MVT::i32);
    SmallVector<SDValue, 16> Ops(16, C);
    SDValue BV = DAG.getBuildVector(VT: MVT::v16i8, DL: dl, Ops);
    unsigned IID;
    EVT VT;
    switch (SplatSize) {
    default:
      llvm_unreachable("Unexpected type for vector constant.");
    case 2:
      IID = Intrinsic::ppc_altivec_vupklsb;
      VT = MVT::v8i16;
      break;
    case 4:
      IID = Intrinsic::ppc_altivec_vextsb2w;
      VT = MVT::v4i32;
      break;
    case 8:
      IID = Intrinsic::ppc_altivec_vextsb2d;
      VT = MVT::v2i64;
      break;
    }
    SDValue Extend = BuildIntrinsicOp(IID, Op: BV, DAG, dl, DestVT: VT);
    return DAG.getBitcast(VT: Op->getValueType(ResNo: 0), V: Extend);
  }
  // All qualifying 64-bit splats must have been handled by one of the
  // paths above.
  assert(!IsSplat64 && "Unhandled 64-bit splat pattern");

  // If this value is in the range [-32,30] and is even, use:
  //     VSPLTI[bhw](val/2) + VSPLTI[bhw](val/2)
  // If this value is in the range [17,31] and is odd, use:
  //     VSPLTI[bhw](val-16) - VSPLTI[bhw](-16)
  // If this value is in the range [-31,-17] and is odd, use:
  //     VSPLTI[bhw](val+16) + VSPLTI[bhw](-16)
  // Note the last two are three-instruction sequences.
  if (SextVal >= -32 && SextVal <= 31) {
    // To avoid having these optimizations undone by constant folding,
    // we convert to a pseudo that will be expanded later into one of
    // the above forms.
    SDValue Elt = DAG.getSignedConstant(Val: SextVal, DL: dl, VT: MVT::i32);
    EVT VT = (SplatSize == 1 ? MVT::v16i8 :
              (SplatSize == 2 ? MVT::v8i16 : MVT::v4i32));
    SDValue EltSize = DAG.getConstant(Val: SplatSize, DL: dl, VT: MVT::i32);
    SDValue RetVal = DAG.getNode(Opcode: PPCISD::VADD_SPLAT, DL: dl, VT, N1: Elt, N2: EltSize);
    if (VT == Op.getValueType())
      return RetVal;
    else
      return DAG.getNode(Opcode: ISD::BITCAST, DL: dl, VT: Op.getValueType(), Operand: RetVal);
  }

  // If this is 0x8000_0000 x 4, turn into vspltisw + vslw.  If it is
  // 0x7FFF_FFFF x 4, turn it into not(0x8000_0000).  This is important
  // for fneg/fabs.
  if (SplatSize == 4 && SplatBits == (0x7FFFFFFF&~SplatUndef)) {
    // Make -1 and vspltisw -1:
    SDValue OnesV = getCanonicalConstSplat(Val: -1, SplatSize: 4, VT: MVT::v4i32, DAG, dl);

    // Make the VSLW intrinsic, computing 0x8000_0000.
    SDValue Res = BuildIntrinsicOp(IID: Intrinsic::ppc_altivec_vslw, LHS: OnesV,
                                   RHS: OnesV, DAG, dl);

    // xor by OnesV to invert it.
    Res = DAG.getNode(Opcode: ISD::XOR, DL: dl, VT: MVT::v4i32, N1: Res, N2: OnesV);
    return DAG.getNode(Opcode: ISD::BITCAST, DL: dl, VT: Op.getValueType(), Operand: Res);
  }

  // Check to see if this is a wide variety of vsplti*, binop self cases.
  static const signed char SplatCsts[] = {
    -1, 1, -2, 2, -3, 3, -4, 4, -5, 5, -6, 6, -7, 7,
    -8, 8, -9, 9, -10, 10, -11, 11, -12, 12, -13, 13, 14, -14, 15, -15, -16
  };

  for (unsigned idx = 0; idx < std::size(SplatCsts); ++idx) {
    // Indirect through the SplatCsts array so that we favor 'vsplti -1' for
    // cases which are ambiguous (e.g. formation of 0x8000_0000).  'vsplti -1'
    int i = SplatCsts[idx];

    // Figure out what shift amount will be used by altivec if shifted by i in
    // this splat size.
    unsigned TypeShiftAmt = i & (SplatBitSize-1);

    // vsplti + shl self.
    if (SextVal == (int)((unsigned)i << TypeShiftAmt)) {
      SDValue Res = getCanonicalConstSplat(Val: i, SplatSize, VT: MVT::Other, DAG, dl);
      static const unsigned IIDs[] = { // Intrinsic to use for each size.
        Intrinsic::ppc_altivec_vslb, Intrinsic::ppc_altivec_vslh, 0,
        Intrinsic::ppc_altivec_vslw
      };
      Res = BuildIntrinsicOp(IID: IIDs[SplatSize-1], LHS: Res, RHS: Res, DAG, dl);
      return DAG.getNode(Opcode: ISD::BITCAST, DL: dl, VT: Op.getValueType(), Operand: Res);
    }

    // vsplti + srl self.
    if (SextVal == (int)((unsigned)i >> TypeShiftAmt)) {
      SDValue Res = getCanonicalConstSplat(Val: i, SplatSize, VT: MVT::Other, DAG, dl);
      static const unsigned IIDs[] = { // Intrinsic to use for each size.
        Intrinsic::ppc_altivec_vsrb, Intrinsic::ppc_altivec_vsrh, 0,
        Intrinsic::ppc_altivec_vsrw
      };
      Res = BuildIntrinsicOp(IID: IIDs[SplatSize-1], LHS: Res, RHS: Res, DAG, dl);
      return DAG.getNode(Opcode: ISD::BITCAST, DL: dl, VT: Op.getValueType(), Operand: Res);
    }

    // vsplti + rol self.
    if (SextVal == (int)(((unsigned)i << TypeShiftAmt) |
                         ((unsigned)i >> (SplatBitSize-TypeShiftAmt)))) {
      SDValue Res = getCanonicalConstSplat(Val: i, SplatSize, VT: MVT::Other, DAG, dl);
      static const unsigned IIDs[] = { // Intrinsic to use for each size.
        Intrinsic::ppc_altivec_vrlb, Intrinsic::ppc_altivec_vrlh, 0,
        Intrinsic::ppc_altivec_vrlw
      };
      Res = BuildIntrinsicOp(IID: IIDs[SplatSize-1], LHS: Res, RHS: Res, DAG, dl);
      return DAG.getNode(Opcode: ISD::BITCAST, DL: dl, VT: Op.getValueType(), Operand: Res);
    }

    // t = vsplti c, result = vsldoi t, t, 1
    if (SextVal == (int)(((unsigned)i << 8) | (i < 0 ? 0xFF : 0))) {
      SDValue T = getCanonicalConstSplat(Val: i, SplatSize, VT: MVT::v16i8, DAG, dl);
      unsigned Amt = Subtarget.isLittleEndian() ? 15 : 1;
      return BuildVSLDOI(LHS: T, RHS: T, Amt, VT: Op.getValueType(), DAG, dl);
    }
    // t = vsplti c, result = vsldoi t, t, 2
    if (SextVal == (int)(((unsigned)i << 16) | (i < 0 ? 0xFFFF : 0))) {
      SDValue T = getCanonicalConstSplat(Val: i, SplatSize, VT: MVT::v16i8, DAG, dl);
      unsigned Amt = Subtarget.isLittleEndian() ? 14 : 2;
      return BuildVSLDOI(LHS: T, RHS: T, Amt, VT: Op.getValueType(), DAG, dl);
    }
    // t = vsplti c, result = vsldoi t, t, 3
    if (SextVal == (int)(((unsigned)i << 24) | (i < 0 ? 0xFFFFFF : 0))) {
      SDValue T = getCanonicalConstSplat(Val: i, SplatSize, VT: MVT::v16i8, DAG, dl);
      unsigned Amt = Subtarget.isLittleEndian() ? 13 : 3;
      return BuildVSLDOI(LHS: T, RHS: T, Amt, VT: Op.getValueType(), DAG, dl);
    }
  }

  // No efficient lowering found; fall back to default expansion.
  return SDValue();
}
9911
/// GeneratePerfectShuffle - Given an entry in the perfect-shuffle table, emit
/// the specified operations to build the shuffle.
static SDValue GeneratePerfectShuffle(unsigned PFEntry, SDValue LHS,
                                      SDValue RHS, SelectionDAG &DAG,
                                      const SDLoc &dl) {
  // Decode the table entry: bits [29:26] are the operation, bits [25:13] and
  // [12:0] are the table indices of the two operand sub-shuffles.
  unsigned OpNum = (PFEntry >> 26) & 0x0F;
  unsigned LHSID = (PFEntry >> 13) & ((1 << 13)-1);
  unsigned RHSID = (PFEntry >> 0) & ((1 << 13)-1);

  // Operation encodings used by the perfect-shuffle table.
  enum {
    OP_COPY = 0,  // Copy, used for things like <u,u,u,3> to say it is <0,1,2,3>
    OP_VMRGHW,
    OP_VMRGLW,
    OP_VSPLTISW0,
    OP_VSPLTISW1,
    OP_VSPLTISW2,
    OP_VSPLTISW3,
    OP_VSLDOI4,
    OP_VSLDOI8,
    OP_VSLDOI12
  };

  // Base case of the recursion: a plain copy of one input. The two magic
  // IDs encode the identity shuffles <0,1,2,3> and <4,5,6,7> in base 9.
  if (OpNum == OP_COPY) {
    if (LHSID == (1*9+2)*9+3) return LHS;
    assert(LHSID == ((4*9+5)*9+6)*9+7 && "Illegal OP_COPY!");
    return RHS;
  }

  // Recursively materialize the two operands from their table entries.
  SDValue OpLHS, OpRHS;
  OpLHS = GeneratePerfectShuffle(PFEntry: PerfectShuffleTable[LHSID], LHS, RHS, DAG, dl);
  OpRHS = GeneratePerfectShuffle(PFEntry: PerfectShuffleTable[RHSID], LHS, RHS, DAG, dl);

  // Build the byte-level shuffle mask for the selected word operation.
  int ShufIdxs[16];
  switch (OpNum) {
  default: llvm_unreachable("Unknown i32 permute!");
  case OP_VMRGHW:
    // Interleave the high words of the two operands.
    ShufIdxs[ 0] =  0; ShufIdxs[ 1] =  1; ShufIdxs[ 2] =  2; ShufIdxs[ 3] =  3;
    ShufIdxs[ 4] = 16; ShufIdxs[ 5] = 17; ShufIdxs[ 6] = 18; ShufIdxs[ 7] = 19;
    ShufIdxs[ 8] =  4; ShufIdxs[ 9] =  5; ShufIdxs[10] =  6; ShufIdxs[11] =  7;
    ShufIdxs[12] = 20; ShufIdxs[13] = 21; ShufIdxs[14] = 22; ShufIdxs[15] = 23;
    break;
  case OP_VMRGLW:
    // Interleave the low words of the two operands.
    ShufIdxs[ 0] =  8; ShufIdxs[ 1] =  9; ShufIdxs[ 2] = 10; ShufIdxs[ 3] = 11;
    ShufIdxs[ 4] = 24; ShufIdxs[ 5] = 25; ShufIdxs[ 6] = 26; ShufIdxs[ 7] = 27;
    ShufIdxs[ 8] = 12; ShufIdxs[ 9] = 13; ShufIdxs[10] = 14; ShufIdxs[11] = 15;
    ShufIdxs[12] = 28; ShufIdxs[13] = 29; ShufIdxs[14] = 30; ShufIdxs[15] = 31;
    break;
  case OP_VSPLTISW0:
    // Splat word 0 of the first operand.
    for (unsigned i = 0; i != 16; ++i)
      ShufIdxs[i] = (i&3)+0;
    break;
  case OP_VSPLTISW1:
    for (unsigned i = 0; i != 16; ++i)
      ShufIdxs[i] = (i&3)+4;
    break;
  case OP_VSPLTISW2:
    for (unsigned i = 0; i != 16; ++i)
      ShufIdxs[i] = (i&3)+8;
    break;
  case OP_VSPLTISW3:
    for (unsigned i = 0; i != 16; ++i)
      ShufIdxs[i] = (i&3)+12;
    break;
  // The vsldoi cases are emitted directly as shift-by-bytes shuffles.
  case OP_VSLDOI4:
    return BuildVSLDOI(LHS: OpLHS, RHS: OpRHS, Amt: 4, VT: OpLHS.getValueType(), DAG, dl);
  case OP_VSLDOI8:
    return BuildVSLDOI(LHS: OpLHS, RHS: OpRHS, Amt: 8, VT: OpLHS.getValueType(), DAG, dl);
  case OP_VSLDOI12:
    return BuildVSLDOI(LHS: OpLHS, RHS: OpRHS, Amt: 12, VT: OpLHS.getValueType(), DAG, dl);
  }
  // Perform the shuffle on v16i8 and cast back to the operand type.
  EVT VT = OpLHS.getValueType();
  OpLHS = DAG.getNode(Opcode: ISD::BITCAST, DL: dl, VT: MVT::v16i8, Operand: OpLHS);
  OpRHS = DAG.getNode(Opcode: ISD::BITCAST, DL: dl, VT: MVT::v16i8, Operand: OpRHS);
  SDValue T = DAG.getVectorShuffle(VT: MVT::v16i8, dl, N1: OpLHS, N2: OpRHS, Mask: ShufIdxs);
  return DAG.getNode(Opcode: ISD::BITCAST, DL: dl, VT, Operand: T);
}
9988
/// lowerToVINSERTB - Return the SDValue if this VECTOR_SHUFFLE can be handled
/// by the VINSERTB instruction introduced in ISA 3.0, else just return default
/// SDValue.
SDValue PPCTargetLowering::lowerToVINSERTB(ShuffleVectorSDNode *N,
                                           SelectionDAG &DAG) const {
  const unsigned BytesInVector = 16;
  bool IsLE = Subtarget.isLittleEndian();
  SDLoc dl(N);
  SDValue V1 = N->getOperand(Num: 0);
  SDValue V2 = N->getOperand(Num: 1);
  unsigned ShiftElts = 0, InsertAtByte = 0;
  bool Swap = false;

  // Shifts required to get the byte we want at element 7.
  unsigned LittleEndianShifts[] = {8,  7,  6,  5,  4,  3,  2,  1,
                                   0, 15, 14, 13, 12, 11, 10, 9};
  unsigned BigEndianShifts[] = {9, 10, 11, 12, 13, 14, 15, 0,
                                1, 2,  3,  4,  5,  6,  7,  8};

  ArrayRef<int> Mask = N->getMask();
  int OriginalOrder[] = {0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15};

  // For each mask element, find out if we're just inserting something
  // from V2 into V1 or vice versa.
  // Possible permutations inserting an element from V2 into V1:
  //   X, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15
  //   0, X, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15
  //   ...
  //   0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, X
  // Inserting from V1 into V2 will be similar, except mask range will be
  // [16,31].

  bool FoundCandidate = false;
  // If both vector operands for the shuffle are the same vector, the mask
  // will contain only elements from the first one and the second one will be
  // undef.
  unsigned VINSERTBSrcElem = IsLE ? 8 : 7;
  // Go through the mask of half-words to find an element that's being moved
  // from one vector to the other.
  for (unsigned i = 0; i < BytesInVector; ++i) {
    unsigned CurrentElement = Mask[i];
    // If 2nd operand is undefined, we should only look for element 7 in the
    // Mask.
    if (V2.isUndef() && CurrentElement != VINSERTBSrcElem)
      continue;

    bool OtherElementsInOrder = true;
    // Examine the other elements in the Mask to see if they're in original
    // order.
    for (unsigned j = 0; j < BytesInVector; ++j) {
      if (j == i)
        continue;
      // If CurrentElement is from V1 [0,15], then we the rest of the Mask to be
      // from V2 [16,31] and vice versa.  Unless the 2nd operand is undefined,
      // in which we always assume we're always picking from the 1st operand.
      int MaskOffset =
          (!V2.isUndef() && CurrentElement < BytesInVector) ? BytesInVector : 0;
      if (Mask[j] != OriginalOrder[j] + MaskOffset) {
        OtherElementsInOrder = false;
        break;
      }
    }
    // If other elements are in original order, we record the number of shifts
    // we need to get the element we want into element 7. Also record which byte
    // in the vector we should insert into.
    if (OtherElementsInOrder) {
      // If 2nd operand is undefined, we assume no shifts and no swapping.
      if (V2.isUndef()) {
        ShiftElts = 0;
        Swap = false;
      } else {
        // Only need the last 4-bits for shifts because operands will be swapped if CurrentElement is >= 2^4.
        ShiftElts = IsLE ? LittleEndianShifts[CurrentElement & 0xF]
                         : BigEndianShifts[CurrentElement & 0xF];
        Swap = CurrentElement < BytesInVector;
      }
      InsertAtByte = IsLE ? BytesInVector - (i + 1) : i;
      FoundCandidate = true;
      break;
    }
  }

  if (!FoundCandidate)
    return SDValue();

  // Candidate found, construct the proper SDAG sequence with VINSERTB,
  // optionally with VECSHL if shift is required.
  if (Swap)
    std::swap(a&: V1, b&: V2);
  // With an undef second operand, insert from the first vector into itself.
  if (V2.isUndef())
    V2 = V1;
  if (ShiftElts) {
    // Rotate the source so the desired byte lands in the VINSERTB source
    // element before inserting.
    SDValue Shl = DAG.getNode(Opcode: PPCISD::VECSHL, DL: dl, VT: MVT::v16i8, N1: V2, N2: V2,
                              N3: DAG.getConstant(Val: ShiftElts, DL: dl, VT: MVT::i32));
    return DAG.getNode(Opcode: PPCISD::VECINSERT, DL: dl, VT: MVT::v16i8, N1: V1, N2: Shl,
                       N3: DAG.getConstant(Val: InsertAtByte, DL: dl, VT: MVT::i32));
  }
  return DAG.getNode(Opcode: PPCISD::VECINSERT, DL: dl, VT: MVT::v16i8, N1: V1, N2: V2,
                     N3: DAG.getConstant(Val: InsertAtByte, DL: dl, VT: MVT::i32));
}
10089
10090/// lowerToVINSERTH - Return the SDValue if this VECTOR_SHUFFLE can be handled
10091/// by the VINSERTH instruction introduced in ISA 3.0, else just return default
10092/// SDValue.
SDValue PPCTargetLowering::lowerToVINSERTH(ShuffleVectorSDNode *N,
                                           SelectionDAG &DAG) const {
  const unsigned NumHalfWords = 8;
  const unsigned BytesInVector = NumHalfWords * 2;
  // Check that the shuffle is on half-words.
  if (!isNByteElemShuffleMask(N, Width: 2, StepLen: 1))
    return SDValue();

  bool IsLE = Subtarget.isLittleEndian();
  SDLoc dl(N);
  SDValue V1 = N->getOperand(Num: 0);
  SDValue V2 = N->getOperand(Num: 1);
  unsigned ShiftElts = 0, InsertAtByte = 0;
  bool Swap = false;

  // Shifts required to get the half-word we want at element 3.
  // Indexed by the (half-word) source element of the inserted value; the
  // tables differ because VECSHL rotates left on the big-endian element
  // numbering.
  unsigned LittleEndianShifts[] = {4, 3, 2, 1, 0, 7, 6, 5};
  unsigned BigEndianShifts[] = {5, 6, 7, 0, 1, 2, 3, 4};

  uint32_t Mask = 0;
  // The identity orders for mask elements taken from V1 ([0,7]) and from
  // V2 ([8,15]) respectively, packed one nibble per half-word.
  uint32_t OriginalOrderLow = 0x1234567;
  uint32_t OriginalOrderHigh = 0x89ABCDEF;
  // Now we look at mask elements 0,2,4,6,8,10,12,14. Pack the mask into a
  // 32-bit space, only need 4-bit nibbles per element. Dividing the byte
  // index by 2 converts it to a half-word index; element 0 lands in the
  // most significant nibble.
  for (unsigned i = 0; i < NumHalfWords; ++i) {
    unsigned MaskShift = (NumHalfWords - 1 - i) * 4;
    Mask |= ((uint32_t)(N->getMaskElt(Idx: i * 2) / 2) << MaskShift);
  }

  // For each mask element, find out if we're just inserting something
  // from V2 into V1 or vice versa. Possible permutations inserting an element
  // from V2 into V1:
  //   X, 1, 2, 3, 4, 5, 6, 7
  //   0, X, 2, 3, 4, 5, 6, 7
  //   0, 1, X, 3, 4, 5, 6, 7
  //   0, 1, 2, X, 4, 5, 6, 7
  //   0, 1, 2, 3, X, 5, 6, 7
  //   0, 1, 2, 3, 4, X, 6, 7
  //   0, 1, 2, 3, 4, 5, X, 7
  //   0, 1, 2, 3, 4, 5, 6, X
  // Inserting from V1 into V2 will be similar, except mask range will be [8,15].

  bool FoundCandidate = false;
  // Go through the mask of half-words to find an element that's being moved
  // from one vector to the other.
  for (unsigned i = 0; i < NumHalfWords; ++i) {
    unsigned MaskShift = (NumHalfWords - 1 - i) * 4;
    uint32_t MaskOneElt = (Mask >> MaskShift) & 0xF;
    // All-ones except the nibble under examination; used to compare the
    // remaining elements against the expected identity order.
    uint32_t MaskOtherElts = ~(0xF << MaskShift);
    uint32_t TargetOrder = 0x0;

    // If both vector operands for the shuffle are the same vector, the mask
    // will contain only elements from the first one and the second one will be
    // undef.
    if (V2.isUndef()) {
      ShiftElts = 0;
      // With an undef second operand only the element VINSERTH naturally
      // reads (half-word 4 in LE, 3 in BE numbering) can be inserted
      // without a shift.
      unsigned VINSERTHSrcElem = IsLE ? 4 : 3;
      TargetOrder = OriginalOrderLow;
      Swap = false;
      // Skip if not the correct element or mask of other elements don't equal
      // to our expected order.
      if (MaskOneElt == VINSERTHSrcElem &&
          (Mask & MaskOtherElts) == (TargetOrder & MaskOtherElts)) {
        InsertAtByte = IsLE ? BytesInVector - (i + 1) * 2 : i * 2;
        FoundCandidate = true;
        break;
      }
    } else { // If both operands are defined.
      // Target order is [8,15] if the current mask is between [0,7].
      TargetOrder =
          (MaskOneElt < NumHalfWords) ? OriginalOrderHigh : OriginalOrderLow;
      // Skip if mask of other elements don't equal our expected order.
      if ((Mask & MaskOtherElts) == (TargetOrder & MaskOtherElts)) {
        // We only need the last 3 bits for the number of shifts.
        ShiftElts = IsLE ? LittleEndianShifts[MaskOneElt & 0x7]
                         : BigEndianShifts[MaskOneElt & 0x7];
        // Convert the half-word index into a byte offset; LE counts from
        // the other end of the vector.
        InsertAtByte = IsLE ? BytesInVector - (i + 1) * 2 : i * 2;
        // If the inserted element comes from V1 ([0,7]), the operands must
        // be swapped so the source ends up in the second operand.
        Swap = MaskOneElt < NumHalfWords;
        FoundCandidate = true;
        break;
      }
    }
  }

  if (!FoundCandidate)
    return SDValue();

  // Candidate found, construct the proper SDAG sequence with VINSERTH,
  // optionally with VECSHL if shift is required.
  if (Swap)
    std::swap(a&: V1, b&: V2);
  if (V2.isUndef())
    V2 = V1;
  SDValue Conv1 = DAG.getNode(Opcode: ISD::BITCAST, DL: dl, VT: MVT::v8i16, Operand: V1);
  if (ShiftElts) {
    // Double ShiftElts because we're left shifting on v16i8 type.
    SDValue Shl = DAG.getNode(Opcode: PPCISD::VECSHL, DL: dl, VT: MVT::v16i8, N1: V2, N2: V2,
                              N3: DAG.getConstant(Val: 2 * ShiftElts, DL: dl, VT: MVT::i32));
    SDValue Conv2 = DAG.getNode(Opcode: ISD::BITCAST, DL: dl, VT: MVT::v8i16, Operand: Shl);
    SDValue Ins = DAG.getNode(Opcode: PPCISD::VECINSERT, DL: dl, VT: MVT::v8i16, N1: Conv1, N2: Conv2,
                              N3: DAG.getConstant(Val: InsertAtByte, DL: dl, VT: MVT::i32));
    return DAG.getNode(Opcode: ISD::BITCAST, DL: dl, VT: MVT::v16i8, Operand: Ins);
  }
  SDValue Conv2 = DAG.getNode(Opcode: ISD::BITCAST, DL: dl, VT: MVT::v8i16, Operand: V2);
  SDValue Ins = DAG.getNode(Opcode: PPCISD::VECINSERT, DL: dl, VT: MVT::v8i16, N1: Conv1, N2: Conv2,
                            N3: DAG.getConstant(Val: InsertAtByte, DL: dl, VT: MVT::i32));
  return DAG.getNode(Opcode: ISD::BITCAST, DL: dl, VT: MVT::v16i8, Operand: Ins);
}
10201
10202/// lowerToXXSPLTI32DX - Return the SDValue if this VECTOR_SHUFFLE can be
10203/// handled by the XXSPLTI32DX instruction introduced in ISA 3.1, otherwise
10204/// return the default SDValue.
SDValue PPCTargetLowering::lowerToXXSPLTI32DX(ShuffleVectorSDNode *SVN,
                                              SelectionDAG &DAG) const {
  // The LHS and RHS may be bitcasts to v16i8 as we canonicalize shuffles
  // to v16i8. Peek through the bitcasts to get the actual operands.
  SDValue LHS = peekThroughBitcasts(V: SVN->getOperand(Num: 0));
  SDValue RHS = peekThroughBitcasts(V: SVN->getOperand(Num: 1));

  auto ShuffleMask = SVN->getMask();
  SDValue VecShuffle(SVN, 0);
  SDLoc DL(SVN);

  // Check that we have a four byte shuffle.
  if (!isNByteElemShuffleMask(N: SVN, Width: 4, StepLen: 1))
    return SDValue();

  // Canonicalize the RHS being a BUILD_VECTOR when lowering to xxsplti32dx.
  // If the constant vector is on the LHS, commute the shuffle so the mask
  // checks below only have to handle one orientation.
  if (RHS->getOpcode() != ISD::BUILD_VECTOR) {
    std::swap(a&: LHS, b&: RHS);
    VecShuffle = peekThroughBitcasts(V: DAG.getCommutedVectorShuffle(SV: *SVN));
    ShuffleVectorSDNode *CommutedSV = dyn_cast<ShuffleVectorSDNode>(Val&: VecShuffle);
    if (!CommutedSV)
      return SDValue();
    ShuffleMask = CommutedSV->getMask();
  }

  // Ensure that the RHS is a vector of constants.
  BuildVectorSDNode *BVN = dyn_cast<BuildVectorSDNode>(Val: RHS.getNode());
  if (!BVN)
    return SDValue();

  // Check if RHS is a splat of 4-bytes (or smaller).
  APInt APSplatValue, APSplatUndef;
  unsigned SplatBitSize;
  bool HasAnyUndefs;
  if (!BVN->isConstantSplat(SplatValue&: APSplatValue, SplatUndef&: APSplatUndef, SplatBitSize,
                            HasAnyUndefs, MinSplatBits: 0, isBigEndian: !Subtarget.isLittleEndian()) ||
      SplatBitSize > 32)
    return SDValue();

  // Check that the shuffle mask matches the semantics of XXSPLTI32DX.
  // The instruction splats a constant C into two words of the source vector
  // producing { C, Unchanged, C, Unchanged } or { Unchanged, C, Unchanged, C }.
  // Thus we check that the shuffle mask is the equivalent of
  // <0, [4-7], 2, [4-7]> or <[4-7], 1, [4-7], 3> respectively.
  // Note: the check above of isNByteElemShuffleMask() ensures that the bytes
  // within each word are consecutive, so we only need to check the first byte.
  SDValue Index;
  bool IsLE = Subtarget.isLittleEndian();
  // Words 0 and 2 kept from LHS, words 1 and 3 taken from the splat.
  if ((ShuffleMask[0] == 0 && ShuffleMask[8] == 8) &&
      (ShuffleMask[4] % 4 == 0 && ShuffleMask[12] % 4 == 0 &&
       ShuffleMask[4] > 15 && ShuffleMask[12] > 15))
    Index = DAG.getTargetConstant(Val: IsLE ? 0 : 1, DL, VT: MVT::i32);
  // Words 1 and 3 kept from LHS, words 0 and 2 taken from the splat.
  else if ((ShuffleMask[4] == 4 && ShuffleMask[12] == 12) &&
           (ShuffleMask[0] % 4 == 0 && ShuffleMask[8] % 4 == 0 &&
            ShuffleMask[0] > 15 && ShuffleMask[8] > 15))
    Index = DAG.getTargetConstant(Val: IsLE ? 1 : 0, DL, VT: MVT::i32);
  else
    return SDValue();

  // If the splat is narrower than 32-bits, we need to get the 32-bit value
  // for XXSPLTI32DX. Repeatedly double the pattern width until it fills a
  // full word (e.g. an 8-bit splat 0xAB becomes 0xABABABAB).
  unsigned SplatVal = APSplatValue.getZExtValue();
  for (; SplatBitSize < 32; SplatBitSize <<= 1)
    SplatVal |= (SplatVal << SplatBitSize);

  SDValue SplatNode = DAG.getNode(
      Opcode: PPCISD::XXSPLTI32DX, DL, VT: MVT::v2i64, N1: DAG.getBitcast(VT: MVT::v2i64, V: LHS),
      N2: Index, N3: DAG.getTargetConstant(Val: SplatVal, DL, VT: MVT::i32));
  return DAG.getNode(Opcode: ISD::BITCAST, DL, VT: MVT::v16i8, Operand: SplatNode);
}
10275
10276/// LowerROTL - Custom lowering for ROTL(v1i128) to vector_shuffle(v16i8).
10277/// We lower ROTL(v1i128) to vector_shuffle(v16i8) only if shift amount is
10278/// a multiple of 8. Otherwise convert it to a scalar rotation(i128)
10279/// i.e (or (shl x, C1), (srl x, 128-C1)).
10280SDValue PPCTargetLowering::LowerROTL(SDValue Op, SelectionDAG &DAG) const {
10281 assert(Op.getOpcode() == ISD::ROTL && "Should only be called for ISD::ROTL");
10282 assert(Op.getValueType() == MVT::v1i128 &&
10283 "Only set v1i128 as custom, other type shouldn't reach here!");
10284 SDLoc dl(Op);
10285 SDValue N0 = peekThroughBitcasts(V: Op.getOperand(i: 0));
10286 SDValue N1 = peekThroughBitcasts(V: Op.getOperand(i: 1));
10287 unsigned SHLAmt = N1.getConstantOperandVal(i: 0);
10288 if (SHLAmt % 8 == 0) {
10289 std::array<int, 16> Mask;
10290 std::iota(first: Mask.begin(), last: Mask.end(), value: 0);
10291 std::rotate(first: Mask.begin(), middle: Mask.begin() + SHLAmt / 8, last: Mask.end());
10292 if (SDValue Shuffle =
10293 DAG.getVectorShuffle(VT: MVT::v16i8, dl, N1: DAG.getBitcast(VT: MVT::v16i8, V: N0),
10294 N2: DAG.getUNDEF(VT: MVT::v16i8), Mask))
10295 return DAG.getNode(Opcode: ISD::BITCAST, DL: dl, VT: MVT::v1i128, Operand: Shuffle);
10296 }
10297 SDValue ArgVal = DAG.getBitcast(VT: MVT::i128, V: N0);
10298 SDValue SHLOp = DAG.getNode(Opcode: ISD::SHL, DL: dl, VT: MVT::i128, N1: ArgVal,
10299 N2: DAG.getConstant(Val: SHLAmt, DL: dl, VT: MVT::i32));
10300 SDValue SRLOp = DAG.getNode(Opcode: ISD::SRL, DL: dl, VT: MVT::i128, N1: ArgVal,
10301 N2: DAG.getConstant(Val: 128 - SHLAmt, DL: dl, VT: MVT::i32));
10302 SDValue OROp = DAG.getNode(Opcode: ISD::OR, DL: dl, VT: MVT::i128, N1: SHLOp, N2: SRLOp);
10303 return DAG.getNode(Opcode: ISD::BITCAST, DL: dl, VT: MVT::v1i128, Operand: OROp);
10304}
10305
10306/// LowerVECTOR_SHUFFLE - Return the code we lower for VECTOR_SHUFFLE. If this
10307/// is a shuffle we can handle in a single instruction, return it. Otherwise,
10308/// return the code it can be lowered into. Worst case, it can always be
10309/// lowered into a vperm.
SDValue PPCTargetLowering::LowerVECTOR_SHUFFLE(SDValue Op,
                                               SelectionDAG &DAG) const {
  SDLoc dl(Op);
  SDValue V1 = Op.getOperand(i: 0);
  SDValue V2 = Op.getOperand(i: 1);
  ShuffleVectorSDNode *SVOp = cast<ShuffleVectorSDNode>(Val&: Op);

  // Any nodes that were combined in the target-independent combiner prior
  // to vector legalization will not be sent to the target combine. Try to
  // combine it here.
  if (SDValue NewShuffle = combineVectorShuffle(SVN: SVOp, DAG)) {
    if (!isa<ShuffleVectorSDNode>(Val: NewShuffle))
      return NewShuffle;
    // The combine produced a different shuffle; re-derive the operands from
    // it and continue matching against the new node.
    Op = NewShuffle;
    SVOp = cast<ShuffleVectorSDNode>(Val&: Op);
    V1 = Op.getOperand(i: 0);
    V2 = Op.getOperand(i: 1);
  }
  EVT VT = Op.getValueType();
  bool isLittleEndian = Subtarget.isLittleEndian();

  unsigned ShiftElts, InsertAtByte;
  bool Swap = false;

  // If this is a load-and-splat, we can do that with a single instruction
  // in some cases. However if the load has multiple uses, we don't want to
  // combine it because that will just produce multiple loads.
  bool IsPermutedLoad = false;
  const SDValue *InputLoad = getNormalLoadInput(Op: V1, IsPermuted&: IsPermutedLoad);
  if (InputLoad && Subtarget.hasVSX() && V2.isUndef() &&
      (PPC::isSplatShuffleMask(N: SVOp, EltSize: 4) || PPC::isSplatShuffleMask(N: SVOp, EltSize: 8)) &&
      InputLoad->hasOneUse()) {
    bool IsFourByte = PPC::isSplatShuffleMask(N: SVOp, EltSize: 4);
    int SplatIdx =
        PPC::getSplatIdxForPPCMnemonics(N: SVOp, EltSize: IsFourByte ? 4 : 8, DAG);

    // The splat index for permuted loads will be in the left half of the vector
    // which is strictly wider than the loaded value by 8 bytes. So we need to
    // adjust the splat index to point to the correct address in memory.
    if (IsPermutedLoad) {
      assert((isLittleEndian || IsFourByte) &&
             "Unexpected size for permuted load on big endian target");
      SplatIdx += IsFourByte ? 2 : 1;
      assert((SplatIdx < (IsFourByte ? 4 : 2)) &&
             "Splat of a value outside of the loaded memory");
    }

    LoadSDNode *LD = cast<LoadSDNode>(Val: *InputLoad);
    // For 4-byte load-and-splat, we need Power9.
    if ((IsFourByte && Subtarget.hasP9Vector()) || !IsFourByte) {
      uint64_t Offset = 0;
      // Compute the byte offset of the splatted element within the loaded
      // vector; LE element numbering runs from the opposite end.
      if (IsFourByte)
        Offset = isLittleEndian ? (3 - SplatIdx) * 4 : SplatIdx * 4;
      else
        Offset = isLittleEndian ? (1 - SplatIdx) * 8 : SplatIdx * 8;

      // If the width of the load is the same as the width of the splat,
      // loading with an offset would load the wrong memory.
      if (LD->getValueType(ResNo: 0).getSizeInBits() == (IsFourByte ? 32 : 64))
        Offset = 0;

      SDValue BasePtr = LD->getBasePtr();
      if (Offset != 0)
        BasePtr = DAG.getNode(Opcode: ISD::ADD, DL: dl, VT: getPointerTy(DL: DAG.getDataLayout()),
                              N1: BasePtr, N2: DAG.getIntPtrConstant(Val: Offset, DL: dl));
      SDValue Ops[] = {
        LD->getChain(),    // Chain
        BasePtr,           // BasePtr
        DAG.getValueType(Op.getValueType()) // VT
      };
      SDVTList VTL =
          DAG.getVTList(VT1: IsFourByte ? MVT::v4i32 : MVT::v2i64, VT2: MVT::Other);
      SDValue LdSplt =
          DAG.getMemIntrinsicNode(Opcode: PPCISD::LD_SPLAT, dl, VTList: VTL,
                                  Ops, MemVT: LD->getMemoryVT(), MMO: LD->getMemOperand());
      // Redirect users of the original load's chain to the new node so the
      // old load becomes dead.
      DAG.ReplaceAllUsesOfValueWith(From: InputLoad->getValue(R: 1), To: LdSplt.getValue(R: 1));
      if (LdSplt.getValueType() != SVOp->getValueType(ResNo: 0))
        LdSplt = DAG.getBitcast(VT: SVOp->getValueType(ResNo: 0), V: LdSplt);
      return LdSplt;
    }
  }

  // All v2i64 and v2f64 shuffles are legal
  if (VT == MVT::v2i64 || VT == MVT::v2f64)
    return Op;

  // Single-word insert (xxinsertw), optionally preceded by a word shift of
  // the inserted source (requires Power9).
  if (Subtarget.hasP9Vector() &&
      PPC::isXXINSERTWMask(N: SVOp, ShiftElts, InsertAtByte, Swap,
                           IsLE: isLittleEndian)) {
    if (V2.isUndef())
      V2 = V1;
    else if (Swap)
      std::swap(a&: V1, b&: V2);
    SDValue Conv1 = DAG.getNode(Opcode: ISD::BITCAST, DL: dl, VT: MVT::v4i32, Operand: V1);
    SDValue Conv2 = DAG.getNode(Opcode: ISD::BITCAST, DL: dl, VT: MVT::v4i32, Operand: V2);
    if (ShiftElts) {
      SDValue Shl = DAG.getNode(Opcode: PPCISD::VECSHL, DL: dl, VT: MVT::v4i32, N1: Conv2, N2: Conv2,
                                N3: DAG.getConstant(Val: ShiftElts, DL: dl, VT: MVT::i32));
      SDValue Ins = DAG.getNode(Opcode: PPCISD::VECINSERT, DL: dl, VT: MVT::v4i32, N1: Conv1, N2: Shl,
                                N3: DAG.getConstant(Val: InsertAtByte, DL: dl, VT: MVT::i32));
      return DAG.getNode(Opcode: ISD::BITCAST, DL: dl, VT: MVT::v16i8, Operand: Ins);
    }
    SDValue Ins = DAG.getNode(Opcode: PPCISD::VECINSERT, DL: dl, VT: MVT::v4i32, N1: Conv1, N2: Conv2,
                              N3: DAG.getConstant(Val: InsertAtByte, DL: dl, VT: MVT::i32));
    return DAG.getNode(Opcode: ISD::BITCAST, DL: dl, VT: MVT::v16i8, Operand: Ins);
  }

  // ISA 3.1 splat-immediate-into-doubleword-halves (xxsplti32dx).
  if (Subtarget.hasPrefixInstrs() && Subtarget.hasP10Vector()) {
    SDValue SplatInsertNode;
    if ((SplatInsertNode = lowerToXXSPLTI32DX(SVN: SVOp, DAG)))
      return SplatInsertNode;
  }

  // ISA 3.0 element inserts (vinserth/vinsertb).
  if (Subtarget.hasP9Altivec()) {
    SDValue NewISDNode;
    if ((NewISDNode = lowerToVINSERTH(N: SVOp, DAG)))
      return NewISDNode;

    if ((NewISDNode = lowerToVINSERTB(N: SVOp, DAG)))
      return NewISDNode;
  }

  // Word-aligned double-vector shift left (xxsldwi).
  if (Subtarget.hasVSX() &&
      PPC::isXXSLDWIShuffleMask(N: SVOp, ShiftElts, Swap, IsLE: isLittleEndian)) {
    if (Swap)
      std::swap(a&: V1, b&: V2);
    SDValue Conv1 = DAG.getNode(Opcode: ISD::BITCAST, DL: dl, VT: MVT::v4i32, Operand: V1);
    SDValue Conv2 =
        DAG.getNode(Opcode: ISD::BITCAST, DL: dl, VT: MVT::v4i32, Operand: V2.isUndef() ? V1 : V2);

    SDValue Shl = DAG.getNode(Opcode: PPCISD::VECSHL, DL: dl, VT: MVT::v4i32, N1: Conv1, N2: Conv2,
                              N3: DAG.getConstant(Val: ShiftElts, DL: dl, VT: MVT::i32));
    return DAG.getNode(Opcode: ISD::BITCAST, DL: dl, VT: MVT::v16i8, Operand: Shl);
  }

  // Doubleword permute (xxpermdi).
  if (Subtarget.hasVSX() &&
      PPC::isXXPERMDIShuffleMask(N: SVOp, DM&: ShiftElts, Swap, IsLE: isLittleEndian)) {
    if (Swap)
      std::swap(a&: V1, b&: V2);
    SDValue Conv1 = DAG.getNode(Opcode: ISD::BITCAST, DL: dl, VT: MVT::v2i64, Operand: V1);
    SDValue Conv2 =
        DAG.getNode(Opcode: ISD::BITCAST, DL: dl, VT: MVT::v2i64, Operand: V2.isUndef() ? V1 : V2);

    SDValue PermDI = DAG.getNode(Opcode: PPCISD::XXPERMDI, DL: dl, VT: MVT::v2i64, N1: Conv1, N2: Conv2,
                                 N3: DAG.getConstant(Val: ShiftElts, DL: dl, VT: MVT::i32));
    return DAG.getNode(Opcode: ISD::BITCAST, DL: dl, VT: MVT::v16i8, Operand: PermDI);
  }

  // Byte-reverse patterns (xxbrh/xxbrw/xxbrd/xxbrq) lower to BSWAP at the
  // corresponding element width.
  if (Subtarget.hasP9Vector()) {
    if (PPC::isXXBRHShuffleMask(N: SVOp)) {
      SDValue Conv = DAG.getNode(Opcode: ISD::BITCAST, DL: dl, VT: MVT::v8i16, Operand: V1);
      SDValue ReveHWord = DAG.getNode(Opcode: ISD::BSWAP, DL: dl, VT: MVT::v8i16, Operand: Conv);
      return DAG.getNode(Opcode: ISD::BITCAST, DL: dl, VT: MVT::v16i8, Operand: ReveHWord);
    } else if (PPC::isXXBRWShuffleMask(N: SVOp)) {
      SDValue Conv = DAG.getNode(Opcode: ISD::BITCAST, DL: dl, VT: MVT::v4i32, Operand: V1);
      SDValue ReveWord = DAG.getNode(Opcode: ISD::BSWAP, DL: dl, VT: MVT::v4i32, Operand: Conv);
      return DAG.getNode(Opcode: ISD::BITCAST, DL: dl, VT: MVT::v16i8, Operand: ReveWord);
    } else if (PPC::isXXBRDShuffleMask(N: SVOp)) {
      SDValue Conv = DAG.getNode(Opcode: ISD::BITCAST, DL: dl, VT: MVT::v2i64, Operand: V1);
      SDValue ReveDWord = DAG.getNode(Opcode: ISD::BSWAP, DL: dl, VT: MVT::v2i64, Operand: Conv);
      return DAG.getNode(Opcode: ISD::BITCAST, DL: dl, VT: MVT::v16i8, Operand: ReveDWord);
    } else if (PPC::isXXBRQShuffleMask(N: SVOp)) {
      SDValue Conv = DAG.getNode(Opcode: ISD::BITCAST, DL: dl, VT: MVT::v1i128, Operand: V1);
      SDValue ReveQWord = DAG.getNode(Opcode: ISD::BSWAP, DL: dl, VT: MVT::v1i128, Operand: Conv);
      return DAG.getNode(Opcode: ISD::BITCAST, DL: dl, VT: MVT::v16i8, Operand: ReveQWord);
    }
  }

  if (Subtarget.hasVSX()) {
    // Word splat (xxspltw).
    if (V2.isUndef() && PPC::isSplatShuffleMask(N: SVOp, EltSize: 4)) {
      int SplatIdx = PPC::getSplatIdxForPPCMnemonics(N: SVOp, EltSize: 4, DAG);

      SDValue Conv = DAG.getNode(Opcode: ISD::BITCAST, DL: dl, VT: MVT::v4i32, Operand: V1);
      SDValue Splat = DAG.getNode(Opcode: PPCISD::XXSPLT, DL: dl, VT: MVT::v4i32, N1: Conv,
                                  N2: DAG.getConstant(Val: SplatIdx, DL: dl, VT: MVT::i32));
      return DAG.getNode(Opcode: ISD::BITCAST, DL: dl, VT: MVT::v16i8, Operand: Splat);
    }

    // Left shifts of 8 bytes are actually swaps. Convert accordingly.
    if (V2.isUndef() && PPC::isVSLDOIShuffleMask(N: SVOp, ShuffleKind: 1, DAG) == 8) {
      SDValue Conv = DAG.getNode(Opcode: ISD::BITCAST, DL: dl, VT: MVT::v2f64, Operand: V1);
      SDValue Swap = DAG.getNode(Opcode: PPCISD::SWAP_NO_CHAIN, DL: dl, VT: MVT::v2f64, Operand: Conv);
      return DAG.getNode(Opcode: ISD::BITCAST, DL: dl, VT: MVT::v16i8, Operand: Swap);
    }
  }

  // Cases that are handled by instructions that take permute immediates
  // (such as vsplt*) should be left as VECTOR_SHUFFLE nodes so they can be
  // selected by the instruction selector.
  if (V2.isUndef()) {
    if (PPC::isSplatShuffleMask(N: SVOp, EltSize: 1) ||
        PPC::isSplatShuffleMask(N: SVOp, EltSize: 2) ||
        PPC::isSplatShuffleMask(N: SVOp, EltSize: 4) ||
        PPC::isVPKUWUMShuffleMask(N: SVOp, ShuffleKind: 1, DAG) ||
        PPC::isVPKUHUMShuffleMask(N: SVOp, ShuffleKind: 1, DAG) ||
        PPC::isVSLDOIShuffleMask(N: SVOp, ShuffleKind: 1, DAG) != -1 ||
        PPC::isVMRGLShuffleMask(N: SVOp, UnitSize: 1, ShuffleKind: 1, DAG) ||
        PPC::isVMRGLShuffleMask(N: SVOp, UnitSize: 2, ShuffleKind: 1, DAG) ||
        PPC::isVMRGLShuffleMask(N: SVOp, UnitSize: 4, ShuffleKind: 1, DAG) ||
        PPC::isVMRGHShuffleMask(N: SVOp, UnitSize: 1, ShuffleKind: 1, DAG) ||
        PPC::isVMRGHShuffleMask(N: SVOp, UnitSize: 2, ShuffleKind: 1, DAG) ||
        PPC::isVMRGHShuffleMask(N: SVOp, UnitSize: 4, ShuffleKind: 1, DAG) ||
        (Subtarget.hasP8Altivec() && (
          PPC::isVPKUDUMShuffleMask(N: SVOp, ShuffleKind: 1, DAG) ||
          PPC::isVMRGEOShuffleMask(N: SVOp, CheckEven: true, ShuffleKind: 1, DAG) ||
          PPC::isVMRGEOShuffleMask(N: SVOp, CheckEven: false, ShuffleKind: 1, DAG)))) {
      return Op;
    }
  }

  // Altivec has a variety of "shuffle immediates" that take two vector inputs
  // and produce a fixed permutation. If any of these match, do not lower to
  // VPERM.
  unsigned int ShuffleKind = isLittleEndian ? 2 : 0;
  if (PPC::isVPKUWUMShuffleMask(N: SVOp, ShuffleKind, DAG) ||
      PPC::isVPKUHUMShuffleMask(N: SVOp, ShuffleKind, DAG) ||
      PPC::isVSLDOIShuffleMask(N: SVOp, ShuffleKind, DAG) != -1 ||
      PPC::isVMRGLShuffleMask(N: SVOp, UnitSize: 1, ShuffleKind, DAG) ||
      PPC::isVMRGLShuffleMask(N: SVOp, UnitSize: 2, ShuffleKind, DAG) ||
      PPC::isVMRGLShuffleMask(N: SVOp, UnitSize: 4, ShuffleKind, DAG) ||
      PPC::isVMRGHShuffleMask(N: SVOp, UnitSize: 1, ShuffleKind, DAG) ||
      PPC::isVMRGHShuffleMask(N: SVOp, UnitSize: 2, ShuffleKind, DAG) ||
      PPC::isVMRGHShuffleMask(N: SVOp, UnitSize: 4, ShuffleKind, DAG) ||
      (Subtarget.hasP8Altivec() && (
        PPC::isVPKUDUMShuffleMask(N: SVOp, ShuffleKind, DAG) ||
        PPC::isVMRGEOShuffleMask(N: SVOp, CheckEven: true, ShuffleKind, DAG) ||
        PPC::isVMRGEOShuffleMask(N: SVOp, CheckEven: false, ShuffleKind, DAG))))
    return Op;

  // Check to see if this is a shuffle of 4-byte values. If so, we can use our
  // perfect shuffle table to emit an optimal matching sequence.
  ArrayRef<int> PermMask = SVOp->getMask();

  if (!DisablePerfectShuffle && !isLittleEndian) {
    unsigned PFIndexes[4];
    bool isFourElementShuffle = true;
    // Determine whether each 4-byte group of the mask reads a whole,
    // aligned word from one of the inputs (index 8 encodes "undef").
    for (unsigned i = 0; i != 4 && isFourElementShuffle;
         ++i) { // Element number
      unsigned EltNo = 8; // Start out undef.
      for (unsigned j = 0; j != 4; ++j) { // Intra-element byte.
        if (PermMask[i * 4 + j] < 0)
          continue; // Undef, ignore it.

        unsigned ByteSource = PermMask[i * 4 + j];
        if ((ByteSource & 3) != j) {
          isFourElementShuffle = false;
          break;
        }

        if (EltNo == 8) {
          EltNo = ByteSource / 4;
        } else if (EltNo != ByteSource / 4) {
          isFourElementShuffle = false;
          break;
        }
      }
      PFIndexes[i] = EltNo;
    }

    // If this shuffle can be expressed as a shuffle of 4-byte elements, use the
    // perfect shuffle vector to determine if it is cost effective to do this as
    // discrete instructions, or whether we should use a vperm.
    // For now, we skip this for little endian until such time as we have a
    // little-endian perfect shuffle table.
    if (isFourElementShuffle) {
      // Compute the index in the perfect shuffle table.
      unsigned PFTableIndex = PFIndexes[0] * 9 * 9 * 9 + PFIndexes[1] * 9 * 9 +
                              PFIndexes[2] * 9 + PFIndexes[3];

      unsigned PFEntry = PerfectShuffleTable[PFTableIndex];
      unsigned Cost = (PFEntry >> 30);

      // Determining when to avoid vperm is tricky. Many things affect the cost
      // of vperm, particularly how many times the perm mask needs to be
      // computed. For example, if the perm mask can be hoisted out of a loop or
      // is already used (perhaps because there are multiple permutes with the
      // same shuffle mask?) the vperm has a cost of 1. OTOH, hoisting the
      // permute mask out of the loop requires an extra register.
      //
      // As a compromise, we only emit discrete instructions if the shuffle can
      // be generated in 3 or fewer operations. When we have loop information
      // available, if this block is within a loop, we should avoid using vperm
      // for 3-operation perms and use a constant pool load instead.
      if (Cost < 3)
        return GeneratePerfectShuffle(PFEntry, LHS: V1, RHS: V2, DAG, dl);
    }
  }

  // Lower this to a VPERM(V1, V2, V3) expression, where V3 is a constant
  // vector that will get spilled to the constant pool.
  if (V2.isUndef()) V2 = V1;

  return LowerVPERM(Op, DAG, PermMask, VT, V1, V2);
}
10604
/// LowerVPERM - Lower a vector shuffle to a VPERM (or, on Power9 when one
/// input is single-use, an XXPERM) node, building the byte-granular permute
/// control vector from PermMask and accounting for endianness and any
/// XXSWAPD feeding the inputs.
SDValue PPCTargetLowering::LowerVPERM(SDValue Op, SelectionDAG &DAG,
                                      ArrayRef<int> PermMask, EVT VT,
                                      SDValue V1, SDValue V2) const {
  unsigned Opcode = PPCISD::VPERM;
  EVT ValType = V1.getValueType();
  SDLoc dl(Op);
  bool NeedSwap = false;
  bool isLittleEndian = Subtarget.isLittleEndian();
  bool isPPC64 = Subtarget.isPPC64();

  if (Subtarget.hasVSX() && Subtarget.hasP9Vector() &&
      (V1->hasOneUse() || V2->hasOneUse())) {
    LLVM_DEBUG(dbgs() << "At least one of two input vectors are dead - using "
                         "XXPERM instead\n");
    Opcode = PPCISD::XXPERM;

    // The second input to XXPERM is also an output so if the second input has
    // multiple uses then copying is necessary, as a result we want the
    // single-use operand to be used as the second input to prevent copying.
    if ((!isLittleEndian && !V2->hasOneUse() && V1->hasOneUse()) ||
        (isLittleEndian && !V1->hasOneUse() && V2->hasOneUse())) {
      std::swap(a&: V1, b&: V2);
      NeedSwap = !NeedSwap;
    }
  }

  // The SHUFFLE_VECTOR mask is almost exactly what we want for vperm, except
  // that it is in input element units, not in bytes. Convert now.

  // For little endian, the order of the input vectors is reversed, and
  // the permutation mask is complemented with respect to 31. This is
  // necessary to produce proper semantics with the big-endian-based vperm
  // instruction.
  EVT EltVT = V1.getValueType().getVectorElementType();
  unsigned BytesPerElement = EltVT.getSizeInBits() / 8;

  bool V1HasXXSWAPD = V1->getOperand(Num: 0)->getOpcode() == PPCISD::XXSWAPD;
  bool V2HasXXSWAPD = V2->getOperand(Num: 0)->getOpcode() == PPCISD::XXSWAPD;

  /*
  Vectors will be appended like so: [ V1 | v2 ]
  XXSWAPD on V1:
  [   A   |   B   |   C   |   D   ] -> [   C   |   D   |   A   |   B   ]
     0-3     4-7     8-11   12-15         0-3     4-7     8-11   12-15
  XXSWAPD on V2:
  [   E   |   F   |   G   |   H   ] -> [   G   |   H   |   E   |   F   ]
    16-19   20-23   24-27   28-31        16-19   20-23   24-27   28-31
  i.e. index of A, B += 8, and index of C, D -= 8.
  i.e. index of E, F += 8, index of G, H -= 8
  Swap V1 and V2:
  [   V1   |   V2  ] -> [   V2   |   V1   ]
     0-15     16-31        0-15     16-31
  i.e. index of V1 += 16, index of V2 -= 16
  */

  // Expand each element index in PermMask into BytesPerElement byte indices,
  // adjusting for any XXSWAPD that will be folded away below and for the
  // operand swap done above, and (on LE) complementing with respect to 31.
  SmallVector<SDValue, 16> ResultMask;
  for (unsigned i = 0, e = VT.getVectorNumElements(); i != e; ++i) {
    // Undef mask entries (< 0) are mapped to element 0; any source byte is
    // acceptable for them.
    unsigned SrcElt = PermMask[i] < 0 ? 0 : PermMask[i];

    if (V1HasXXSWAPD) {
      if (SrcElt < 8)
        SrcElt += 8;
      else if (SrcElt < 16)
        SrcElt -= 8;
    }
    if (V2HasXXSWAPD) {
      if (SrcElt > 23)
        SrcElt -= 8;
      else if (SrcElt > 15)
        SrcElt += 8;
    }
    if (NeedSwap) {
      if (SrcElt < 16)
        SrcElt += 16;
      else
        SrcElt -= 16;
    }
    for (unsigned j = 0; j != BytesPerElement; ++j)
      if (isLittleEndian)
        ResultMask.push_back(
            Elt: DAG.getConstant(Val: 31 - (SrcElt * BytesPerElement + j), DL: dl, VT: MVT::i32));
      else
        ResultMask.push_back(
            Elt: DAG.getConstant(Val: SrcElt * BytesPerElement + j, DL: dl, VT: MVT::i32));
  }

  // The swaps have been compensated for in the mask, so bypass the XXSWAPD
  // nodes and feed their inputs directly to the permute.
  if (V1HasXXSWAPD) {
    dl = SDLoc(V1->getOperand(Num: 0));
    V1 = V1->getOperand(Num: 0)->getOperand(Num: 1);
  }
  if (V2HasXXSWAPD) {
    dl = SDLoc(V2->getOperand(Num: 0));
    V2 = V2->getOperand(Num: 0)->getOperand(Num: 1);
  }

  if (isPPC64 && (V1HasXXSWAPD || V2HasXXSWAPD)) {
    if (ValType != MVT::v2f64)
      V1 = DAG.getBitcast(VT: MVT::v2f64, V: V1);
    if (V2.getValueType() != MVT::v2f64)
      V2 = DAG.getBitcast(VT: MVT::v2f64, V: V2);
  }

  ShufflesHandledWithVPERM++;
  SDValue VPermMask = DAG.getBuildVector(VT: MVT::v16i8, DL: dl, Ops: ResultMask);
  LLVM_DEBUG({
    ShuffleVectorSDNode *SVOp = cast<ShuffleVectorSDNode>(Op);
    if (Opcode == PPCISD::XXPERM) {
      dbgs() << "Emitting a XXPERM for the following shuffle:\n";
    } else {
      dbgs() << "Emitting a VPERM for the following shuffle:\n";
    }
    SVOp->dump();
    dbgs() << "With the following permute control vector:\n";
    VPermMask.dump();
  });

  if (Opcode == PPCISD::XXPERM)
    VPermMask = DAG.getBitcast(VT: MVT::v4i32, V: VPermMask);

  // Only need to place items backwards in LE,
  // the mask was properly calculated.
  if (isLittleEndian)
    std::swap(a&: V1, b&: V2);

  SDValue VPERMNode =
      DAG.getNode(Opcode, DL: dl, VT: V1.getValueType(), N1: V1, N2: V2, N3: VPermMask);

  VPERMNode = DAG.getBitcast(VT: ValType, V: VPERMNode);
  return VPERMNode;
}
10735
/// getVectorCompareInfo - Given an intrinsic, return false if it is not a
/// vector comparison. If it is, return true and fill in Opc/isDot with
/// information about the intrinsic.
///
/// \param Intrin     the INTRINSIC_WO_CHAIN node; operand 0 is the intrinsic
///                   id, the remaining operands are the comparison arguments.
/// \param CompareOpc [out] the numeric sub-opcode of the VCMP*/XVCMP*
///                   instruction implementing the comparison (placed in the
///                   constant operand of the PPCISD::VCMP / VCMP_rec node);
///                   left as -1 when the intrinsic is rejected.
/// \param isDot      [out] set for the predicate ("_p", record-form) variants
///                   that additionally set CR6.
/// \param Subtarget  used to reject comparisons the target ISA lacks, in which
///                   case the function returns false and the intrinsic is not
///                   custom lowered.
static bool getVectorCompareInfo(SDValue Intrin, int &CompareOpc,
                                 bool &isDot, const PPCSubtarget &Subtarget) {
  unsigned IntrinsicID = Intrin.getConstantOperandVal(i: 0);
  CompareOpc = -1;
  isDot = false;
  switch (IntrinsicID) {
  default:
    return false;
  // Comparison predicates.
  case Intrinsic::ppc_altivec_vcmpbfp_p:
    CompareOpc = 966;
    isDot = true;
    break;
  case Intrinsic::ppc_altivec_vcmpeqfp_p:
    CompareOpc = 198;
    isDot = true;
    break;
  case Intrinsic::ppc_altivec_vcmpequb_p:
    CompareOpc = 6;
    isDot = true;
    break;
  case Intrinsic::ppc_altivec_vcmpequh_p:
    CompareOpc = 70;
    isDot = true;
    break;
  case Intrinsic::ppc_altivec_vcmpequw_p:
    CompareOpc = 134;
    isDot = true;
    break;
  // Doubleword compares require VSX or POWER8 Altivec.
  case Intrinsic::ppc_altivec_vcmpequd_p:
    if (Subtarget.hasVSX() || Subtarget.hasP8Altivec()) {
      CompareOpc = 199;
      isDot = true;
    } else
      return false;
    break;
  // The not-equal (and not-equal-or-zero) predicates are POWER9 additions.
  case Intrinsic::ppc_altivec_vcmpneb_p:
  case Intrinsic::ppc_altivec_vcmpneh_p:
  case Intrinsic::ppc_altivec_vcmpnew_p:
  case Intrinsic::ppc_altivec_vcmpnezb_p:
  case Intrinsic::ppc_altivec_vcmpnezh_p:
  case Intrinsic::ppc_altivec_vcmpnezw_p:
    if (Subtarget.hasP9Altivec()) {
      switch (IntrinsicID) {
      default:
        llvm_unreachable("Unknown comparison intrinsic.");
      case Intrinsic::ppc_altivec_vcmpneb_p:
        CompareOpc = 7;
        break;
      case Intrinsic::ppc_altivec_vcmpneh_p:
        CompareOpc = 71;
        break;
      case Intrinsic::ppc_altivec_vcmpnew_p:
        CompareOpc = 135;
        break;
      case Intrinsic::ppc_altivec_vcmpnezb_p:
        CompareOpc = 263;
        break;
      case Intrinsic::ppc_altivec_vcmpnezh_p:
        CompareOpc = 327;
        break;
      case Intrinsic::ppc_altivec_vcmpnezw_p:
        CompareOpc = 391;
        break;
      }
      isDot = true;
    } else
      return false;
    break;
  case Intrinsic::ppc_altivec_vcmpgefp_p:
    CompareOpc = 454;
    isDot = true;
    break;
  case Intrinsic::ppc_altivec_vcmpgtfp_p:
    CompareOpc = 710;
    isDot = true;
    break;
  case Intrinsic::ppc_altivec_vcmpgtsb_p:
    CompareOpc = 774;
    isDot = true;
    break;
  case Intrinsic::ppc_altivec_vcmpgtsh_p:
    CompareOpc = 838;
    isDot = true;
    break;
  case Intrinsic::ppc_altivec_vcmpgtsw_p:
    CompareOpc = 902;
    isDot = true;
    break;
  // Doubleword compares require VSX or POWER8 Altivec.
  case Intrinsic::ppc_altivec_vcmpgtsd_p:
    if (Subtarget.hasVSX() || Subtarget.hasP8Altivec()) {
      CompareOpc = 967;
      isDot = true;
    } else
      return false;
    break;
  case Intrinsic::ppc_altivec_vcmpgtub_p:
    CompareOpc = 518;
    isDot = true;
    break;
  case Intrinsic::ppc_altivec_vcmpgtuh_p:
    CompareOpc = 582;
    isDot = true;
    break;
  case Intrinsic::ppc_altivec_vcmpgtuw_p:
    CompareOpc = 646;
    isDot = true;
    break;
  // Doubleword compares require VSX or POWER8 Altivec.
  case Intrinsic::ppc_altivec_vcmpgtud_p:
    if (Subtarget.hasVSX() || Subtarget.hasP8Altivec()) {
      CompareOpc = 711;
      isDot = true;
    } else
      return false;
    break;

  // Quadword compares (non-dot forms) are ISA 3.1 (Power10) additions.
  case Intrinsic::ppc_altivec_vcmpequq:
  case Intrinsic::ppc_altivec_vcmpgtsq:
  case Intrinsic::ppc_altivec_vcmpgtuq:
    if (!Subtarget.isISA3_1())
      return false;
    switch (IntrinsicID) {
    default:
      llvm_unreachable("Unknown comparison intrinsic.");
    case Intrinsic::ppc_altivec_vcmpequq:
      CompareOpc = 455;
      break;
    case Intrinsic::ppc_altivec_vcmpgtsq:
      CompareOpc = 903;
      break;
    case Intrinsic::ppc_altivec_vcmpgtuq:
      CompareOpc = 647;
      break;
    }
    break;

  // VSX predicate comparisons use the same infrastructure
  case Intrinsic::ppc_vsx_xvcmpeqdp_p:
  case Intrinsic::ppc_vsx_xvcmpgedp_p:
  case Intrinsic::ppc_vsx_xvcmpgtdp_p:
  case Intrinsic::ppc_vsx_xvcmpeqsp_p:
  case Intrinsic::ppc_vsx_xvcmpgesp_p:
  case Intrinsic::ppc_vsx_xvcmpgtsp_p:
    if (Subtarget.hasVSX()) {
      switch (IntrinsicID) {
      case Intrinsic::ppc_vsx_xvcmpeqdp_p:
        CompareOpc = 99;
        break;
      case Intrinsic::ppc_vsx_xvcmpgedp_p:
        CompareOpc = 115;
        break;
      case Intrinsic::ppc_vsx_xvcmpgtdp_p:
        CompareOpc = 107;
        break;
      case Intrinsic::ppc_vsx_xvcmpeqsp_p:
        CompareOpc = 67;
        break;
      case Intrinsic::ppc_vsx_xvcmpgesp_p:
        CompareOpc = 83;
        break;
      case Intrinsic::ppc_vsx_xvcmpgtsp_p:
        CompareOpc = 75;
        break;
      }
      isDot = true;
    } else
      return false;
    break;

  // Normal Comparisons.
  case Intrinsic::ppc_altivec_vcmpbfp:
    CompareOpc = 966;
    break;
  case Intrinsic::ppc_altivec_vcmpeqfp:
    CompareOpc = 198;
    break;
  case Intrinsic::ppc_altivec_vcmpequb:
    CompareOpc = 6;
    break;
  case Intrinsic::ppc_altivec_vcmpequh:
    CompareOpc = 70;
    break;
  case Intrinsic::ppc_altivec_vcmpequw:
    CompareOpc = 134;
    break;
  case Intrinsic::ppc_altivec_vcmpequd:
    if (Subtarget.hasP8Altivec())
      CompareOpc = 199;
    else
      return false;
    break;
  case Intrinsic::ppc_altivec_vcmpneb:
  case Intrinsic::ppc_altivec_vcmpneh:
  case Intrinsic::ppc_altivec_vcmpnew:
  case Intrinsic::ppc_altivec_vcmpnezb:
  case Intrinsic::ppc_altivec_vcmpnezh:
  case Intrinsic::ppc_altivec_vcmpnezw:
    if (Subtarget.hasP9Altivec())
      switch (IntrinsicID) {
      default:
        llvm_unreachable("Unknown comparison intrinsic.");
      case Intrinsic::ppc_altivec_vcmpneb:
        CompareOpc = 7;
        break;
      case Intrinsic::ppc_altivec_vcmpneh:
        CompareOpc = 71;
        break;
      case Intrinsic::ppc_altivec_vcmpnew:
        CompareOpc = 135;
        break;
      case Intrinsic::ppc_altivec_vcmpnezb:
        CompareOpc = 263;
        break;
      case Intrinsic::ppc_altivec_vcmpnezh:
        CompareOpc = 327;
        break;
      case Intrinsic::ppc_altivec_vcmpnezw:
        CompareOpc = 391;
        break;
      }
    else
      return false;
    break;
  case Intrinsic::ppc_altivec_vcmpgefp:
    CompareOpc = 454;
    break;
  case Intrinsic::ppc_altivec_vcmpgtfp:
    CompareOpc = 710;
    break;
  case Intrinsic::ppc_altivec_vcmpgtsb:
    CompareOpc = 774;
    break;
  case Intrinsic::ppc_altivec_vcmpgtsh:
    CompareOpc = 838;
    break;
  case Intrinsic::ppc_altivec_vcmpgtsw:
    CompareOpc = 902;
    break;
  case Intrinsic::ppc_altivec_vcmpgtsd:
    if (Subtarget.hasP8Altivec())
      CompareOpc = 967;
    else
      return false;
    break;
  case Intrinsic::ppc_altivec_vcmpgtub:
    CompareOpc = 518;
    break;
  case Intrinsic::ppc_altivec_vcmpgtuh:
    CompareOpc = 582;
    break;
  case Intrinsic::ppc_altivec_vcmpgtuw:
    CompareOpc = 646;
    break;
  case Intrinsic::ppc_altivec_vcmpgtud:
    if (Subtarget.hasP8Altivec())
      CompareOpc = 711;
    else
      return false;
    break;
  // Quadword compare predicates are ISA 3.1 (Power10) additions.
  case Intrinsic::ppc_altivec_vcmpequq_p:
  case Intrinsic::ppc_altivec_vcmpgtsq_p:
  case Intrinsic::ppc_altivec_vcmpgtuq_p:
    if (!Subtarget.isISA3_1())
      return false;
    switch (IntrinsicID) {
    default:
      llvm_unreachable("Unknown comparison intrinsic.");
    case Intrinsic::ppc_altivec_vcmpequq_p:
      CompareOpc = 455;
      break;
    case Intrinsic::ppc_altivec_vcmpgtsq_p:
      CompareOpc = 903;
      break;
    case Intrinsic::ppc_altivec_vcmpgtuq_p:
      CompareOpc = 647;
      break;
    }
    isDot = true;
    break;
  }
  return true;
}
11021
11022/// LowerINTRINSIC_WO_CHAIN - If this is an intrinsic that we want to custom
11023/// lower, do it, otherwise return null.
11024SDValue PPCTargetLowering::LowerINTRINSIC_WO_CHAIN(SDValue Op,
11025 SelectionDAG &DAG) const {
11026 unsigned IntrinsicID = Op.getConstantOperandVal(i: 0);
11027
11028 SDLoc dl(Op);
11029 // Note: BCD instructions expect the immediate operand in vector form (v4i32),
11030 // but the builtin provides it as a scalar. To satisfy the instruction
11031 // encoding, we splat the scalar across all lanes using SPLAT_VECTOR.
11032 auto MapNodeWithSplatVector =
11033 [&](unsigned Opcode,
11034 std::initializer_list<SDValue> ExtraOps = {}) -> SDValue {
11035 SDValue SplatVal =
11036 DAG.getNode(Opcode: ISD::SPLAT_VECTOR, DL: dl, VT: MVT::v4i32, Operand: Op.getOperand(i: 2));
11037
11038 SmallVector<SDValue, 4> Ops{SplatVal, Op.getOperand(i: 1)};
11039 Ops.append(in_start: ExtraOps.begin(), in_end: ExtraOps.end());
11040 return DAG.getNode(Opcode, DL: dl, VT: MVT::v16i8, Ops);
11041 };
11042
11043 switch (IntrinsicID) {
11044 case Intrinsic::thread_pointer:
11045 // Reads the thread pointer register, used for __builtin_thread_pointer.
11046 if (Subtarget.isPPC64())
11047 return DAG.getRegister(Reg: PPC::X13, VT: MVT::i64);
11048 return DAG.getRegister(Reg: PPC::R2, VT: MVT::i32);
11049
11050 case Intrinsic::ppc_rldimi: {
11051 assert(Subtarget.isPPC64() && "rldimi is only available in 64-bit!");
11052 SDValue Src = Op.getOperand(i: 1);
11053 APInt Mask = Op.getConstantOperandAPInt(i: 4);
11054 if (Mask.isZero())
11055 return Op.getOperand(i: 2);
11056 if (Mask.isAllOnes())
11057 return DAG.getNode(Opcode: ISD::ROTL, DL: dl, VT: MVT::i64, N1: Src, N2: Op.getOperand(i: 3));
11058 uint64_t SH = Op.getConstantOperandVal(i: 3);
11059 unsigned MB = 0, ME = 0;
11060 if (!isRunOfOnes64(Val: Mask.getZExtValue(), MB, ME))
11061 report_fatal_error(reason: "invalid rldimi mask!");
11062 // rldimi requires ME=63-SH, otherwise rotation is needed before rldimi.
11063 if (ME < 63 - SH) {
11064 Src = DAG.getNode(Opcode: ISD::ROTL, DL: dl, VT: MVT::i64, N1: Src,
11065 N2: DAG.getConstant(Val: ME + SH + 1, DL: dl, VT: MVT::i32));
11066 } else if (ME > 63 - SH) {
11067 Src = DAG.getNode(Opcode: ISD::ROTL, DL: dl, VT: MVT::i64, N1: Src,
11068 N2: DAG.getConstant(Val: ME + SH - 63, DL: dl, VT: MVT::i32));
11069 }
11070 return SDValue(
11071 DAG.getMachineNode(Opcode: PPC::RLDIMI, dl, VT: MVT::i64,
11072 Ops: {Op.getOperand(i: 2), Src,
11073 DAG.getTargetConstant(Val: 63 - ME, DL: dl, VT: MVT::i32),
11074 DAG.getTargetConstant(Val: MB, DL: dl, VT: MVT::i32)}),
11075 0);
11076 }
11077
11078 case Intrinsic::ppc_rlwimi: {
11079 APInt Mask = Op.getConstantOperandAPInt(i: 4);
11080 if (Mask.isZero())
11081 return Op.getOperand(i: 2);
11082 if (Mask.isAllOnes())
11083 return DAG.getNode(Opcode: ISD::ROTL, DL: dl, VT: MVT::i32, N1: Op.getOperand(i: 1),
11084 N2: Op.getOperand(i: 3));
11085 unsigned MB = 0, ME = 0;
11086 if (!isRunOfOnes(Val: Mask.getZExtValue(), MB, ME))
11087 report_fatal_error(reason: "invalid rlwimi mask!");
11088 return SDValue(DAG.getMachineNode(
11089 Opcode: PPC::RLWIMI, dl, VT: MVT::i32,
11090 Ops: {Op.getOperand(i: 2), Op.getOperand(i: 1), Op.getOperand(i: 3),
11091 DAG.getTargetConstant(Val: MB, DL: dl, VT: MVT::i32),
11092 DAG.getTargetConstant(Val: ME, DL: dl, VT: MVT::i32)}),
11093 0);
11094 }
11095
11096 case Intrinsic::ppc_bcdshift:
11097 return MapNodeWithSplatVector(PPCISD::BCDSHIFT, {Op.getOperand(i: 3)});
11098 case Intrinsic::ppc_bcdshiftround:
11099 return MapNodeWithSplatVector(PPCISD::BCDSHIFTROUND, {Op.getOperand(i: 3)});
11100 case Intrinsic::ppc_bcdtruncate:
11101 return MapNodeWithSplatVector(PPCISD::BCDTRUNC, {Op.getOperand(i: 3)});
11102 case Intrinsic::ppc_bcdunsignedtruncate:
11103 return MapNodeWithSplatVector(PPCISD::BCDUTRUNC);
11104 case Intrinsic::ppc_bcdunsignedshift:
11105 return MapNodeWithSplatVector(PPCISD::BCDUSHIFT);
11106
11107 case Intrinsic::ppc_rlwnm: {
11108 if (Op.getConstantOperandVal(i: 3) == 0)
11109 return DAG.getConstant(Val: 0, DL: dl, VT: MVT::i32);
11110 unsigned MB = 0, ME = 0;
11111 if (!isRunOfOnes(Val: Op.getConstantOperandVal(i: 3), MB, ME))
11112 report_fatal_error(reason: "invalid rlwnm mask!");
11113 return SDValue(
11114 DAG.getMachineNode(Opcode: PPC::RLWNM, dl, VT: MVT::i32,
11115 Ops: {Op.getOperand(i: 1), Op.getOperand(i: 2),
11116 DAG.getTargetConstant(Val: MB, DL: dl, VT: MVT::i32),
11117 DAG.getTargetConstant(Val: ME, DL: dl, VT: MVT::i32)}),
11118 0);
11119 }
11120
11121 case Intrinsic::ppc_mma_disassemble_acc: {
11122 if (Subtarget.isISAFuture()) {
11123 EVT ReturnTypes[] = {MVT::v256i1, MVT::v256i1};
11124 SDValue WideVec =
11125 SDValue(DAG.getMachineNode(Opcode: PPC::DMXXEXTFDMR512, dl, ResultTys: ReturnTypes,
11126 Ops: Op.getOperand(i: 1)),
11127 0);
11128 SmallVector<SDValue, 4> RetOps;
11129 SDValue Value = SDValue(WideVec.getNode(), 0);
11130 SDValue Value2 = SDValue(WideVec.getNode(), 1);
11131
11132 SDValue Extract;
11133 Extract = DAG.getNode(
11134 Opcode: PPCISD::EXTRACT_VSX_REG, DL: dl, VT: MVT::v16i8,
11135 N1: Subtarget.isLittleEndian() ? Value2 : Value,
11136 N2: DAG.getConstant(Val: Subtarget.isLittleEndian() ? 1 : 0,
11137 DL: dl, VT: getPointerTy(DL: DAG.getDataLayout())));
11138 RetOps.push_back(Elt: Extract);
11139 Extract = DAG.getNode(
11140 Opcode: PPCISD::EXTRACT_VSX_REG, DL: dl, VT: MVT::v16i8,
11141 N1: Subtarget.isLittleEndian() ? Value2 : Value,
11142 N2: DAG.getConstant(Val: Subtarget.isLittleEndian() ? 0 : 1,
11143 DL: dl, VT: getPointerTy(DL: DAG.getDataLayout())));
11144 RetOps.push_back(Elt: Extract);
11145 Extract = DAG.getNode(
11146 Opcode: PPCISD::EXTRACT_VSX_REG, DL: dl, VT: MVT::v16i8,
11147 N1: Subtarget.isLittleEndian() ? Value : Value2,
11148 N2: DAG.getConstant(Val: Subtarget.isLittleEndian() ? 1 : 0,
11149 DL: dl, VT: getPointerTy(DL: DAG.getDataLayout())));
11150 RetOps.push_back(Elt: Extract);
11151 Extract = DAG.getNode(
11152 Opcode: PPCISD::EXTRACT_VSX_REG, DL: dl, VT: MVT::v16i8,
11153 N1: Subtarget.isLittleEndian() ? Value : Value2,
11154 N2: DAG.getConstant(Val: Subtarget.isLittleEndian() ? 0 : 1,
11155 DL: dl, VT: getPointerTy(DL: DAG.getDataLayout())));
11156 RetOps.push_back(Elt: Extract);
11157 return DAG.getMergeValues(Ops: RetOps, dl);
11158 }
11159 [[fallthrough]];
11160 }
11161 case Intrinsic::ppc_vsx_disassemble_pair: {
11162 int NumVecs = 2;
11163 SDValue WideVec = Op.getOperand(i: 1);
11164 if (IntrinsicID == Intrinsic::ppc_mma_disassemble_acc) {
11165 NumVecs = 4;
11166 WideVec = DAG.getNode(Opcode: PPCISD::XXMFACC, DL: dl, VT: MVT::v512i1, Operand: WideVec);
11167 }
11168 SmallVector<SDValue, 4> RetOps;
11169 for (int VecNo = 0; VecNo < NumVecs; VecNo++) {
11170 SDValue Extract = DAG.getNode(
11171 Opcode: PPCISD::EXTRACT_VSX_REG, DL: dl, VT: MVT::v16i8, N1: WideVec,
11172 N2: DAG.getConstant(Val: Subtarget.isLittleEndian() ? NumVecs - 1 - VecNo
11173 : VecNo,
11174 DL: dl, VT: getPointerTy(DL: DAG.getDataLayout())));
11175 RetOps.push_back(Elt: Extract);
11176 }
11177 return DAG.getMergeValues(Ops: RetOps, dl);
11178 }
11179
11180 case Intrinsic::ppc_mma_build_dmr: {
11181 SmallVector<SDValue, 8> Pairs;
11182 SmallVector<SDValue, 8> Chains;
11183 for (int i = 1; i < 9; i += 2) {
11184 SDValue Hi = Op.getOperand(i);
11185 SDValue Lo = Op.getOperand(i: i + 1);
11186 if (Hi->getOpcode() == ISD::LOAD)
11187 Chains.push_back(Elt: Hi.getValue(R: 1));
11188 if (Lo->getOpcode() == ISD::LOAD)
11189 Chains.push_back(Elt: Lo.getValue(R: 1));
11190 Pairs.push_back(
11191 Elt: DAG.getNode(Opcode: PPCISD::PAIR_BUILD, DL: dl, VT: MVT::v256i1, Ops: {Hi, Lo}));
11192 }
11193 SDValue TF = DAG.getNode(Opcode: ISD::TokenFactor, DL: dl, VT: MVT::Other, Ops: Chains);
11194 SDValue Value = DMFInsert1024(Pairs, dl: SDLoc(Op), DAG);
11195 return DAG.getMergeValues(Ops: {Value, TF}, dl);
11196 }
11197
11198 case Intrinsic::ppc_mma_dmxxextfdmr512: {
11199 assert(Subtarget.isISAFuture() && "dmxxextfdmr512 requires ISA Future");
11200 auto *Idx = dyn_cast<ConstantSDNode>(Val: Op.getOperand(i: 2));
11201 assert(Idx && (Idx->getSExtValue() == 0 || Idx->getSExtValue() == 1) &&
11202 "Specify P of 0 or 1 for lower or upper 512 bytes");
11203 unsigned HiLo = Idx->getSExtValue();
11204 unsigned Opcode;
11205 unsigned Subx;
11206 if (HiLo == 0) {
11207 Opcode = PPC::DMXXEXTFDMR512;
11208 Subx = PPC::sub_wacc_lo;
11209 } else {
11210 Opcode = PPC::DMXXEXTFDMR512_HI;
11211 Subx = PPC::sub_wacc_hi;
11212 }
11213 SDValue Subreg(
11214 DAG.getMachineNode(Opcode: TargetOpcode::EXTRACT_SUBREG, dl, VT: MVT::v512i1,
11215 Op1: Op.getOperand(i: 1),
11216 Op2: DAG.getTargetConstant(Val: Subx, DL: dl, VT: MVT::i32)),
11217 0);
11218 EVT ReturnTypes[] = {MVT::v256i1, MVT::v256i1};
11219 return SDValue(DAG.getMachineNode(Opcode, dl, ResultTys: ReturnTypes, Ops: Subreg), 0);
11220 }
11221
11222 case Intrinsic::ppc_mma_dmxxextfdmr256: {
11223 assert(Subtarget.isISAFuture() && "dmxxextfdmr256 requires ISA Future");
11224 auto *Idx = dyn_cast<ConstantSDNode>(Val: Op.getOperand(i: 2));
11225 assert(Idx && (Idx->getSExtValue() >= 0 || Idx->getSExtValue() <= 3) &&
11226 "Specify a dmr row pair 0-3");
11227 unsigned IdxVal = Idx->getSExtValue();
11228 unsigned Subx;
11229 switch (IdxVal) {
11230 case 0:
11231 Subx = PPC::sub_dmrrowp0;
11232 break;
11233 case 1:
11234 Subx = PPC::sub_dmrrowp1;
11235 break;
11236 case 2:
11237 Subx = PPC::sub_wacc_hi_then_sub_dmrrowp0;
11238 break;
11239 case 3:
11240 Subx = PPC::sub_wacc_hi_then_sub_dmrrowp1;
11241 break;
11242 }
11243 SDValue Subreg(
11244 DAG.getMachineNode(Opcode: TargetOpcode::EXTRACT_SUBREG, dl, VT: MVT::v256i1,
11245 Op1: Op.getOperand(i: 1),
11246 Op2: DAG.getTargetConstant(Val: Subx, DL: dl, VT: MVT::i32)),
11247 0);
11248 SDValue P = DAG.getTargetConstant(Val: IdxVal, DL: dl, VT: MVT::i32);
11249 return SDValue(
11250 DAG.getMachineNode(Opcode: PPC::DMXXEXTFDMR256, dl, VT: MVT::v256i1, Ops: {Subreg, P}),
11251 0);
11252 }
11253
11254 case Intrinsic::ppc_mma_dmxxinstdmr512: {
11255 assert(Subtarget.isISAFuture() && "dmxxinstdmr512 requires ISA Future");
11256 auto *Idx = dyn_cast<ConstantSDNode>(Val: Op.getOperand(i: 4));
11257 assert(Idx && (Idx->getSExtValue() == 0 || Idx->getSExtValue() == 1) &&
11258 "Specify P of 0 or 1 for lower or upper 512 bytes");
11259 unsigned HiLo = Idx->getSExtValue();
11260 unsigned Opcode;
11261 unsigned Subx;
11262 if (HiLo == 0) {
11263 Opcode = PPCISD::INST512;
11264 Subx = PPC::sub_wacc_lo;
11265 } else {
11266 Opcode = PPCISD::INST512HI;
11267 Subx = PPC::sub_wacc_hi;
11268 }
11269 SDValue Wacc = DAG.getNode(Opcode, DL: dl, VT: MVT::v512i1, N1: Op.getOperand(i: 2),
11270 N2: Op.getOperand(i: 3));
11271 SDValue SubReg = DAG.getTargetConstant(Val: Subx, DL: dl, VT: MVT::i32);
11272 return SDValue(DAG.getMachineNode(Opcode: PPC::INSERT_SUBREG, dl, VT: MVT::v1024i1,
11273 Op1: Op.getOperand(i: 1), Op2: Wacc, Op3: SubReg),
11274 0);
11275 }
11276
11277 case Intrinsic::ppc_mma_dmxxinstdmr256: {
11278 assert(Subtarget.isISAFuture() && "dmxxinstdmr256 requires ISA Future");
11279 auto *Idx = dyn_cast<ConstantSDNode>(Val: Op.getOperand(i: 3));
11280 assert(Idx && (Idx->getSExtValue() >= 0 || Idx->getSExtValue() <= 3) &&
11281 "Specify a dmr row pair 0-3");
11282 unsigned IdxVal = Idx->getSExtValue();
11283 unsigned Subx;
11284 switch (IdxVal) {
11285 case 0:
11286 Subx = PPC::sub_dmrrowp0;
11287 break;
11288 case 1:
11289 Subx = PPC::sub_dmrrowp1;
11290 break;
11291 case 2:
11292 Subx = PPC::sub_wacc_hi_then_sub_dmrrowp0;
11293 break;
11294 case 3:
11295 Subx = PPC::sub_wacc_hi_then_sub_dmrrowp1;
11296 break;
11297 }
11298 SDValue SubReg = DAG.getTargetConstant(Val: Subx, DL: dl, VT: MVT::i32);
11299 SDValue P = DAG.getTargetConstant(Val: IdxVal, DL: dl, VT: MVT::i32);
11300 SDValue DMRRowp =
11301 DAG.getNode(Opcode: PPCISD::INST256, DL: dl, VT: MVT::v256i1, N1: Op.getOperand(i: 2), N2: P);
11302 return SDValue(DAG.getMachineNode(Opcode: PPC::INSERT_SUBREG, dl, VT: MVT::v1024i1,
11303 Op1: Op.getOperand(i: 1), Op2: DMRRowp, Op3: SubReg),
11304 0);
11305 }
11306
11307 case Intrinsic::ppc_mma_xxmfacc:
11308 case Intrinsic::ppc_mma_xxmtacc: {
11309 // Allow pre-isa-future subtargets to lower as normal.
11310 if (!Subtarget.isISAFuture())
11311 return SDValue();
11312 // The intrinsics for xxmtacc and xxmfacc take one argument of
11313 // type v512i1, for future cpu the corresponding wacc instruction
11314 // dmxx[inst|extf]dmr512 is always generated for type v512i1, negating
11315 // the need to produce the xxm[t|f]acc.
11316 SDValue WideVec = Op.getOperand(i: 1);
11317 DAG.ReplaceAllUsesWith(From: Op, To: WideVec);
11318 return SDValue();
11319 }
11320
11321 case Intrinsic::ppc_unpack_longdouble: {
11322 auto *Idx = dyn_cast<ConstantSDNode>(Val: Op.getOperand(i: 2));
11323 assert(Idx && (Idx->getSExtValue() == 0 || Idx->getSExtValue() == 1) &&
11324 "Argument of long double unpack must be 0 or 1!");
11325 return DAG.getNode(Opcode: ISD::EXTRACT_ELEMENT, DL: dl, VT: MVT::f64, N1: Op.getOperand(i: 1),
11326 N2: DAG.getConstant(Val: !!(Idx->getSExtValue()), DL: dl,
11327 VT: Idx->getValueType(ResNo: 0)));
11328 }
11329
11330 case Intrinsic::ppc_compare_exp_lt:
11331 case Intrinsic::ppc_compare_exp_gt:
11332 case Intrinsic::ppc_compare_exp_eq:
11333 case Intrinsic::ppc_compare_exp_uo: {
11334 unsigned Pred;
11335 switch (IntrinsicID) {
11336 case Intrinsic::ppc_compare_exp_lt:
11337 Pred = PPC::PRED_LT;
11338 break;
11339 case Intrinsic::ppc_compare_exp_gt:
11340 Pred = PPC::PRED_GT;
11341 break;
11342 case Intrinsic::ppc_compare_exp_eq:
11343 Pred = PPC::PRED_EQ;
11344 break;
11345 case Intrinsic::ppc_compare_exp_uo:
11346 Pred = PPC::PRED_UN;
11347 break;
11348 }
11349 return SDValue(
11350 DAG.getMachineNode(
11351 Opcode: PPC::SELECT_CC_I4, dl, VT: MVT::i32,
11352 Ops: {SDValue(DAG.getMachineNode(Opcode: PPC::XSCMPEXPDP, dl, VT: MVT::i32,
11353 Op1: Op.getOperand(i: 1), Op2: Op.getOperand(i: 2)),
11354 0),
11355 DAG.getConstant(Val: 1, DL: dl, VT: MVT::i32), DAG.getConstant(Val: 0, DL: dl, VT: MVT::i32),
11356 DAG.getTargetConstant(Val: Pred, DL: dl, VT: MVT::i32)}),
11357 0);
11358 }
11359 case Intrinsic::ppc_test_data_class: {
11360 EVT OpVT = Op.getOperand(i: 1).getValueType();
11361 unsigned CmprOpc = OpVT == MVT::f128 ? PPC::XSTSTDCQP
11362 : (OpVT == MVT::f64 ? PPC::XSTSTDCDP
11363 : PPC::XSTSTDCSP);
11364 // Lower __builtin_ppc_test_data_class(value, mask) to XSTSTDC* instruction.
11365 // The XSTSTDC* instructions test if a floating-point value matches any of
11366 // the data classes specified in the mask, setting CR field bits
11367 // accordingly. We need to extract the EQ bit (bit 2) from the CR field and
11368 // convert it to an integer result (1 if match, 0 if no match).
11369 //
11370 // Note: Operands are swapped because XSTSTDC* expects (mask, value) but the
11371 // intrinsic provides (value, mask) as Op.getOperand(1) and
11372 // Op.getOperand(2).
11373 SDValue TestDataClass =
11374 SDValue(DAG.getMachineNode(Opcode: CmprOpc, dl, VT: MVT::i32,
11375 Ops: {Op.getOperand(i: 2), Op.getOperand(i: 1)}),
11376 0);
11377 if (Subtarget.isISA3_1()) {
11378 // ISA 3.1+: Use SETBC instruction to directly convert CR bit to integer.
11379 // This is more efficient than the SELECT_CC approach used in earlier
11380 // ISAs.
11381 SDValue SubRegIdx = DAG.getTargetConstant(Val: PPC::sub_eq, DL: dl, VT: MVT::i32);
11382 SDValue CRBit =
11383 SDValue(DAG.getMachineNode(Opcode: TargetOpcode::EXTRACT_SUBREG, dl, VT: MVT::i1,
11384 Op1: TestDataClass, Op2: SubRegIdx),
11385 0);
11386
11387 return DAG.getNode(Opcode: PPCISD::SETBC, DL: dl, VT: MVT::i32, Operand: CRBit);
11388 }
11389
11390 // Pre-ISA 3.1: Use SELECT_CC to convert CR field to integer (1 or 0).
11391 return SDValue(
11392 DAG.getMachineNode(Opcode: PPC::SELECT_CC_I4, dl, VT: MVT::i32,
11393 Ops: {TestDataClass, DAG.getConstant(Val: 1, DL: dl, VT: MVT::i32),
11394 DAG.getConstant(Val: 0, DL: dl, VT: MVT::i32),
11395 DAG.getTargetConstant(Val: PPC::PRED_EQ, DL: dl, VT: MVT::i32)}),
11396 0);
11397 }
11398 case Intrinsic::ppc_fnmsub: {
11399 EVT VT = Op.getOperand(i: 1).getValueType();
11400 if (!Subtarget.hasVSX() || (!Subtarget.hasFloat128() && VT == MVT::f128))
11401 return DAG.getNode(
11402 Opcode: ISD::FNEG, DL: dl, VT,
11403 Operand: DAG.getNode(Opcode: ISD::FMA, DL: dl, VT, N1: Op.getOperand(i: 1), N2: Op.getOperand(i: 2),
11404 N3: DAG.getNode(Opcode: ISD::FNEG, DL: dl, VT, Operand: Op.getOperand(i: 3))));
11405 return DAG.getNode(Opcode: PPCISD::FNMSUB, DL: dl, VT, N1: Op.getOperand(i: 1),
11406 N2: Op.getOperand(i: 2), N3: Op.getOperand(i: 3));
11407 }
11408 case Intrinsic::ppc_convert_f128_to_ppcf128:
11409 case Intrinsic::ppc_convert_ppcf128_to_f128: {
11410 RTLIB::Libcall LC = IntrinsicID == Intrinsic::ppc_convert_ppcf128_to_f128
11411 ? RTLIB::CONVERT_PPCF128_F128
11412 : RTLIB::CONVERT_F128_PPCF128;
11413 MakeLibCallOptions CallOptions;
11414 std::pair<SDValue, SDValue> Result =
11415 makeLibCall(DAG, LC, RetVT: Op.getValueType(), Ops: Op.getOperand(i: 1), CallOptions,
11416 dl, Chain: SDValue());
11417 return Result.first;
11418 }
11419 case Intrinsic::ppc_maxfe:
11420 case Intrinsic::ppc_maxfl:
11421 case Intrinsic::ppc_maxfs:
11422 case Intrinsic::ppc_minfe:
11423 case Intrinsic::ppc_minfl:
11424 case Intrinsic::ppc_minfs: {
11425 EVT VT = Op.getValueType();
11426 assert(
11427 all_of(Op->ops().drop_front(4),
11428 [VT](const SDUse &Use) { return Use.getValueType() == VT; }) &&
11429 "ppc_[max|min]f[e|l|s] must have uniform type arguments");
11430 (void)VT;
11431 ISD::CondCode CC = ISD::SETGT;
11432 if (IntrinsicID == Intrinsic::ppc_minfe ||
11433 IntrinsicID == Intrinsic::ppc_minfl ||
11434 IntrinsicID == Intrinsic::ppc_minfs)
11435 CC = ISD::SETLT;
11436 unsigned I = Op.getNumOperands() - 2, Cnt = I;
11437 SDValue Res = Op.getOperand(i: I);
11438 for (--I; Cnt != 0; --Cnt, I = (--I == 0 ? (Op.getNumOperands() - 1) : I)) {
11439 Res =
11440 DAG.getSelectCC(DL: dl, LHS: Res, RHS: Op.getOperand(i: I), True: Res, False: Op.getOperand(i: I), Cond: CC);
11441 }
11442 return Res;
11443 }
11444 }
11445
11446 // If this is a lowered altivec predicate compare, CompareOpc is set to the
11447 // opcode number of the comparison.
11448 int CompareOpc;
11449 bool isDot;
11450 if (!getVectorCompareInfo(Intrin: Op, CompareOpc, isDot, Subtarget))
11451 return SDValue(); // Don't custom lower most intrinsics.
11452
11453 // If this is a non-dot comparison, make the VCMP node and we are done.
11454 if (!isDot) {
11455 SDValue Tmp = DAG.getNode(Opcode: PPCISD::VCMP, DL: dl, VT: Op.getOperand(i: 2).getValueType(),
11456 N1: Op.getOperand(i: 1), N2: Op.getOperand(i: 2),
11457 N3: DAG.getConstant(Val: CompareOpc, DL: dl, VT: MVT::i32));
11458 return DAG.getNode(Opcode: ISD::BITCAST, DL: dl, VT: Op.getValueType(), Operand: Tmp);
11459 }
11460
11461 // Create the PPCISD altivec 'dot' comparison node.
11462 SDValue Ops[] = {
11463 Op.getOperand(i: 2), // LHS
11464 Op.getOperand(i: 3), // RHS
11465 DAG.getConstant(Val: CompareOpc, DL: dl, VT: MVT::i32)
11466 };
11467 EVT VTs[] = { Op.getOperand(i: 2).getValueType(), MVT::Glue };
11468 SDValue CompNode = DAG.getNode(Opcode: PPCISD::VCMP_rec, DL: dl, ResultTys: VTs, Ops);
11469
11470 // Unpack the result based on how the target uses it.
11471 unsigned BitNo; // Bit # of CR6.
11472 bool InvertBit; // Invert result?
11473 unsigned Bitx;
11474 unsigned SetOp;
11475 switch (Op.getConstantOperandVal(i: 1)) {
11476 default: // Can't happen, don't crash on invalid number though.
11477 case 0: // Return the value of the EQ bit of CR6.
11478 BitNo = 0;
11479 InvertBit = false;
11480 Bitx = PPC::sub_eq;
11481 SetOp = PPCISD::SETBC;
11482 break;
11483 case 1: // Return the inverted value of the EQ bit of CR6.
11484 BitNo = 0;
11485 InvertBit = true;
11486 Bitx = PPC::sub_eq;
11487 SetOp = PPCISD::SETBCR;
11488 break;
11489 case 2: // Return the value of the LT bit of CR6.
11490 BitNo = 2;
11491 InvertBit = false;
11492 Bitx = PPC::sub_lt;
11493 SetOp = PPCISD::SETBC;
11494 break;
11495 case 3: // Return the inverted value of the LT bit of CR6.
11496 BitNo = 2;
11497 InvertBit = true;
11498 Bitx = PPC::sub_lt;
11499 SetOp = PPCISD::SETBCR;
11500 break;
11501 }
11502
11503 SDValue GlueOp = CompNode.getValue(R: 1);
11504 if (Subtarget.isISA3_1()) {
11505 SDValue SubRegIdx = DAG.getTargetConstant(Val: Bitx, DL: dl, VT: MVT::i32);
11506 SDValue CR6Reg = DAG.getRegister(Reg: PPC::CR6, VT: MVT::i32);
11507 SDValue CRBit =
11508 SDValue(DAG.getMachineNode(Opcode: TargetOpcode::EXTRACT_SUBREG, dl, VT: MVT::i1,
11509 Op1: CR6Reg, Op2: SubRegIdx, Op3: GlueOp),
11510 0);
11511 return DAG.getNode(Opcode: SetOp, DL: dl, VT: MVT::i32, Operand: CRBit);
11512 }
11513
11514 // Now that we have the comparison, emit a copy from the CR to a GPR.
11515 // This is flagged to the above dot comparison.
11516 SDValue Flags = DAG.getNode(Opcode: PPCISD::MFOCRF, DL: dl, VT: MVT::i32,
11517 N1: DAG.getRegister(Reg: PPC::CR6, VT: MVT::i32), N2: GlueOp);
11518
11519 // Shift the bit into the low position.
11520 Flags = DAG.getNode(Opcode: ISD::SRL, DL: dl, VT: MVT::i32, N1: Flags,
11521 N2: DAG.getConstant(Val: 8 - (3 - BitNo), DL: dl, VT: MVT::i32));
11522 // Isolate the bit.
11523 Flags = DAG.getNode(Opcode: ISD::AND, DL: dl, VT: MVT::i32, N1: Flags,
11524 N2: DAG.getConstant(Val: 1, DL: dl, VT: MVT::i32));
11525
11526 // If we are supposed to, toggle the bit.
11527 if (InvertBit)
11528 Flags = DAG.getNode(Opcode: ISD::XOR, DL: dl, VT: MVT::i32, N1: Flags,
11529 N2: DAG.getConstant(Val: 1, DL: dl, VT: MVT::i32));
11530 return Flags;
11531}
11532
11533SDValue PPCTargetLowering::LowerINTRINSIC_VOID(SDValue Op,
11534 SelectionDAG &DAG) const {
11535 // SelectionDAGBuilder::visitTargetIntrinsic may insert one extra chain to
11536 // the beginning of the argument list.
11537 int ArgStart = isa<ConstantSDNode>(Val: Op.getOperand(i: 0)) ? 0 : 1;
11538 SDLoc DL(Op);
11539 switch (Op.getConstantOperandVal(i: ArgStart)) {
11540 case Intrinsic::ppc_cfence: {
11541 assert(ArgStart == 1 && "llvm.ppc.cfence must carry a chain argument.");
11542 SDValue Val = Op.getOperand(i: ArgStart + 1);
11543 EVT Ty = Val.getValueType();
11544 if (Ty == MVT::i128) {
11545 // FIXME: Testing one of two paired registers is sufficient to guarantee
11546 // ordering?
11547 Val = DAG.getNode(Opcode: ISD::TRUNCATE, DL, VT: MVT::i64, Operand: Val);
11548 }
11549 unsigned Opcode = Subtarget.isPPC64() ? PPC::CFENCE8 : PPC::CFENCE;
11550 return SDValue(
11551 DAG.getMachineNode(
11552 Opcode, dl: DL, VT: MVT::Other,
11553 Op1: DAG.getNode(Opcode: ISD::ANY_EXTEND, DL, VT: Subtarget.getScalarIntVT(), Operand: Val),
11554 Op2: Op.getOperand(i: 0)),
11555 0);
11556 }
11557 case Intrinsic::ppc_mma_disassemble_dmr: {
11558 return DAG.getStore(Chain: DAG.getEntryNode(), dl: DL, Val: Op.getOperand(i: ArgStart + 2),
11559 Ptr: Op.getOperand(i: ArgStart + 1), PtrInfo: MachinePointerInfo());
11560 }
11561 case Intrinsic::ppc_amo_stwat:
11562 case Intrinsic::ppc_amo_stdat: {
11563 SDLoc dl(Op);
11564 SDValue Chain = Op.getOperand(i: 0);
11565 SDValue Ptr = Op.getOperand(i: ArgStart + 1);
11566 SDValue Val = Op.getOperand(i: ArgStart + 2);
11567 SDValue FC = Op.getOperand(i: ArgStart + 3);
11568
11569 return DAG.getNode(Opcode: PPCISD::STAT, DL: dl, VT: MVT::Other, N1: Chain, N2: Val, N3: Ptr, N4: FC);
11570 }
11571 default:
11572 break;
11573 }
11574 return SDValue();
11575}
11576
11577// Lower scalar BSWAP64 to xxbrd.
11578SDValue PPCTargetLowering::LowerBSWAP(SDValue Op, SelectionDAG &DAG) const {
11579 SDLoc dl(Op);
11580 if (!Subtarget.isPPC64())
11581 return Op;
11582 // MTVSRDD
11583 Op = DAG.getNode(Opcode: ISD::BUILD_VECTOR, DL: dl, VT: MVT::v2i64, N1: Op.getOperand(i: 0),
11584 N2: Op.getOperand(i: 0));
11585 // XXBRD
11586 Op = DAG.getNode(Opcode: ISD::BSWAP, DL: dl, VT: MVT::v2i64, Operand: Op);
11587 // MFVSRD
11588 int VectorIndex = 0;
11589 if (Subtarget.isLittleEndian())
11590 VectorIndex = 1;
11591 Op = DAG.getNode(Opcode: ISD::EXTRACT_VECTOR_ELT, DL: dl, VT: MVT::i64, N1: Op,
11592 N2: DAG.getTargetConstant(Val: VectorIndex, DL: dl, VT: MVT::i32));
11593 return Op;
11594}
11595
11596// ATOMIC_CMP_SWAP for i8/i16 needs to zero-extend its input since it will be
11597// compared to a value that is atomically loaded (atomic loads zero-extend).
11598SDValue PPCTargetLowering::LowerATOMIC_CMP_SWAP(SDValue Op,
11599 SelectionDAG &DAG) const {
11600 assert(Op.getOpcode() == ISD::ATOMIC_CMP_SWAP &&
11601 "Expecting an atomic compare-and-swap here.");
11602 SDLoc dl(Op);
11603 auto *AtomicNode = cast<AtomicSDNode>(Val: Op.getNode());
11604 EVT MemVT = AtomicNode->getMemoryVT();
11605 if (MemVT.getSizeInBits() >= 32)
11606 return Op;
11607
11608 SDValue CmpOp = Op.getOperand(i: 2);
11609 // If this is already correctly zero-extended, leave it alone.
11610 auto HighBits = APInt::getHighBitsSet(numBits: 32, hiBitsSet: 32 - MemVT.getSizeInBits());
11611 if (DAG.MaskedValueIsZero(Op: CmpOp, Mask: HighBits))
11612 return Op;
11613
11614 // Clear the high bits of the compare operand.
11615 unsigned MaskVal = (1 << MemVT.getSizeInBits()) - 1;
11616 SDValue NewCmpOp =
11617 DAG.getNode(Opcode: ISD::AND, DL: dl, VT: MVT::i32, N1: CmpOp,
11618 N2: DAG.getConstant(Val: MaskVal, DL: dl, VT: MVT::i32));
11619
11620 // Replace the existing compare operand with the properly zero-extended one.
11621 SmallVector<SDValue, 4> Ops;
11622 for (int i = 0, e = AtomicNode->getNumOperands(); i < e; i++)
11623 Ops.push_back(Elt: AtomicNode->getOperand(Num: i));
11624 Ops[2] = NewCmpOp;
11625 MachineMemOperand *MMO = AtomicNode->getMemOperand();
11626 SDVTList Tys = DAG.getVTList(VT1: MVT::i32, VT2: MVT::Other);
11627 auto NodeTy =
11628 (MemVT == MVT::i8) ? PPCISD::ATOMIC_CMP_SWAP_8 : PPCISD::ATOMIC_CMP_SWAP_16;
11629 return DAG.getMemIntrinsicNode(Opcode: NodeTy, dl, VTList: Tys, Ops, MemVT, MMO);
11630}
11631
// Lower 128-bit quadword atomic loads/stores to the ppc_atomic_{load,store}_i128
// intrinsics that operate on a pair of i64 halves; the instruction selector
// pattern-matches those intrinsics to the real instruction sequences.
SDValue PPCTargetLowering::LowerATOMIC_LOAD_STORE(SDValue Op,
                                                  SelectionDAG &DAG) const {
  AtomicSDNode *N = cast<AtomicSDNode>(Val: Op.getNode());
  EVT MemVT = N->getMemoryVT();
  assert(MemVT.getSimpleVT() == MVT::i128 &&
         "Expect quadword atomic operations");
  SDLoc dl(N);
  unsigned Opc = N->getOpcode();
  switch (Opc) {
  case ISD::ATOMIC_LOAD: {
    // Lower quadword atomic load to int_ppc_atomic_load_i128 which will be
    // lowered to ppc instructions by pattern matching instruction selector.
    // The intrinsic produces two i64 results (low, high) plus the chain.
    SDVTList Tys = DAG.getVTList(VT1: MVT::i64, VT2: MVT::i64, VT3: MVT::Other);
    SmallVector<SDValue, 4> Ops{
        N->getOperand(Num: 0),
        DAG.getConstant(Val: Intrinsic::ppc_atomic_load_i128, DL: dl, VT: MVT::i32)};
    // Forward the remaining operands (pointer, etc.) unchanged.
    for (int I = 1, E = N->getNumOperands(); I < E; ++I)
      Ops.push_back(Elt: N->getOperand(Num: I));
    SDValue LoadedVal = DAG.getMemIntrinsicNode(Opcode: ISD::INTRINSIC_W_CHAIN, dl, VTList: Tys,
                                                Ops, MemVT, MMO: N->getMemOperand());
    // Reassemble the i128 value: zero-extend both halves, shift the high
    // half up by 64 bits and OR them together.
    SDValue ValLo = DAG.getNode(Opcode: ISD::ZERO_EXTEND, DL: dl, VT: MVT::i128, Operand: LoadedVal);
    SDValue ValHi =
        DAG.getNode(Opcode: ISD::ZERO_EXTEND, DL: dl, VT: MVT::i128, Operand: LoadedVal.getValue(R: 1));
    ValHi = DAG.getNode(Opcode: ISD::SHL, DL: dl, VT: MVT::i128, N1: ValHi,
                        N2: DAG.getConstant(Val: 64, DL: dl, VT: MVT::i32));
    SDValue Val =
        DAG.getNode(Opcode: ISD::OR, DL: dl, ResultTys: {MVT::i128, MVT::Other}, Ops: {ValLo, ValHi});
    // Return the merged value together with the intrinsic's output chain
    // (result number 2 of LoadedVal).
    return DAG.getNode(Opcode: ISD::MERGE_VALUES, DL: dl, ResultTys: {MVT::i128, MVT::Other},
                       Ops: {Val, LoadedVal.getValue(R: 2)});
  }
  case ISD::ATOMIC_STORE: {
    // Lower quadword atomic store to int_ppc_atomic_store_i128 which will be
    // lowered to ppc instructions by pattern matching instruction selector.
    SDVTList Tys = DAG.getVTList(VT: MVT::Other);
    SmallVector<SDValue, 4> Ops{
        N->getOperand(Num: 0),
        DAG.getConstant(Val: Intrinsic::ppc_atomic_store_i128, DL: dl, VT: MVT::i32)};
    // Split the stored i128 into its low and high i64 halves.
    SDValue Val = N->getOperand(Num: 1);
    SDValue ValLo = DAG.getNode(Opcode: ISD::TRUNCATE, DL: dl, VT: MVT::i64, Operand: Val);
    SDValue ValHi = DAG.getNode(Opcode: ISD::SRL, DL: dl, VT: MVT::i128, N1: Val,
                                N2: DAG.getConstant(Val: 64, DL: dl, VT: MVT::i32));
    ValHi = DAG.getNode(Opcode: ISD::TRUNCATE, DL: dl, VT: MVT::i64, Operand: ValHi);
    Ops.push_back(Elt: ValLo);
    Ops.push_back(Elt: ValHi);
    // Operand 2 of the original node is the pointer.
    Ops.push_back(Elt: N->getOperand(Num: 2));
    return DAG.getMemIntrinsicNode(Opcode: ISD::INTRINSIC_VOID, dl, VTList: Tys, Ops, MemVT,
                                   MMO: N->getMemOperand());
  }
  default:
    llvm_unreachable("Unexpected atomic opcode");
  }
}
11684
// Build an i1 test of Op against the floating-point class Mask using the
// Power9 test-data-class instructions (xststdcsp/xststdcdp/xststdcqp).
// Classes the instruction cannot express directly (fcNormal, and quiet- vs.
// signaling-NaN distinctions) are handled by recursive decomposition.
static SDValue getDataClassTest(SDValue Op, FPClassTest Mask, const SDLoc &Dl,
                                SelectionDAG &DAG,
                                const PPCSubtarget &Subtarget) {
  assert(Mask <= fcAllFlags && "Invalid fp_class flags!");

  // Bit layout of the DCMX immediate of the xststdc* instructions.
  enum DataClassMask {
    DC_NAN = 1 << 6,
    DC_NEG_INF = 1 << 4,
    DC_POS_INF = 1 << 5,
    DC_NEG_ZERO = 1 << 2,
    DC_POS_ZERO = 1 << 3,
    DC_NEG_SUBNORM = 1,
    DC_POS_SUBNORM = 1 << 1,
  };

  EVT VT = Op.getValueType();

  // Select the test opcode matching the operand's width.
  unsigned TestOp = VT == MVT::f128 ? PPC::XSTSTDCQP
                    : VT == MVT::f64 ? PPC::XSTSTDCDP
                    : PPC::XSTSTDCSP;

  // Trivial masks fold to constants.
  if (Mask == fcAllFlags)
    return DAG.getBoolConstant(V: true, DL: Dl, VT: MVT::i1, OpVT: VT);
  if (Mask == 0)
    return DAG.getBoolConstant(V: false, DL: Dl, VT: MVT::i1, OpVT: VT);

  // When it's cheaper or necessary to test reverse flags.
  if ((Mask & fcNormal) == fcNormal || Mask == ~fcQNan || Mask == ~fcSNan) {
    SDValue Rev = getDataClassTest(Op, Mask: ~Mask, Dl, DAG, Subtarget);
    return DAG.getNOT(DL: Dl, Val: Rev, VT: MVT::i1);
  }

  // Power doesn't support testing whether a value is 'normal'. Test the rest
  // first, and test if it's 'not not-normal' with expected sign.
  if (Mask & fcNormal) {
    // One instruction tests every class except 'normal'.
    SDValue Rev(DAG.getMachineNode(
                    Opcode: TestOp, dl: Dl, VT: MVT::i32,
                    Op1: DAG.getTargetConstant(Val: DC_NAN | DC_NEG_INF | DC_POS_INF |
                                               DC_NEG_ZERO | DC_POS_ZERO |
                                               DC_NEG_SUBNORM | DC_POS_SUBNORM,
                                           DL: Dl, VT: MVT::i32),
                    Op2: Op),
                0);
    // Sign are stored in CR bit 0, result are in CR bit 2.
    SDValue Sign(
        DAG.getMachineNode(Opcode: TargetOpcode::EXTRACT_SUBREG, dl: Dl, VT: MVT::i1, Op1: Rev,
                           Op2: DAG.getTargetConstant(Val: PPC::sub_lt, DL: Dl, VT: MVT::i32)),
        0);
    // 'Normal' means the value matched none of the classes tested above,
    // i.e. the EQ bit is clear.
    SDValue Normal(DAG.getNOT(
        DL: Dl,
        Val: SDValue(DAG.getMachineNode(
                    Opcode: TargetOpcode::EXTRACT_SUBREG, dl: Dl, VT: MVT::i1, Op1: Rev,
                    Op2: DAG.getTargetConstant(Val: PPC::sub_eq, DL: Dl, VT: MVT::i32)),
                0),
        VT: MVT::i1));
    // For a positive-normal test, require the sign bit clear.
    if (Mask & fcPosNormal)
      Sign = DAG.getNOT(DL: Dl, Val: Sign, VT: MVT::i1);
    SDValue Result = DAG.getNode(Opcode: ISD::AND, DL: Dl, VT: MVT::i1, N1: Sign, N2: Normal);
    if (Mask == fcPosNormal || Mask == fcNegNormal)
      return Result;

    // OR in the test for the remaining (non-normal) classes.
    return DAG.getNode(
        Opcode: ISD::OR, DL: Dl, VT: MVT::i1,
        N1: getDataClassTest(Op, Mask: Mask & ~fcNormal, Dl, DAG, Subtarget), N2: Result);
  }

  // The instruction doesn't differentiate between signaling or quiet NaN. Test
  // the rest first, and test if it 'is NaN and is signaling/quiet'.
  if ((Mask & fcNan) == fcQNan || (Mask & fcNan) == fcSNan) {
    bool IsQuiet = Mask & fcQNan;
    SDValue NanCheck = getDataClassTest(Op, Mask: fcNan, Dl, DAG, Subtarget);

    // Quietness is determined by the first bit in fraction field.
    uint64_t QuietMask = 0;
    SDValue HighWord;
    if (VT == MVT::f128) {
      // Extract the most-significant word of the f128 bit pattern.
      HighWord = DAG.getNode(
          Opcode: ISD::EXTRACT_VECTOR_ELT, DL: Dl, VT: MVT::i32, N1: DAG.getBitcast(VT: MVT::v4i32, V: Op),
          N2: DAG.getVectorIdxConstant(Val: Subtarget.isLittleEndian() ? 3 : 0, DL: Dl));
      QuietMask = 0x8000;
    } else if (VT == MVT::f64) {
      if (Subtarget.isPPC64()) {
        // On 64-bit targets, grab the upper half of the i64 bit pattern.
        HighWord = DAG.getNode(Opcode: ISD::EXTRACT_ELEMENT, DL: Dl, VT: MVT::i32,
                               N1: DAG.getBitcast(VT: MVT::i64, V: Op),
                               N2: DAG.getConstant(Val: 1, DL: Dl, VT: MVT::i32));
      } else {
        // On 32-bit targets, go through a vector to reach the high word.
        SDValue Vec = DAG.getBitcast(
            VT: MVT::v4i32, V: DAG.getNode(Opcode: ISD::SCALAR_TO_VECTOR, DL: Dl, VT: MVT::v2f64, Operand: Op));
        HighWord = DAG.getNode(
            Opcode: ISD::EXTRACT_VECTOR_ELT, DL: Dl, VT: MVT::i32, N1: Vec,
            N2: DAG.getVectorIdxConstant(Val: Subtarget.isLittleEndian() ? 1 : 0, DL: Dl));
      }
      QuietMask = 0x80000;
    } else if (VT == MVT::f32) {
      HighWord = DAG.getBitcast(VT: MVT::i32, V: Op);
      QuietMask = 0x400000;
    }
    // Quiet NaN: quiet bit set; signaling NaN: quiet bit clear.
    SDValue NanRes = DAG.getSetCC(
        DL: Dl, VT: MVT::i1,
        LHS: DAG.getNode(Opcode: ISD::AND, DL: Dl, VT: MVT::i32, N1: HighWord,
                    N2: DAG.getConstant(Val: QuietMask, DL: Dl, VT: MVT::i32)),
        RHS: DAG.getConstant(Val: 0, DL: Dl, VT: MVT::i32), Cond: IsQuiet ? ISD::SETNE : ISD::SETEQ);
    NanRes = DAG.getNode(Opcode: ISD::AND, DL: Dl, VT: MVT::i1, N1: NanCheck, N2: NanRes);
    if (Mask == fcQNan || Mask == fcSNan)
      return NanRes;

    // OR in the test for the remaining (non-NaN) classes.
    return DAG.getNode(Opcode: ISD::OR, DL: Dl, VT: MVT::i1,
                       N1: getDataClassTest(Op, Mask: Mask & ~fcNan, Dl, DAG, Subtarget),
                       N2: NanRes);
  }

  // Remaining masks map directly onto DCMX bits: translate and emit a single
  // test instruction, reading the result from the EQ bit of the CR field.
  unsigned NativeMask = 0;
  if ((Mask & fcNan) == fcNan)
    NativeMask |= DC_NAN;
  if (Mask & fcNegInf)
    NativeMask |= DC_NEG_INF;
  if (Mask & fcPosInf)
    NativeMask |= DC_POS_INF;
  if (Mask & fcNegZero)
    NativeMask |= DC_NEG_ZERO;
  if (Mask & fcPosZero)
    NativeMask |= DC_POS_ZERO;
  if (Mask & fcNegSubnormal)
    NativeMask |= DC_NEG_SUBNORM;
  if (Mask & fcPosSubnormal)
    NativeMask |= DC_POS_SUBNORM;
  return SDValue(
      DAG.getMachineNode(
          Opcode: TargetOpcode::EXTRACT_SUBREG, dl: Dl, VT: MVT::i1,
          Op1: SDValue(DAG.getMachineNode(
                      Opcode: TestOp, dl: Dl, VT: MVT::i32,
                      Op1: DAG.getTargetConstant(Val: NativeMask, DL: Dl, VT: MVT::i32), Op2: Op),
                  0),
          Op2: DAG.getTargetConstant(Val: PPC::sub_eq, DL: Dl, VT: MVT::i32)),
      0);
}
11821
11822SDValue PPCTargetLowering::LowerIS_FPCLASS(SDValue Op,
11823 SelectionDAG &DAG) const {
11824 assert(Subtarget.hasP9Vector() && "Test data class requires Power9");
11825 SDValue LHS = Op.getOperand(i: 0);
11826 uint64_t RHSC = Op.getConstantOperandVal(i: 1);
11827 SDLoc Dl(Op);
11828 FPClassTest Category = static_cast<FPClassTest>(RHSC);
11829 if (LHS.getValueType() == MVT::ppcf128) {
11830 // The higher part determines the value class.
11831 LHS = DAG.getNode(Opcode: ISD::EXTRACT_ELEMENT, DL: Dl, VT: MVT::f64, N1: LHS,
11832 N2: DAG.getConstant(Val: 1, DL: Dl, VT: MVT::i32));
11833 }
11834
11835 return getDataClassTest(Op: LHS, Mask: Category, Dl, DAG, Subtarget);
11836}
11837
11838// Adjust the length value for a load/store with length to account for the
11839// instructions requiring a left justified length, and for non-byte element
11840// types requiring scaling by element size.
11841static SDValue AdjustLength(SDValue Val, unsigned Bits, bool Left,
11842 SelectionDAG &DAG) {
11843 SDLoc dl(Val);
11844 EVT VT = Val->getValueType(ResNo: 0);
11845 unsigned LeftAdj = Left ? VT.getSizeInBits() - 8 : 0;
11846 unsigned TypeAdj = llvm::countr_zero<uint32_t>(Val: Bits / 8);
11847 SDValue SHLAmt = DAG.getConstant(Val: LeftAdj + TypeAdj, DL: dl, VT);
11848 return DAG.getNode(Opcode: ISD::SHL, DL: dl, VT, N1: Val, N2: SHLAmt);
11849}
11850
11851SDValue PPCTargetLowering::LowerVP_LOAD(SDValue Op, SelectionDAG &DAG) const {
11852 auto VPLD = cast<VPLoadSDNode>(Val&: Op);
11853 bool Future = Subtarget.isISAFuture();
11854 SDLoc dl(Op);
11855 assert(ISD::isConstantSplatVectorAllOnes(Op->getOperand(3).getNode(), true) &&
11856 "Mask predication not supported");
11857 EVT PtrVT = getPointerTy(DL: DAG.getDataLayout());
11858 SDValue Len = DAG.getNode(Opcode: ISD::ANY_EXTEND, DL: dl, VT: PtrVT, Operand: VPLD->getOperand(Num: 4));
11859 unsigned IID = Future ? Intrinsic::ppc_vsx_lxvrl : Intrinsic::ppc_vsx_lxvl;
11860 unsigned EltBits = Op->getValueType(ResNo: 0).getScalarType().getSizeInBits();
11861 Len = AdjustLength(Val: Len, Bits: EltBits, Left: !Future, DAG);
11862 SDValue Ops[] = {VPLD->getChain(), DAG.getConstant(Val: IID, DL: dl, VT: MVT::i32),
11863 VPLD->getOperand(Num: 1), Len};
11864 SDVTList Tys = DAG.getVTList(VT1: Op->getValueType(ResNo: 0), VT2: MVT::Other);
11865 SDValue VPL =
11866 DAG.getMemIntrinsicNode(Opcode: ISD::INTRINSIC_W_CHAIN, dl, VTList: Tys, Ops,
11867 MemVT: VPLD->getMemoryVT(), MMO: VPLD->getMemOperand());
11868 return VPL;
11869}
11870
11871SDValue PPCTargetLowering::LowerVP_STORE(SDValue Op, SelectionDAG &DAG) const {
11872 auto VPST = cast<VPStoreSDNode>(Val&: Op);
11873 assert(ISD::isConstantSplatVectorAllOnes(Op->getOperand(4).getNode(), true) &&
11874 "Mask predication not supported");
11875 EVT PtrVT = getPointerTy(DL: DAG.getDataLayout());
11876 SDLoc dl(Op);
11877 SDValue Len = DAG.getNode(Opcode: ISD::ANY_EXTEND, DL: dl, VT: PtrVT, Operand: VPST->getOperand(Num: 5));
11878 unsigned EltBits =
11879 Op->getOperand(Num: 1).getValueType().getScalarType().getSizeInBits();
11880 bool Future = Subtarget.isISAFuture();
11881 unsigned IID = Future ? Intrinsic::ppc_vsx_stxvrl : Intrinsic::ppc_vsx_stxvl;
11882 Len = AdjustLength(Val: Len, Bits: EltBits, Left: !Future, DAG);
11883 SDValue Ops[] = {
11884 VPST->getChain(), DAG.getConstant(Val: IID, DL: dl, VT: MVT::i32),
11885 DAG.getNode(Opcode: ISD::BITCAST, DL: dl, VT: MVT::v4i32, Operand: VPST->getOperand(Num: 1)),
11886 VPST->getOperand(Num: 2), Len};
11887 SDVTList Tys = DAG.getVTList(VT: MVT::Other);
11888 SDValue VPS =
11889 DAG.getMemIntrinsicNode(Opcode: ISD::INTRINSIC_VOID, dl, VTList: Tys, Ops,
11890 MemVT: VPST->getMemoryVT(), MMO: VPST->getMemOperand());
11891 return VPS;
11892}
11893
// Lower SCALAR_TO_VECTOR. In order of preference: emit a canonical constant
// splat for small immediates, reuse an existing i32 load as a load-and-splat,
// use a store-forwarding-friendly pair of stores on big-endian PPC64, or fall
// back to a store/reload through a 16-byte-aligned stack slot.
SDValue PPCTargetLowering::LowerSCALAR_TO_VECTOR(SDValue Op,
                                                 SelectionDAG &DAG) const {
  SDLoc dl(Op);

  MachineFunction &MF = DAG.getMachineFunction();
  SDValue Op0 = Op.getOperand(i: 0);
  EVT ValVT = Op0.getValueType();
  unsigned EltSize = Op.getValueType().getScalarSizeInBits();
  // Small constants (representable in a 5-bit signed immediate) become a
  // canonical splat.
  if (isa<ConstantSDNode>(Val: Op0) && EltSize <= 32) {
    int64_t IntVal = Op.getConstantOperandVal(i: 0);
    if (IntVal >= -16 && IntVal <= 15)
      return getCanonicalConstSplat(Val: IntVal, SplatSize: EltSize / 8, VT: Op.getValueType(), DAG,
                                    dl);
  }

  // If the scalar is itself a single-use i32 load whose address we can reuse,
  // emit a load-and-splat instead of going through memory again.
  ReuseLoadInfo RLI;
  if (Subtarget.hasLFIWAX() && Subtarget.hasVSX() &&
      Op.getValueType() == MVT::v4i32 && Op0.getOpcode() == ISD::LOAD &&
      Op0.getValueType() == MVT::i32 && Op0.hasOneUse() &&
      canReuseLoadAddress(Op: Op0, MemVT: MVT::i32, RLI, DAG, ET: ISD::NON_EXTLOAD)) {

    MachineMemOperand *MMO =
        MF.getMachineMemOperand(PtrInfo: RLI.MPI, F: MachineMemOperand::MOLoad, Size: 4,
                                BaseAlignment: RLI.Alignment, AAInfo: RLI.AAInfo, Ranges: RLI.Ranges);
    SDValue Ops[] = {RLI.Chain, RLI.Ptr, DAG.getValueType(Op.getValueType())};
    SDValue Bits = DAG.getMemIntrinsicNode(
        Opcode: PPCISD::LD_SPLAT, dl, VTList: DAG.getVTList(VT1: MVT::v4i32, VT2: MVT::Other), Ops,
        MemVT: MVT::i32, MMO);
    // Keep the original load's chain users ordered after the new load.
    if (RLI.ResChain)
      DAG.makeEquivalentMemoryOrdering(OldChain: RLI.ResChain, NewMemOpChain: Bits.getValue(R: 1));
    return Bits.getValue(R: 0);
  }

  // Create a stack slot that is 16-byte aligned.
  MachineFrameInfo &MFI = MF.getFrameInfo();
  int FrameIdx = MFI.CreateStackObject(Size: 16, Alignment: Align(16), isSpillSlot: false);
  EVT PtrVT = getPointerTy(DL: DAG.getDataLayout());
  SDValue FIdx = DAG.getFrameIndex(FI: FrameIdx, VT: PtrVT);

  SDValue Val = Op0;
  // P10 hardware store forwarding requires that a single store contains all
  // the data for the load. P10 is able to merge a pair of adjacent stores. Try
  // to avoid load hit store on P10 when running binaries compiled for older
  // processors by generating two mergeable scalar stores to forward with the
  // vector load.
  if (!DisableP10StoreForward && Subtarget.isPPC64() &&
      !Subtarget.isLittleEndian() && ValVT.isInteger() &&
      ValVT.getSizeInBits() <= 64) {
    // Left-justify the value in a 64-bit register so both stores carry the
    // element in the position the vector load expects.
    Val = DAG.getNode(Opcode: ISD::ANY_EXTEND, DL: dl, VT: MVT::i64, Operand: Val);
    EVT ShiftAmountTy = getShiftAmountTy(LHSTy: MVT::i64, DL: DAG.getDataLayout());
    SDValue ShiftBy = DAG.getConstant(
        Val: 64 - Op.getValueType().getScalarSizeInBits(), DL: dl, VT: ShiftAmountTy);
    Val = DAG.getNode(Opcode: ISD::SHL, DL: dl, VT: MVT::i64, N1: Val, N2: ShiftBy);
    // Store the value to both halves of the slot, then load the vector.
    SDValue Plus8 =
        DAG.getNode(Opcode: ISD::ADD, DL: dl, VT: PtrVT, N1: FIdx, N2: DAG.getConstant(Val: 8, DL: dl, VT: PtrVT));
    SDValue Store2 =
        DAG.getStore(Chain: DAG.getEntryNode(), dl, Val, Ptr: Plus8, PtrInfo: MachinePointerInfo());
    SDValue Store = DAG.getStore(Chain: Store2, dl, Val, Ptr: FIdx, PtrInfo: MachinePointerInfo());
    return DAG.getLoad(VT: Op.getValueType(), dl, Chain: Store, Ptr: FIdx,
                       PtrInfo: MachinePointerInfo());
  }

  // Store the input value into Value#0 of the stack slot.
  SDValue Store =
      DAG.getStore(Chain: DAG.getEntryNode(), dl, Val, Ptr: FIdx, PtrInfo: MachinePointerInfo());
  // Load it out.
  return DAG.getLoad(VT: Op.getValueType(), dl, Chain: Store, Ptr: FIdx, PtrInfo: MachinePointerInfo());
}
11962
// Lower INSERT_VECTOR_ELT, exploiting the insert instructions available on
// each subtarget generation (direct moves on P9+, vins* on P10) and falling
// back to SDValue() where the generic expansion must handle it.
SDValue PPCTargetLowering::LowerINSERT_VECTOR_ELT(SDValue Op,
                                                  SelectionDAG &DAG) const {
  assert(Op.getOpcode() == ISD::INSERT_VECTOR_ELT &&
         "Should only be called for ISD::INSERT_VECTOR_ELT");

  // Non-null only when the insertion index is a compile-time constant.
  ConstantSDNode *C = dyn_cast<ConstantSDNode>(Val: Op.getOperand(i: 2));

  EVT VT = Op.getValueType();
  SDLoc dl(Op);
  SDValue V1 = Op.getOperand(i: 0);
  SDValue V2 = Op.getOperand(i: 1);

  // v2f64 with a constant index is already legal.
  if (VT == MVT::v2f64 && C)
    return Op;

  if (Subtarget.hasP9Vector()) {
    // A f32 load feeding into a v4f32 insert_vector_elt is handled in this way
    // because on P10, it allows this specific insert_vector_elt load pattern to
    // utilize the refactored load and store infrastructure in order to exploit
    // prefixed loads.
    // On targets with inexpensive direct moves (Power9 and up), a
    // (insert_vector_elt v4f32:$vec, (f32 load)) is always better as an integer
    // load since a single precision load will involve conversion to double
    // precision on the load followed by another conversion to single precision.
    if ((VT == MVT::v4f32) && (V2.getValueType() == MVT::f32) &&
        (isa<LoadSDNode>(Val: V2))) {
      // Do the insert in the integer domain to avoid the f32<->f64 round trip.
      SDValue BitcastVector = DAG.getBitcast(VT: MVT::v4i32, V: V1);
      SDValue BitcastLoad = DAG.getBitcast(VT: MVT::i32, V: V2);
      SDValue InsVecElt =
          DAG.getNode(Opcode: ISD::INSERT_VECTOR_ELT, DL: dl, VT: MVT::v4i32, N1: BitcastVector,
                      N2: BitcastLoad, N3: Op.getOperand(i: 2));
      return DAG.getBitcast(VT: MVT::v4f32, V: InsVecElt);
    }
  }

  if (Subtarget.isISA3_1()) {
    // 64-bit element inserts need 64-bit GPRs; expand on 32-bit targets.
    if ((VT == MVT::v2i64 || VT == MVT::v2f64) && !Subtarget.isPPC64())
      return SDValue();
    // On P10, we have legal lowering for constant and variable indices for
    // all vectors.
    if (VT == MVT::v16i8 || VT == MVT::v8i16 || VT == MVT::v4i32 ||
        VT == MVT::v2i64 || VT == MVT::v4f32 || VT == MVT::v2f64)
      return Op;
  }

  // Before P10, we have legal lowering for constant indices but not for
  // variable ones.
  if (!C)
    return SDValue();

  // We can use MTVSRZ + VECINSERT for v8i16 and v16i8 types.
  if (VT == MVT::v8i16 || VT == MVT::v16i8) {
    SDValue Mtvsrz = DAG.getNode(Opcode: PPCISD::MTVSRZ, DL: dl, VT, Operand: V2);
    unsigned BytesInEachElement = VT.getVectorElementType().getSizeInBits() / 8;
    unsigned InsertAtElement = C->getZExtValue();
    unsigned InsertAtByte = InsertAtElement * BytesInEachElement;
    // VECINSERT's byte offset is big-endian-relative; mirror it on LE.
    if (Subtarget.isLittleEndian()) {
      InsertAtByte = (16 - BytesInEachElement) - InsertAtByte;
    }
    return DAG.getNode(Opcode: PPCISD::VECINSERT, DL: dl, VT, N1: V1, N2: Mtvsrz,
                       N3: DAG.getConstant(Val: InsertAtByte, DL: dl, VT: MVT::i32));
  }
  return Op;
}
12027
// Lower a load of a v1024i1 (dmr) or v2048i1 (dmr pair) value by splitting it
// into 256-bit lxvp loads and reassembling the pieces into the wide register
// with REG_SEQUENCE machine nodes.
SDValue PPCTargetLowering::LowerDMFVectorLoad(SDValue Op,
                                              SelectionDAG &DAG) const {
  SDLoc dl(Op);
  LoadSDNode *LN = cast<LoadSDNode>(Val: Op.getNode());
  SDValue LoadChain = LN->getChain();
  SDValue BasePtr = LN->getBasePtr();
  EVT VT = Op.getValueType();
  bool IsV1024i1 = VT == MVT::v1024i1;
  bool IsV2048i1 = VT == MVT::v2048i1;

  // The types v1024i1 and v2048i1 are used for Dense Math dmr registers and
  // Dense Math dmr pair registers, respectively.
  assert((IsV1024i1 || IsV2048i1) && "Unsupported type.");
  (void)IsV2048i1;
  assert((Subtarget.hasMMA() && Subtarget.isISAFuture()) &&
         "Dense Math support required.");
  assert(Subtarget.pairedVectorMemops() && "Vector pair support required.");

  SmallVector<SDValue, 8> Loads;
  SmallVector<SDValue, 8> LoadChains;

  // Emit one 32-byte lxvp per 256-bit chunk, advancing the pointer and the
  // memory-operand offset for each.
  SDValue IntrinID = DAG.getConstant(Val: Intrinsic::ppc_vsx_lxvp, DL: dl, VT: MVT::i32);
  SDValue LoadOps[] = {LoadChain, IntrinID, BasePtr};
  MachineMemOperand *MMO = LN->getMemOperand();
  unsigned NumVecs = VT.getSizeInBits() / 256;
  for (unsigned Idx = 0; Idx < NumVecs; ++Idx) {
    MachineMemOperand *NewMMO =
        DAG.getMachineFunction().getMachineMemOperand(MMO, Offset: Idx * 32, Size: 32);
    if (Idx > 0) {
      BasePtr = DAG.getNode(Opcode: ISD::ADD, DL: dl, VT: BasePtr.getValueType(), N1: BasePtr,
                            N2: DAG.getConstant(Val: 32, DL: dl, VT: BasePtr.getValueType()));
      LoadOps[2] = BasePtr;
    }
    SDValue Ld = DAG.getMemIntrinsicNode(Opcode: ISD::INTRINSIC_W_CHAIN, dl,
                                         VTList: DAG.getVTList(VT1: MVT::v256i1, VT2: MVT::Other),
                                         Ops: LoadOps, MemVT: MVT::v256i1, MMO: NewMMO);
    LoadChains.push_back(Elt: Ld.getValue(R: 1));
    Loads.push_back(Elt: Ld);
  }

  // On little-endian targets the chunk order within the dmr is reversed.
  if (Subtarget.isLittleEndian()) {
    std::reverse(first: Loads.begin(), last: Loads.end());
    std::reverse(first: LoadChains.begin(), last: LoadChains.end());
  }

  SDValue TF = DAG.getNode(Opcode: ISD::TokenFactor, DL: dl, VT: MVT::Other, Ops: LoadChains);
  // Combine the first four 256-bit chunks into one 1024-bit dmr value.
  SDValue Value = DMFInsert1024(Pairs: Loads, dl, DAG);

  if (IsV1024i1) {
    return DAG.getMergeValues(Ops: {Value, TF}, dl);
  }

  // Handle Loads for V2048i1 which represents a dmr pair.
  SmallVector<SDValue, 4> MoreLoads{Loads[4], Loads[5], Loads[6], Loads[7]};
  SDValue Dmr1Value = DMFInsert1024(Pairs: MoreLoads, dl, DAG);

  SDValue Dmr0Sub = DAG.getTargetConstant(Val: PPC::sub_dmr0, DL: dl, VT: MVT::i32);
  SDValue Dmr1Sub = DAG.getTargetConstant(Val: PPC::sub_dmr1, DL: dl, VT: MVT::i32);

  // Glue the two dmr values into a dmr-pair register with a REG_SEQUENCE.
  SDValue DmrPRC = DAG.getTargetConstant(Val: PPC::DMRpRCRegClassID, DL: dl, VT: MVT::i32);
  const SDValue DmrPOps[] = {DmrPRC, Value, Dmr0Sub, Dmr1Value, Dmr1Sub};

  SDValue DmrPValue = SDValue(
      DAG.getMachineNode(Opcode: PPC::REG_SEQUENCE, dl, VT: MVT::v2048i1, Ops: DmrPOps), 0);

  return DAG.getMergeValues(Ops: {DmrPValue, TF}, dl);
}
12095
12096SDValue PPCTargetLowering::DMFInsert1024(const SmallVectorImpl<SDValue> &Pairs,
12097 const SDLoc &dl,
12098 SelectionDAG &DAG) const {
12099 SDValue Lo =
12100 DAG.getNode(Opcode: PPCISD::INST512, DL: dl, VT: MVT::v512i1, N1: Pairs[0], N2: Pairs[1]);
12101 SDValue LoSub = DAG.getTargetConstant(Val: PPC::sub_wacc_lo, DL: dl, VT: MVT::i32);
12102 SDValue Hi =
12103 DAG.getNode(Opcode: PPCISD::INST512HI, DL: dl, VT: MVT::v512i1, N1: Pairs[2], N2: Pairs[3]);
12104 SDValue HiSub = DAG.getTargetConstant(Val: PPC::sub_wacc_hi, DL: dl, VT: MVT::i32);
12105 SDValue RC = DAG.getTargetConstant(Val: PPC::DMRRCRegClassID, DL: dl, VT: MVT::i32);
12106
12107 return SDValue(DAG.getMachineNode(Opcode: PPC::REG_SEQUENCE, dl, VT: MVT::v1024i1,
12108 Ops: {RC, Lo, LoSub, Hi, HiSub}),
12109 0);
12110}
12111
// Lower loads of the wide MMA types: dispatch v1024i1/v2048i1 to the Dense
// Math path, split v256i1 (pair) and v512i1 (accumulator) into v16i8 loads,
// and pass every other type through unchanged.
SDValue PPCTargetLowering::LowerVectorLoad(SDValue Op,
                                           SelectionDAG &DAG) const {
  SDLoc dl(Op);
  LoadSDNode *LN = cast<LoadSDNode>(Val: Op.getNode());
  SDValue LoadChain = LN->getChain();
  SDValue BasePtr = LN->getBasePtr();
  EVT VT = Op.getValueType();

  if (VT == MVT::v1024i1 || VT == MVT::v2048i1)
    return LowerDMFVectorLoad(Op, DAG);

  if (VT != MVT::v256i1 && VT != MVT::v512i1)
    return Op;

  // Type v256i1 is used for pairs and v512i1 is used for accumulators.
  // Here we create 2 or 4 v16i8 loads to load the pair or accumulator value in
  // 2 or 4 vsx registers.
  assert((VT != MVT::v512i1 || Subtarget.hasMMA()) &&
         "Type unsupported without MMA");
  assert((VT != MVT::v256i1 || Subtarget.pairedVectorMemops()) &&
         "Type unsupported without paired vector support");
  Align Alignment = LN->getAlign();
  SmallVector<SDValue, 4> Loads;
  SmallVector<SDValue, 4> LoadChains;
  unsigned NumVecs = VT.getSizeInBits() / 128;
  // One 16-byte load per underlying VSX register, at increasing offsets.
  for (unsigned Idx = 0; Idx < NumVecs; ++Idx) {
    SDValue Load =
        DAG.getLoad(VT: MVT::v16i8, dl, Chain: LoadChain, Ptr: BasePtr,
                    PtrInfo: LN->getPointerInfo().getWithOffset(O: Idx * 16),
                    Alignment: commonAlignment(A: Alignment, Offset: Idx * 16),
                    MMOFlags: LN->getMemOperand()->getFlags(), AAInfo: LN->getAAInfo());
    BasePtr = DAG.getNode(Opcode: ISD::ADD, DL: dl, VT: BasePtr.getValueType(), N1: BasePtr,
                          N2: DAG.getConstant(Val: 16, DL: dl, VT: BasePtr.getValueType()));
    Loads.push_back(Elt: Load);
    LoadChains.push_back(Elt: Load.getValue(R: 1));
  }
  // Register order within the pair/accumulator is reversed on little-endian.
  if (Subtarget.isLittleEndian()) {
    std::reverse(first: Loads.begin(), last: Loads.end());
    std::reverse(first: LoadChains.begin(), last: LoadChains.end());
  }
  SDValue TF = DAG.getNode(Opcode: ISD::TokenFactor, DL: dl, VT: MVT::Other, Ops: LoadChains);
  SDValue Value =
      DAG.getNode(Opcode: VT == MVT::v512i1 ? PPCISD::ACC_BUILD : PPCISD::PAIR_BUILD,
                  DL: dl, VT, Ops: Loads);
  SDValue RetOps[] = {Value, TF};
  return DAG.getMergeValues(Ops: RetOps, dl);
}
12159
// Lower a store of a v1024i1 (dmr) or v2048i1 (dmr pair) value: break the
// wide register into v256i1 pieces via EXTRACT_SUBREG + DMXXEXTFDMR512, then
// emit one 32-byte stxvp per piece.
SDValue PPCTargetLowering::LowerDMFVectorStore(SDValue Op,
                                               SelectionDAG &DAG) const {

  SDLoc dl(Op);
  StoreSDNode *SN = cast<StoreSDNode>(Val: Op.getNode());
  SDValue StoreChain = SN->getChain();
  SDValue BasePtr = SN->getBasePtr();
  SmallVector<SDValue, 8> Values;
  SmallVector<SDValue, 8> Stores;
  EVT VT = SN->getValue().getValueType();
  bool IsV1024i1 = VT == MVT::v1024i1;
  bool IsV2048i1 = VT == MVT::v2048i1;

  // The types v1024i1 and v2048i1 are used for Dense Math dmr registers and
  // Dense Math dmr pair registers, respectively.
  assert((IsV1024i1 || IsV2048i1) && "Unsupported type.");
  (void)IsV2048i1;
  assert((Subtarget.hasMMA() && Subtarget.isISAFuture()) &&
         "Dense Math support required.");
  assert(Subtarget.pairedVectorMemops() && "Vector pair support required.");

  // Each DMXXEXTFDMR512 yields two v256i1 values.
  EVT ReturnTypes[] = {MVT::v256i1, MVT::v256i1};
  if (IsV1024i1) {
    // Split the dmr into its low/high 512-bit wacc halves, then extract two
    // v256i1 pairs from each half.
    SDValue Lo(DAG.getMachineNode(
                   Opcode: TargetOpcode::EXTRACT_SUBREG, dl, VT: MVT::v512i1,
                   Op1: Op.getOperand(i: 1),
                   Op2: DAG.getTargetConstant(Val: PPC::sub_wacc_lo, DL: dl, VT: MVT::i32)),
               0);
    SDValue Hi(DAG.getMachineNode(
                   Opcode: TargetOpcode::EXTRACT_SUBREG, dl, VT: MVT::v512i1,
                   Op1: Op.getOperand(i: 1),
                   Op2: DAG.getTargetConstant(Val: PPC::sub_wacc_hi, DL: dl, VT: MVT::i32)),
               0);
    MachineSDNode *ExtNode =
        DAG.getMachineNode(Opcode: PPC::DMXXEXTFDMR512, dl, ResultTys: ReturnTypes, Ops: Lo);
    Values.push_back(Elt: SDValue(ExtNode, 0));
    Values.push_back(Elt: SDValue(ExtNode, 1));
    ExtNode = DAG.getMachineNode(Opcode: PPC::DMXXEXTFDMR512_HI, dl, ResultTys: ReturnTypes, Ops: Hi);
    Values.push_back(Elt: SDValue(ExtNode, 0));
    Values.push_back(Elt: SDValue(ExtNode, 1));
  } else {
    // This corresponds to v2048i1 which represents a dmr pair.
    // First split the pair into its two dmr registers...
    SDValue Dmr0(
        DAG.getMachineNode(Opcode: TargetOpcode::EXTRACT_SUBREG, dl, VT: MVT::v1024i1,
                           Op1: Op.getOperand(i: 1),
                           Op2: DAG.getTargetConstant(Val: PPC::sub_dmr0, DL: dl, VT: MVT::i32)),
        0);

    SDValue Dmr1(
        DAG.getMachineNode(Opcode: TargetOpcode::EXTRACT_SUBREG, dl, VT: MVT::v1024i1,
                           Op1: Op.getOperand(i: 1),
                           Op2: DAG.getTargetConstant(Val: PPC::sub_dmr1, DL: dl, VT: MVT::i32)),
        0);

    // ...then each dmr into its low/high 512-bit wacc halves.
    SDValue Dmr0Lo(DAG.getMachineNode(
                       Opcode: TargetOpcode::EXTRACT_SUBREG, dl, VT: MVT::v512i1, Op1: Dmr0,
                       Op2: DAG.getTargetConstant(Val: PPC::sub_wacc_lo, DL: dl, VT: MVT::i32)),
                   0);

    SDValue Dmr0Hi(DAG.getMachineNode(
                       Opcode: TargetOpcode::EXTRACT_SUBREG, dl, VT: MVT::v512i1, Op1: Dmr0,
                       Op2: DAG.getTargetConstant(Val: PPC::sub_wacc_hi, DL: dl, VT: MVT::i32)),
                   0);

    SDValue Dmr1Lo(DAG.getMachineNode(
                       Opcode: TargetOpcode::EXTRACT_SUBREG, dl, VT: MVT::v512i1, Op1: Dmr1,
                       Op2: DAG.getTargetConstant(Val: PPC::sub_wacc_lo, DL: dl, VT: MVT::i32)),
                   0);

    SDValue Dmr1Hi(DAG.getMachineNode(
                       Opcode: TargetOpcode::EXTRACT_SUBREG, dl, VT: MVT::v512i1, Op1: Dmr1,
                       Op2: DAG.getTargetConstant(Val: PPC::sub_wacc_hi, DL: dl, VT: MVT::i32)),
                   0);

    // Extract two v256i1 values from each 512-bit half, in dmr0..dmr1 order.
    MachineSDNode *ExtNode =
        DAG.getMachineNode(Opcode: PPC::DMXXEXTFDMR512, dl, ResultTys: ReturnTypes, Ops: Dmr0Lo);
    Values.push_back(Elt: SDValue(ExtNode, 0));
    Values.push_back(Elt: SDValue(ExtNode, 1));
    ExtNode =
        DAG.getMachineNode(Opcode: PPC::DMXXEXTFDMR512_HI, dl, ResultTys: ReturnTypes, Ops: Dmr0Hi);
    Values.push_back(Elt: SDValue(ExtNode, 0));
    Values.push_back(Elt: SDValue(ExtNode, 1));
    ExtNode = DAG.getMachineNode(Opcode: PPC::DMXXEXTFDMR512, dl, ResultTys: ReturnTypes, Ops: Dmr1Lo);
    Values.push_back(Elt: SDValue(ExtNode, 0));
    Values.push_back(Elt: SDValue(ExtNode, 1));
    ExtNode =
        DAG.getMachineNode(Opcode: PPC::DMXXEXTFDMR512_HI, dl, ResultTys: ReturnTypes, Ops: Dmr1Hi);
    Values.push_back(Elt: SDValue(ExtNode, 0));
    Values.push_back(Elt: SDValue(ExtNode, 1));
  }

  // Chunk order in memory is reversed on little-endian targets.
  if (Subtarget.isLittleEndian())
    std::reverse(first: Values.begin(), last: Values.end());

  // Emit one stxvp per 256-bit chunk at increasing 32-byte offsets.
  SDVTList Tys = DAG.getVTList(VT: MVT::Other);
  SmallVector<SDValue, 4> Ops{
      StoreChain, DAG.getConstant(Val: Intrinsic::ppc_vsx_stxvp, DL: dl, VT: MVT::i32),
      Values[0], BasePtr};
  MachineMemOperand *MMO = SN->getMemOperand();
  unsigned NumVecs = VT.getSizeInBits() / 256;
  for (unsigned Idx = 0; Idx < NumVecs; ++Idx) {
    MachineMemOperand *NewMMO =
        DAG.getMachineFunction().getMachineMemOperand(MMO, Offset: Idx * 32, Size: 32);
    if (Idx > 0) {
      BasePtr = DAG.getNode(Opcode: ISD::ADD, DL: dl, VT: BasePtr.getValueType(), N1: BasePtr,
                            N2: DAG.getConstant(Val: 32, DL: dl, VT: BasePtr.getValueType()));
      Ops[3] = BasePtr;
    }
    Ops[2] = Values[Idx];
    SDValue St = DAG.getMemIntrinsicNode(Opcode: ISD::INTRINSIC_VOID, dl, VTList: Tys, Ops,
                                         MemVT: MVT::v256i1, MMO: NewMMO);
    Stores.push_back(Elt: St);
  }

  SDValue TF = DAG.getTokenFactor(DL: dl, Vals&: Stores);
  return TF;
}
12277
// Lower stores of the wide MMA-related vector types. v256i1 (a VSX register
// pair) and v512i1 (an MMA accumulator) are split into their underlying
// v16i8 registers, each stored with a plain 16-byte store. The even wider
// dense-math (DMF) types are delegated to LowerDMFVectorStore.
SDValue PPCTargetLowering::LowerVectorStore(SDValue Op,
                                            SelectionDAG &DAG) const {
  SDLoc dl(Op);
  StoreSDNode *SN = cast<StoreSDNode>(Val: Op.getNode());
  SDValue StoreChain = SN->getChain();
  SDValue BasePtr = SN->getBasePtr();
  SDValue Value = SN->getValue();
  // Value2 carries the second half of a 512-bit accumulator on ISA-Future
  // targets; it only becomes meaningful after the DMXXEXTFDMR512 split below.
  SDValue Value2 = SN->getValue();
  EVT StoreVT = Value.getValueType();

  // DMR / DMR-pair stores have their own lowering.
  if (StoreVT == MVT::v1024i1 || StoreVT == MVT::v2048i1)
    return LowerDMFVectorStore(Op, DAG);

  // Anything else is already legal as a store.
  if (StoreVT != MVT::v256i1 && StoreVT != MVT::v512i1)
    return Op;

  // Type v256i1 is used for pairs and v512i1 is used for accumulators.
  // Here we create 2 or 4 v16i8 stores to store the pair or accumulator
  // underlying registers individually.
  assert((StoreVT != MVT::v512i1 || Subtarget.hasMMA()) &&
         "Type unsupported without MMA");
  assert((StoreVT != MVT::v256i1 || Subtarget.pairedVectorMemops()) &&
         "Type unsupported without paired vector support");
  Align Alignment = SN->getAlign();
  SmallVector<SDValue, 4> Stores;
  unsigned NumVecs = 2;
  if (StoreVT == MVT::v512i1) {
    if (Subtarget.isISAFuture()) {
      // On ISA-Future the accumulator is first split into two v256i1
      // register pairs with DMXXEXTFDMR512 instead of XXMFACC.
      EVT ReturnTypes[] = {MVT::v256i1, MVT::v256i1};
      MachineSDNode *ExtNode = DAG.getMachineNode(
          Opcode: PPC::DMXXEXTFDMR512, dl, ResultTys: ReturnTypes, Ops: Op.getOperand(i: 1));

      Value = SDValue(ExtNode, 0);
      Value2 = SDValue(ExtNode, 1);
    } else
      // Copy the accumulator back to its four underlying VSX registers.
      Value = DAG.getNode(Opcode: PPCISD::XXMFACC, DL: dl, VT: MVT::v512i1, Operand: Value);
    NumVecs = 4;
  }
  for (unsigned Idx = 0; Idx < NumVecs; ++Idx) {
    // Registers are numbered in big-endian order; mirror the index on LE.
    unsigned VecNum = Subtarget.isLittleEndian() ? NumVecs - 1 - Idx : Idx;
    SDValue Elt;
    if (Subtarget.isISAFuture()) {
      // Each DMXXEXTFDMR512 result is itself a pair: index within the pair
      // and pick Value/Value2 for the low/high half of the accumulator.
      VecNum = Subtarget.isLittleEndian() ? 1 - (Idx % 2) : (Idx % 2);
      Elt = DAG.getNode(Opcode: PPCISD::EXTRACT_VSX_REG, DL: dl, VT: MVT::v16i8,
                        N1: Idx > 1 ? Value2 : Value,
                        N2: DAG.getConstant(Val: VecNum, DL: dl, VT: getPointerTy(DL: DAG.getDataLayout())));
    } else
      Elt = DAG.getNode(Opcode: PPCISD::EXTRACT_VSX_REG, DL: dl, VT: MVT::v16i8, N1: Value,
                        N2: DAG.getConstant(Val: VecNum, DL: dl, VT: getPointerTy(DL: DAG.getDataLayout())));

    // Emit the 16-byte store for this register at offset Idx * 16.
    SDValue Store =
        DAG.getStore(Chain: StoreChain, dl, Val: Elt, Ptr: BasePtr,
                     PtrInfo: SN->getPointerInfo().getWithOffset(O: Idx * 16),
                     Alignment: commonAlignment(A: Alignment, Offset: Idx * 16),
                     MMOFlags: SN->getMemOperand()->getFlags(), AAInfo: SN->getAAInfo());
    BasePtr = DAG.getNode(Opcode: ISD::ADD, DL: dl, VT: BasePtr.getValueType(), N1: BasePtr,
                          N2: DAG.getConstant(Val: 16, DL: dl, VT: BasePtr.getValueType()));
    Stores.push_back(Elt: Store);
  }
  // Join the component stores into a single chain result.
  SDValue TF = DAG.getTokenFactor(DL: dl, Vals&: Stores);
  return TF;
}
12340
// Lower integer vector multiplies that Altivec has no single instruction
// for: v4i32 is assembled from 16x16-bit partial products, v16i8 from the
// even/odd byte multiplies merged back together.
SDValue PPCTargetLowering::LowerMUL(SDValue Op, SelectionDAG &DAG) const {
  SDLoc dl(Op);
  if (Op.getValueType() == MVT::v4i32) {
    SDValue LHS = Op.getOperand(i: 0), RHS = Op.getOperand(i: 1);

    SDValue Zero = getCanonicalConstSplat(Val: 0, SplatSize: 1, VT: MVT::v4i32, DAG, dl);
    // +16 as shift amt. (vrlw/vslw only consume the low bits of the amount,
    // so the -16 splat acts as a rotate/shift by 16.)
    SDValue Neg16 = getCanonicalConstSplat(Val: -16, SplatSize: 4, VT: MVT::v4i32, DAG, dl);
    SDValue RHSSwap = // = vrlw RHS, 16
        BuildIntrinsicOp(IID: Intrinsic::ppc_altivec_vrlw, LHS: RHS, RHS: Neg16, DAG, dl);

    // Shrinkify inputs to v8i16.
    LHS = DAG.getNode(Opcode: ISD::BITCAST, DL: dl, VT: MVT::v8i16, Operand: LHS);
    RHS = DAG.getNode(Opcode: ISD::BITCAST, DL: dl, VT: MVT::v8i16, Operand: RHS);
    RHSSwap = DAG.getNode(Opcode: ISD::BITCAST, DL: dl, VT: MVT::v8i16, Operand: RHSSwap);

    // Low parts multiplied together, generating 32-bit results (we ignore the
    // top parts).
    SDValue LoProd = BuildIntrinsicOp(IID: Intrinsic::ppc_altivec_vmulouh,
                                      LHS, RHS, DAG, dl, DestVT: MVT::v4i32);

    // Multiply-sum against the halfword-swapped RHS to form the cross
    // products, accumulated onto the zero splat.
    SDValue HiProd = BuildIntrinsicOp(IID: Intrinsic::ppc_altivec_vmsumuhm,
                                      Op0: LHS, Op1: RHSSwap, Op2: Zero, DAG, dl, DestVT: MVT::v4i32);
    // Shift the high parts up 16 bits.
    HiProd = BuildIntrinsicOp(IID: Intrinsic::ppc_altivec_vslw, LHS: HiProd,
                              RHS: Neg16, DAG, dl);
    return DAG.getNode(Opcode: ISD::ADD, DL: dl, VT: MVT::v4i32, N1: LoProd, N2: HiProd);
  } else if (Op.getValueType() == MVT::v16i8) {
    SDValue LHS = Op.getOperand(i: 0), RHS = Op.getOperand(i: 1);
    bool isLittleEndian = Subtarget.isLittleEndian();

    // Multiply the even 8-bit parts, producing 16-bit sums.
    SDValue EvenParts = BuildIntrinsicOp(IID: Intrinsic::ppc_altivec_vmuleub,
                                         LHS, RHS, DAG, dl, DestVT: MVT::v8i16);
    EvenParts = DAG.getNode(Opcode: ISD::BITCAST, DL: dl, VT: MVT::v16i8, Operand: EvenParts);

    // Multiply the odd 8-bit parts, producing 16-bit sums.
    SDValue OddParts = BuildIntrinsicOp(IID: Intrinsic::ppc_altivec_vmuloub,
                                        LHS, RHS, DAG, dl, DestVT: MVT::v8i16);
    OddParts = DAG.getNode(Opcode: ISD::BITCAST, DL: dl, VT: MVT::v16i8, Operand: OddParts);

    // Merge the results together. Because vmuleub and vmuloub are
    // instructions with a big-endian bias, we must reverse the
    // element numbering and reverse the meaning of "odd" and "even"
    // when generating little endian code.
    int Ops[16];
    for (unsigned i = 0; i != 8; ++i) {
      if (isLittleEndian) {
        Ops[i*2  ] = 2*i;     // low byte of each 16-bit product
        Ops[i*2+1] = 2*i+16;  // paired with the other product vector
      } else {
        Ops[i*2  ] = 2*i+1;
        Ops[i*2+1] = 2*i+1+16;
      }
    }
    // Shuffle the two product vectors together, swapping which one comes
    // first depending on endianness (see the comment above).
    if (isLittleEndian)
      return DAG.getVectorShuffle(VT: MVT::v16i8, dl, N1: OddParts, N2: EvenParts, Mask: Ops);
    else
      return DAG.getVectorShuffle(VT: MVT::v16i8, dl, N1: EvenParts, N2: OddParts, Mask: Ops);
  } else {
    llvm_unreachable("Unknown mul to lower!");
  }
}
12404
12405SDValue PPCTargetLowering::LowerFP_ROUND(SDValue Op, SelectionDAG &DAG) const {
12406 bool IsStrict = Op->isStrictFPOpcode();
12407 if (Op.getOperand(i: IsStrict ? 1 : 0).getValueType() == MVT::f128 &&
12408 !Subtarget.hasP9Vector())
12409 return SDValue();
12410
12411 return Op;
12412}
12413
12414// Custom lowering for fpext vf32 to v2f64
12415SDValue PPCTargetLowering::LowerFP_EXTEND(SDValue Op, SelectionDAG &DAG) const {
12416
12417 assert(Op.getOpcode() == ISD::FP_EXTEND &&
12418 "Should only be called for ISD::FP_EXTEND");
12419
12420 // FIXME: handle extends from half precision float vectors on P9.
12421 // We only want to custom lower an extend from v2f32 to v2f64.
12422 if (Op.getValueType() != MVT::v2f64 ||
12423 Op.getOperand(i: 0).getValueType() != MVT::v2f32)
12424 return SDValue();
12425
12426 SDLoc dl(Op);
12427 SDValue Op0 = Op.getOperand(i: 0);
12428
12429 switch (Op0.getOpcode()) {
12430 default:
12431 return SDValue();
12432 case ISD::EXTRACT_SUBVECTOR: {
12433 assert(Op0.getNumOperands() == 2 &&
12434 isa<ConstantSDNode>(Op0->getOperand(1)) &&
12435 "Node should have 2 operands with second one being a constant!");
12436
12437 if (Op0.getOperand(i: 0).getValueType() != MVT::v4f32)
12438 return SDValue();
12439
12440 // Custom lower is only done for high or low doubleword.
12441 int Idx = Op0.getConstantOperandVal(i: 1);
12442 if (Idx % 2 != 0)
12443 return SDValue();
12444
12445 // Since input is v4f32, at this point Idx is either 0 or 2.
12446 // Shift to get the doubleword position we want.
12447 int DWord = Idx >> 1;
12448
12449 // High and low word positions are different on little endian.
12450 if (Subtarget.isLittleEndian())
12451 DWord ^= 0x1;
12452
12453 return DAG.getNode(Opcode: PPCISD::FP_EXTEND_HALF, DL: dl, VT: MVT::v2f64,
12454 N1: Op0.getOperand(i: 0), N2: DAG.getConstant(Val: DWord, DL: dl, VT: MVT::i32));
12455 }
12456 case ISD::FADD:
12457 case ISD::FMUL:
12458 case ISD::FSUB: {
12459 SDValue NewLoad[2];
12460 for (unsigned i = 0, ie = Op0.getNumOperands(); i != ie; ++i) {
12461 // Ensure both input are loads.
12462 SDValue LdOp = Op0.getOperand(i);
12463 if (LdOp.getOpcode() != ISD::LOAD)
12464 return SDValue();
12465 // Generate new load node.
12466 LoadSDNode *LD = cast<LoadSDNode>(Val&: LdOp);
12467 SDValue LoadOps[] = {LD->getChain(), LD->getBasePtr()};
12468 NewLoad[i] = DAG.getMemIntrinsicNode(
12469 Opcode: PPCISD::LD_VSX_LH, dl, VTList: DAG.getVTList(VT1: MVT::v4f32, VT2: MVT::Other), Ops: LoadOps,
12470 MemVT: LD->getMemoryVT(), MMO: LD->getMemOperand());
12471 }
12472 SDValue NewOp =
12473 DAG.getNode(Opcode: Op0.getOpcode(), DL: SDLoc(Op0), VT: MVT::v4f32, N1: NewLoad[0],
12474 N2: NewLoad[1], Flags: Op0.getNode()->getFlags());
12475 return DAG.getNode(Opcode: PPCISD::FP_EXTEND_HALF, DL: dl, VT: MVT::v2f64, N1: NewOp,
12476 N2: DAG.getConstant(Val: 0, DL: dl, VT: MVT::i32));
12477 }
12478 case ISD::LOAD: {
12479 LoadSDNode *LD = cast<LoadSDNode>(Val&: Op0);
12480 SDValue LoadOps[] = {LD->getChain(), LD->getBasePtr()};
12481 SDValue NewLd = DAG.getMemIntrinsicNode(
12482 Opcode: PPCISD::LD_VSX_LH, dl, VTList: DAG.getVTList(VT1: MVT::v4f32, VT2: MVT::Other), Ops: LoadOps,
12483 MemVT: LD->getMemoryVT(), MMO: LD->getMemOperand());
12484 return DAG.getNode(Opcode: PPCISD::FP_EXTEND_HALF, DL: dl, VT: MVT::v2f64, N1: NewLd,
12485 N2: DAG.getConstant(Val: 0, DL: dl, VT: MVT::i32));
12486 }
12487 }
12488 llvm_unreachable("ERROR:Should return for all cases within swtich.");
12489}
12490
12491static SDValue ConvertCarryValueToCarryFlag(EVT SumType, SDValue Value,
12492 SelectionDAG &DAG,
12493 const PPCSubtarget &STI) {
12494 SDLoc DL(Value);
12495 if (STI.useCRBits())
12496 Value = DAG.getNode(Opcode: ISD::SELECT, DL, VT: SumType, N1: Value,
12497 N2: DAG.getConstant(Val: 1, DL, VT: SumType),
12498 N3: DAG.getConstant(Val: 0, DL, VT: SumType));
12499 else
12500 Value = DAG.getZExtOrTrunc(Op: Value, DL, VT: SumType);
12501 SDValue Sum = DAG.getNode(Opcode: PPCISD::ADDC, DL, VTList: DAG.getVTList(VT1: SumType, VT2: MVT::i32),
12502 N1: Value, N2: DAG.getAllOnesConstant(DL, VT: SumType));
12503 return Sum.getValue(R: 1);
12504}
12505
12506static SDValue ConvertCarryFlagToCarryValue(EVT SumType, SDValue Flag,
12507 EVT CarryType, SelectionDAG &DAG,
12508 const PPCSubtarget &STI) {
12509 SDLoc DL(Flag);
12510 SDValue Zero = DAG.getConstant(Val: 0, DL, VT: SumType);
12511 SDValue Carry = DAG.getNode(
12512 Opcode: PPCISD::ADDE, DL, VTList: DAG.getVTList(VT1: SumType, VT2: MVT::i32), N1: Zero, N2: Zero, N3: Flag);
12513 if (STI.useCRBits())
12514 return DAG.getSetCC(DL, VT: CarryType, LHS: Carry, RHS: Zero, Cond: ISD::SETNE);
12515 return DAG.getZExtOrTrunc(Op: Carry, DL, VT: CarryType);
12516}
12517
12518SDValue PPCTargetLowering::LowerADDSUBO(SDValue Op, SelectionDAG &DAG) const {
12519
12520 SDLoc DL(Op);
12521 SDNode *N = Op.getNode();
12522 EVT VT = N->getValueType(ResNo: 0);
12523 EVT CarryType = N->getValueType(ResNo: 1);
12524 unsigned Opc = N->getOpcode();
12525 bool IsAdd = Opc == ISD::UADDO;
12526 Opc = IsAdd ? PPCISD::ADDC : PPCISD::SUBC;
12527 SDValue Sum = DAG.getNode(Opcode: Opc, DL, VTList: DAG.getVTList(VT1: VT, VT2: MVT::i32),
12528 N1: N->getOperand(Num: 0), N2: N->getOperand(Num: 1));
12529 SDValue Carry = ConvertCarryFlagToCarryValue(SumType: VT, Flag: Sum.getValue(R: 1), CarryType,
12530 DAG, STI: Subtarget);
12531 if (!IsAdd)
12532 Carry = DAG.getNode(Opcode: ISD::XOR, DL, VT: CarryType, N1: Carry,
12533 N2: DAG.getConstant(Val: 1UL, DL, VT: CarryType));
12534 return DAG.getNode(Opcode: ISD::MERGE_VALUES, DL, VTList: N->getVTList(), N1: Sum, N2: Carry);
12535}
12536
12537SDValue PPCTargetLowering::LowerADDSUBO_CARRY(SDValue Op,
12538 SelectionDAG &DAG) const {
12539 SDLoc DL(Op);
12540 SDNode *N = Op.getNode();
12541 unsigned Opc = N->getOpcode();
12542 EVT VT = N->getValueType(ResNo: 0);
12543 EVT CarryType = N->getValueType(ResNo: 1);
12544 SDValue CarryOp = N->getOperand(Num: 2);
12545 bool IsAdd = Opc == ISD::UADDO_CARRY;
12546 Opc = IsAdd ? PPCISD::ADDE : PPCISD::SUBE;
12547 if (!IsAdd)
12548 CarryOp = DAG.getNode(Opcode: ISD::XOR, DL, VT: CarryOp.getValueType(), N1: CarryOp,
12549 N2: DAG.getConstant(Val: 1UL, DL, VT: CarryOp.getValueType()));
12550 CarryOp = ConvertCarryValueToCarryFlag(SumType: VT, Value: CarryOp, DAG, STI: Subtarget);
12551 SDValue Sum = DAG.getNode(Opcode: Opc, DL, VTList: DAG.getVTList(VT1: VT, VT2: MVT::i32),
12552 N1: Op.getOperand(i: 0), N2: Op.getOperand(i: 1), N3: CarryOp);
12553 CarryOp = ConvertCarryFlagToCarryValue(SumType: VT, Flag: Sum.getValue(R: 1), CarryType, DAG,
12554 STI: Subtarget);
12555 if (!IsAdd)
12556 CarryOp = DAG.getNode(Opcode: ISD::XOR, DL, VT: CarryOp.getValueType(), N1: CarryOp,
12557 N2: DAG.getConstant(Val: 1UL, DL, VT: CarryOp.getValueType()));
12558 return DAG.getNode(Opcode: ISD::MERGE_VALUES, DL, VTList: N->getVTList(), N1: Sum, N2: CarryOp);
12559}
12560
12561SDValue PPCTargetLowering::LowerSSUBO(SDValue Op, SelectionDAG &DAG) const {
12562
12563 SDLoc dl(Op);
12564 SDValue LHS = Op.getOperand(i: 0);
12565 SDValue RHS = Op.getOperand(i: 1);
12566 EVT VT = Op.getNode()->getValueType(ResNo: 0);
12567
12568 SDValue Sub = DAG.getNode(Opcode: ISD::SUB, DL: dl, VT, N1: LHS, N2: RHS);
12569
12570 SDValue Xor1 = DAG.getNode(Opcode: ISD::XOR, DL: dl, VT, N1: RHS, N2: LHS);
12571 SDValue Xor2 = DAG.getNode(Opcode: ISD::XOR, DL: dl, VT, N1: Sub, N2: LHS);
12572
12573 SDValue And = DAG.getNode(Opcode: ISD::AND, DL: dl, VT, N1: Xor1, N2: Xor2);
12574
12575 SDValue Overflow =
12576 DAG.getNode(Opcode: ISD::SRL, DL: dl, VT, N1: And,
12577 N2: DAG.getConstant(Val: VT.getSizeInBits() - 1, DL: dl, VT: MVT::i32));
12578
12579 SDValue OverflowTrunc =
12580 DAG.getNode(Opcode: ISD::TRUNCATE, DL: dl, VT: Op.getNode()->getValueType(ResNo: 1), Operand: Overflow);
12581
12582 return DAG.getMergeValues(Ops: {Sub, OverflowTrunc}, dl);
12583}
12584
12585/// Implements signed add with overflow detection using the rule:
12586/// (x eqv y) & (sum xor x), where the overflow bit is extracted from the sign
12587SDValue PPCTargetLowering::LowerSADDO(SDValue Op, SelectionDAG &DAG) const {
12588
12589 SDLoc dl(Op);
12590 SDValue LHS = Op.getOperand(i: 0);
12591 SDValue RHS = Op.getOperand(i: 1);
12592 EVT VT = Op.getNode()->getValueType(ResNo: 0);
12593
12594 SDValue Sum = DAG.getNode(Opcode: ISD::ADD, DL: dl, VT, N1: LHS, N2: RHS);
12595
12596 // Compute ~(x xor y)
12597 SDValue XorXY = DAG.getNode(Opcode: ISD::XOR, DL: dl, VT, N1: LHS, N2: RHS);
12598 SDValue EqvXY = DAG.getNOT(DL: dl, Val: XorXY, VT);
12599 // Compute (s xor x)
12600 SDValue SumXorX = DAG.getNode(Opcode: ISD::XOR, DL: dl, VT, N1: Sum, N2: LHS);
12601
12602 // overflow = (x eqv y) & (s xor x)
12603 SDValue OverflowInSign = DAG.getNode(Opcode: ISD::AND, DL: dl, VT, N1: EqvXY, N2: SumXorX);
12604
12605 // Shift sign bit down to LSB
12606 SDValue Overflow =
12607 DAG.getNode(Opcode: ISD::SRL, DL: dl, VT, N1: OverflowInSign,
12608 N2: DAG.getConstant(Val: VT.getSizeInBits() - 1, DL: dl, VT: MVT::i32));
12609 // Truncate to the overflow type (i1)
12610 SDValue OverflowTrunc =
12611 DAG.getNode(Opcode: ISD::TRUNCATE, DL: dl, VT: Op.getNode()->getValueType(ResNo: 1), Operand: Overflow);
12612
12613 return DAG.getMergeValues(Ops: {Sum, OverflowTrunc}, dl);
12614}
12615
// Lower unsigned 3-way compare producing -1/0/1.
// The result is built branchlessly from a chain of carry-setting subtracts;
// the intermediate carries record the A<B / A>B relations and the final
// SUBE folds them with A - B into exactly -1, 0 or 1.
SDValue PPCTargetLowering::LowerUCMP(SDValue Op, SelectionDAG &DAG) const {
  SDLoc DL(Op);
  // Freeze both operands so every use below observes one consistent value
  // even for undef/poison inputs.
  SDValue A = DAG.getFreeze(V: Op.getOperand(i: 0));
  SDValue B = DAG.getFreeze(V: Op.getOperand(i: 1));
  EVT OpVT = A.getValueType();
  EVT ResVT = Op.getValueType();

  // On PPC64, i32 carries are affected by the upper 32 bits of the registers.
  // We must zero-extend to i64 to ensure the carry reflects the 32-bit unsigned
  // comparison.
  if (Subtarget.isPPC64() && OpVT == MVT::i32) {
    A = DAG.getNode(Opcode: ISD::ZERO_EXTEND, DL, VT: MVT::i64, Operand: A);
    B = DAG.getNode(Opcode: ISD::ZERO_EXTEND, DL, VT: MVT::i64, Operand: B);
    OpVT = MVT::i64;
  }

  // First compute diff = A - B.
  SDValue Diff = DAG.getNode(Opcode: ISD::SUB, DL, VT: OpVT, N1: A, N2: B);

  // Generate B - A using SUBC to capture carry.
  SDVTList VTs = DAG.getVTList(VT1: OpVT, VT2: MVT::i32);
  SDValue SubC = DAG.getNode(Opcode: PPCISD::SUBC, DL, VTList: VTs, N1: B, N2: A);
  SDValue CA0 = SubC.getValue(R: 1);

  // t2 = A - B + CA0 using SUBE.
  SDValue SubE1 = DAG.getNode(Opcode: PPCISD::SUBE, DL, VTList: VTs, N1: A, N2: B, N3: CA0);
  SDValue CA1 = SubE1.getValue(R: 1);

  // res = diff - t2 + CA1 using SUBE (produces desired -1/0/1).
  SDValue ResPair = DAG.getNode(Opcode: PPCISD::SUBE, DL, VTList: VTs, N1: Diff, N2: SubE1, N3: CA1);

  // Extract the first result and truncate to result type if needed.
  // (Sign-extension preserves the -1 case when widening.)
  return DAG.getSExtOrTrunc(Op: ResPair.getValue(R: 0), DL, VT: ResVT);
}
12651
/// LowerOperation - Provide custom lowering hooks for some operations.
/// Invoked during SelectionDAG legalization for nodes marked Custom.
/// Handlers return the replacement value, the original Op when it turns out
/// to be fine as-is, or SDValue() where the handler declines.
SDValue PPCTargetLowering::LowerOperation(SDValue Op, SelectionDAG &DAG) const {
  switch (Op.getOpcode()) {
  default:
    llvm_unreachable("Wasn't expecting to be able to lower this!");
  case ISD::FPOW: return lowerPow(Op, DAG);
  case ISD::FSIN: return lowerSin(Op, DAG);
  case ISD::FCOS: return lowerCos(Op, DAG);
  case ISD::FLOG: return lowerLog(Op, DAG);
  case ISD::FLOG10: return lowerLog10(Op, DAG);
  case ISD::FEXP: return lowerExp(Op, DAG);
  case ISD::ConstantPool: return LowerConstantPool(Op, DAG);
  case ISD::BlockAddress: return LowerBlockAddress(Op, DAG);
  case ISD::GlobalAddress: return LowerGlobalAddress(Op, DAG);
  case ISD::GlobalTLSAddress: return LowerGlobalTLSAddress(Op, DAG);
  case ISD::JumpTable: return LowerJumpTable(Op, DAG);
  case ISD::STRICT_FSETCC:
  case ISD::STRICT_FSETCCS:
  case ISD::SETCC: return LowerSETCC(Op, DAG);
  case ISD::INIT_TRAMPOLINE: return LowerINIT_TRAMPOLINE(Op, DAG);
  case ISD::ADJUST_TRAMPOLINE: return LowerADJUST_TRAMPOLINE(Op, DAG);
  case ISD::SSUBO:
    return LowerSSUBO(Op, DAG);
  case ISD::SADDO:
    return LowerSADDO(Op, DAG);

  case ISD::INLINEASM:
  case ISD::INLINEASM_BR: return LowerINLINEASM(Op, DAG);
  // Variable argument lowering.
  case ISD::VASTART: return LowerVASTART(Op, DAG);
  case ISD::VAARG: return LowerVAARG(Op, DAG);
  case ISD::VACOPY: return LowerVACOPY(Op, DAG);

  case ISD::STACKRESTORE: return LowerSTACKRESTORE(Op, DAG);
  case ISD::DYNAMIC_STACKALLOC: return LowerDYNAMIC_STACKALLOC(Op, DAG);
  case ISD::GET_DYNAMIC_AREA_OFFSET:
    return LowerGET_DYNAMIC_AREA_OFFSET(Op, DAG);

  // Exception handling lowering.
  case ISD::EH_DWARF_CFA: return LowerEH_DWARF_CFA(Op, DAG);
  case ISD::EH_SJLJ_SETJMP: return lowerEH_SJLJ_SETJMP(Op, DAG);
  case ISD::EH_SJLJ_LONGJMP: return lowerEH_SJLJ_LONGJMP(Op, DAG);

  case ISD::LOAD: return LowerLOAD(Op, DAG);
  case ISD::STORE: return LowerSTORE(Op, DAG);
  case ISD::TRUNCATE: return LowerTRUNCATE(Op, DAG);
  case ISD::SELECT_CC: return LowerSELECT_CC(Op, DAG);
  case ISD::STRICT_FP_TO_UINT:
  case ISD::STRICT_FP_TO_SINT:
  case ISD::FP_TO_UINT:
  case ISD::FP_TO_SINT: return LowerFP_TO_INT(Op, DAG, dl: SDLoc(Op));
  case ISD::STRICT_UINT_TO_FP:
  case ISD::STRICT_SINT_TO_FP:
  case ISD::UINT_TO_FP:
  case ISD::SINT_TO_FP: return LowerINT_TO_FP(Op, DAG);
  case ISD::GET_ROUNDING: return LowerGET_ROUNDING(Op, DAG);
  case ISD::SET_ROUNDING:
    return LowerSET_ROUNDING(Op, DAG);

  // Lower 64-bit shifts.
  case ISD::SHL_PARTS: return LowerSHL_PARTS(Op, DAG);
  case ISD::SRL_PARTS: return LowerSRL_PARTS(Op, DAG);
  case ISD::SRA_PARTS: return LowerSRA_PARTS(Op, DAG);

  case ISD::FSHL: return LowerFunnelShift(Op, DAG);
  case ISD::FSHR: return LowerFunnelShift(Op, DAG);

  // Vector-related lowering.
  case ISD::BUILD_VECTOR: return LowerBUILD_VECTOR(Op, DAG);
  case ISD::VECTOR_SHUFFLE: return LowerVECTOR_SHUFFLE(Op, DAG);
  case ISD::INTRINSIC_WO_CHAIN: return LowerINTRINSIC_WO_CHAIN(Op, DAG);
  case ISD::SCALAR_TO_VECTOR: return LowerSCALAR_TO_VECTOR(Op, DAG);
  case ISD::INSERT_VECTOR_ELT: return LowerINSERT_VECTOR_ELT(Op, DAG);
  case ISD::MUL: return LowerMUL(Op, DAG);
  case ISD::FP_EXTEND: return LowerFP_EXTEND(Op, DAG);
  case ISD::STRICT_FP_ROUND:
  case ISD::FP_ROUND:
    return LowerFP_ROUND(Op, DAG);
  case ISD::ROTL: return LowerROTL(Op, DAG);

  // For counter-based loop handling.
  case ISD::INTRINSIC_W_CHAIN:
    return SDValue();

  case ISD::BITCAST: return LowerBITCAST(Op, DAG);

  // Frame & Return address.
  case ISD::RETURNADDR: return LowerRETURNADDR(Op, DAG);
  case ISD::FRAMEADDR: return LowerFRAMEADDR(Op, DAG);

  case ISD::INTRINSIC_VOID:
    return LowerINTRINSIC_VOID(Op, DAG);
  case ISD::BSWAP:
    return LowerBSWAP(Op, DAG);
  case ISD::ATOMIC_CMP_SWAP:
    return LowerATOMIC_CMP_SWAP(Op, DAG);
  case ISD::ATOMIC_STORE:
    return LowerATOMIC_LOAD_STORE(Op, DAG);
  case ISD::IS_FPCLASS:
    return LowerIS_FPCLASS(Op, DAG);
  case ISD::UADDO:
  case ISD::USUBO:
    return LowerADDSUBO(Op, DAG);
  case ISD::UADDO_CARRY:
  case ISD::USUBO_CARRY:
    return LowerADDSUBO_CARRY(Op, DAG);
  case ISD::UCMP:
    return LowerUCMP(Op, DAG);
  case ISD::STRICT_LRINT:
  case ISD::STRICT_LLRINT:
  case ISD::STRICT_LROUND:
  case ISD::STRICT_LLROUND:
  case ISD::STRICT_FNEARBYINT:
    // These are legal only when the node is known not to raise FP
    // exceptions; otherwise fall back to the default expansion.
    if (Op->getFlags().hasNoFPExcept())
      return Op;
    return SDValue();
  case ISD::VP_LOAD:
    return LowerVP_LOAD(Op, DAG);
  case ISD::VP_STORE:
    return LowerVP_STORE(Op, DAG);
  }
}
12775
// ReplaceNodeResults - Custom type legalization: replace the results of
// node N with new values. Each handler pushes the replacement values
// (including the chain result, when the node has one) onto Results;
// handlers that cannot lower the node return without pushing anything.
void PPCTargetLowering::ReplaceNodeResults(SDNode *N,
                                           SmallVectorImpl<SDValue>&Results,
                                           SelectionDAG &DAG) const {
  SDLoc dl(N);
  switch (N->getOpcode()) {
  default:
    llvm_unreachable("Do not know how to custom type legalize this operation!");
  case ISD::ATOMIC_LOAD: {
    SDValue Res = LowerATOMIC_LOAD_STORE(Op: SDValue(N, 0), DAG);
    Results.push_back(Elt: Res);
    Results.push_back(Elt: Res.getValue(R: 1));
    break;
  }
  case ISD::READCYCLECOUNTER: {
    // READ_TIME_BASE yields two i32 halves plus a chain; reassemble the
    // i64 cycle-counter result from the pair.
    SDVTList VTs = DAG.getVTList(VT1: MVT::i32, VT2: MVT::i32, VT3: MVT::Other);
    SDValue RTB = DAG.getNode(Opcode: PPCISD::READ_TIME_BASE, DL: dl, VTList: VTs, N: N->getOperand(Num: 0));

    Results.push_back(
        Elt: DAG.getNode(Opcode: ISD::BUILD_PAIR, DL: dl, VT: MVT::i64, N1: RTB, N2: RTB.getValue(R: 1)));
    Results.push_back(Elt: RTB.getValue(R: 2));
    break;
  }
  case ISD::INTRINSIC_W_CHAIN: {
    // Only the CTR loop-decrement intrinsic is handled here.
    if (N->getConstantOperandVal(Num: 1) != Intrinsic::loop_decrement)
      break;

    assert(N->getValueType(0) == MVT::i1 &&
           "Unexpected result type for CTR decrement intrinsic");
    // Re-emit the intrinsic with the legal setcc result type, then
    // truncate back to the original i1.
    EVT SVT = getSetCCResultType(DL: DAG.getDataLayout(), C&: *DAG.getContext(),
                                 VT: N->getValueType(ResNo: 0));
    SDVTList VTs = DAG.getVTList(VT1: SVT, VT2: MVT::Other);
    SDValue NewInt = DAG.getNode(Opcode: N->getOpcode(), DL: dl, VTList: VTs, N1: N->getOperand(Num: 0),
                                 N2: N->getOperand(Num: 1));

    Results.push_back(Elt: DAG.getNode(Opcode: ISD::TRUNCATE, DL: dl, VT: MVT::i1, Operand: NewInt));
    Results.push_back(Elt: NewInt.getValue(R: 1));
    break;
  }
  case ISD::INTRINSIC_WO_CHAIN: {
    switch (N->getConstantOperandVal(Num: 0)) {
    case Intrinsic::ppc_pack_longdouble:
      // Note the operand order swap: operand 2 is the high part of the pair.
      Results.push_back(Elt: DAG.getNode(Opcode: ISD::BUILD_PAIR, DL: dl, VT: MVT::ppcf128,
                                    N1: N->getOperand(Num: 2), N2: N->getOperand(Num: 1)));
      break;
    case Intrinsic::ppc_maxfe:
    case Intrinsic::ppc_minfe:
    case Intrinsic::ppc_fnmsub:
    case Intrinsic::ppc_convert_f128_to_ppcf128:
      Results.push_back(Elt: LowerINTRINSIC_WO_CHAIN(Op: SDValue(N, 0), DAG));
      break;
    }
    break;
  }
  case ISD::VAARG: {
    // Only 32-bit SVR4 needs custom i64 VAARG legalization.
    if (!Subtarget.isSVR4ABI() || Subtarget.isPPC64())
      return;

    EVT VT = N->getValueType(ResNo: 0);

    if (VT == MVT::i64) {
      SDValue NewNode = LowerVAARG(Op: SDValue(N, 1), DAG);

      Results.push_back(Elt: NewNode);
      Results.push_back(Elt: NewNode.getValue(R: 1));
    }
    return;
  }
  case ISD::STRICT_FP_TO_SINT:
  case ISD::STRICT_FP_TO_UINT:
  case ISD::FP_TO_SINT:
  case ISD::FP_TO_UINT: {
    // LowerFP_TO_INT() can only handle f32 and f64.
    if (N->getOperand(Num: N->isStrictFPOpcode() ? 1 : 0).getValueType() ==
        MVT::ppcf128)
      return;
    SDValue LoweredValue = LowerFP_TO_INT(Op: SDValue(N, 0), DAG, dl);
    Results.push_back(Elt: LoweredValue);
    if (N->isStrictFPOpcode())
      Results.push_back(Elt: LoweredValue.getValue(R: 1));
    return;
  }
  case ISD::TRUNCATE: {
    // Only vector truncates get custom handling here.
    if (!N->getValueType(ResNo: 0).isVector())
      return;
    SDValue Lowered = LowerTRUNCATEVector(Op: SDValue(N, 0), DAG);
    if (Lowered)
      Results.push_back(Elt: Lowered);
    return;
  }
  case ISD::SCALAR_TO_VECTOR: {
    SDValue Lowered = LowerSCALAR_TO_VECTOR(Op: SDValue(N, 0), DAG);
    if (Lowered)
      Results.push_back(Elt: Lowered);
    return;
  }
  case ISD::FSHL:
  case ISD::FSHR:
    // Don't handle funnel shifts here.
    return;
  case ISD::BITCAST:
    // Don't handle bitcast here.
    return;
  case ISD::FP_EXTEND:
    SDValue Lowered = LowerFP_EXTEND(Op: SDValue(N, 0), DAG);
    if (Lowered)
      Results.push_back(Elt: Lowered);
    return;
  }
}
12885
12886//===----------------------------------------------------------------------===//
12887// Other Lowering Code
12888//===----------------------------------------------------------------------===//
12889
12890static Instruction *callIntrinsic(IRBuilderBase &Builder, Intrinsic::ID Id) {
12891 return Builder.CreateIntrinsic(ID: Id, Args: {});
12892}
12893
12894Value *PPCTargetLowering::emitLoadLinked(IRBuilderBase &Builder, Type *ValueTy,
12895 Value *Addr,
12896 AtomicOrdering Ord) const {
12897 unsigned SZ = ValueTy->getPrimitiveSizeInBits();
12898
12899 assert((SZ == 8 || SZ == 16 || SZ == 32 || SZ == 64) &&
12900 "Only 8/16/32/64-bit atomic loads supported");
12901 Intrinsic::ID IntID;
12902 switch (SZ) {
12903 default:
12904 llvm_unreachable("Unexpected PrimitiveSize");
12905 case 8:
12906 IntID = Intrinsic::ppc_lbarx;
12907 assert(Subtarget.hasPartwordAtomics() && "No support partword atomics.");
12908 break;
12909 case 16:
12910 IntID = Intrinsic::ppc_lharx;
12911 assert(Subtarget.hasPartwordAtomics() && "No support partword atomics.");
12912 break;
12913 case 32:
12914 IntID = Intrinsic::ppc_lwarx;
12915 break;
12916 case 64:
12917 IntID = Intrinsic::ppc_ldarx;
12918 break;
12919 }
12920 Value *Call =
12921 Builder.CreateIntrinsic(ID: IntID, Args: Addr, /*FMFSource=*/nullptr, Name: "larx");
12922
12923 return Builder.CreateTruncOrBitCast(V: Call, DestTy: ValueTy);
12924}
12925
12926// Perform a store-conditional operation to Addr. Return the status of the
12927// store. This should be 0 if the store succeeded, non-zero otherwise.
12928Value *PPCTargetLowering::emitStoreConditional(IRBuilderBase &Builder,
12929 Value *Val, Value *Addr,
12930 AtomicOrdering Ord) const {
12931 Type *Ty = Val->getType();
12932 unsigned SZ = Ty->getPrimitiveSizeInBits();
12933
12934 assert((SZ == 8 || SZ == 16 || SZ == 32 || SZ == 64) &&
12935 "Only 8/16/32/64-bit atomic loads supported");
12936 Intrinsic::ID IntID;
12937 switch (SZ) {
12938 default:
12939 llvm_unreachable("Unexpected PrimitiveSize");
12940 case 8:
12941 IntID = Intrinsic::ppc_stbcx;
12942 assert(Subtarget.hasPartwordAtomics() && "No support partword atomics.");
12943 break;
12944 case 16:
12945 IntID = Intrinsic::ppc_sthcx;
12946 assert(Subtarget.hasPartwordAtomics() && "No support partword atomics.");
12947 break;
12948 case 32:
12949 IntID = Intrinsic::ppc_stwcx;
12950 break;
12951 case 64:
12952 IntID = Intrinsic::ppc_stdcx;
12953 break;
12954 }
12955
12956 if (SZ == 8 || SZ == 16)
12957 Val = Builder.CreateZExt(V: Val, DestTy: Builder.getInt32Ty());
12958
12959 Value *Call = Builder.CreateIntrinsic(ID: IntID, Args: {Addr, Val},
12960 /*FMFSource=*/nullptr, Name: "stcx");
12961 return Builder.CreateXor(LHS: Call, RHS: Builder.getInt32(C: 1));
12962}
12963
12964// The mappings for emitLeading/TrailingFence is taken from
12965// http://www.cl.cam.ac.uk/~pes20/cpp/cpp0xmappings.html
12966Instruction *PPCTargetLowering::emitLeadingFence(IRBuilderBase &Builder,
12967 Instruction *Inst,
12968 AtomicOrdering Ord) const {
12969 if (Ord == AtomicOrdering::SequentiallyConsistent)
12970 return callIntrinsic(Builder, Id: Intrinsic::ppc_sync);
12971 if (isReleaseOrStronger(AO: Ord))
12972 return callIntrinsic(Builder, Id: Intrinsic::ppc_lwsync);
12973 return nullptr;
12974}
12975
12976Instruction *PPCTargetLowering::emitTrailingFence(IRBuilderBase &Builder,
12977 Instruction *Inst,
12978 AtomicOrdering Ord) const {
12979 if (Inst->hasAtomicLoad() && isAcquireOrStronger(AO: Ord)) {
12980 // See http://www.cl.cam.ac.uk/~pes20/cpp/cpp0xmappings.html and
12981 // http://www.rdrop.com/users/paulmck/scalability/paper/N2745r.2011.03.04a.html
12982 // and http://www.cl.cam.ac.uk/~pes20/cppppc/ for justification.
12983 if (isa<LoadInst>(Val: Inst))
12984 return Builder.CreateIntrinsic(ID: Intrinsic::ppc_cfence, Types: {Inst->getType()},
12985 Args: {Inst});
12986 // FIXME: Can use isync for rmw operation.
12987 return callIntrinsic(Builder, Id: Intrinsic::ppc_lwsync);
12988 }
12989 return nullptr;
12990}
12991
// Expand a pseudo atomic read-modify-write into a load-reserve /
// store-conditional (l[bhwd]arx / st[bhwd]cx.) retry loop. BinOpcode == 0
// means plain swap (store the operand directly); a non-zero CmpOpcode adds
// a compare-and-branch so min/max-style operations keep the old value when
// the comparison sends control straight to the exit block.
MachineBasicBlock *
PPCTargetLowering::EmitAtomicBinary(MachineInstr &MI, MachineBasicBlock *BB,
                                    unsigned AtomicSize,
                                    unsigned BinOpcode,
                                    unsigned CmpOpcode,
                                    unsigned CmpPred) const {
  // This also handles ATOMIC_SWAP, indicated by BinOpcode==0.
  const TargetInstrInfo *TII = Subtarget.getInstrInfo();

  // Select the larx/stcx pair matching the access width; sub-word forms
  // require the partword-atomics feature.
  auto LoadMnemonic = PPC::LDARX;
  auto StoreMnemonic = PPC::STDCX;
  switch (AtomicSize) {
  default:
    llvm_unreachable("Unexpected size of atomic entity");
  case 1:
    LoadMnemonic = PPC::LBARX;
    StoreMnemonic = PPC::STBCX;
    assert(Subtarget.hasPartwordAtomics() && "Call this only with size >=4");
    break;
  case 2:
    LoadMnemonic = PPC::LHARX;
    StoreMnemonic = PPC::STHCX;
    assert(Subtarget.hasPartwordAtomics() && "Call this only with size >=4");
    break;
  case 4:
    LoadMnemonic = PPC::LWARX;
    StoreMnemonic = PPC::STWCX;
    break;
  case 8:
    LoadMnemonic = PPC::LDARX;
    StoreMnemonic = PPC::STDCX;
    break;
  }

  const BasicBlock *LLVM_BB = BB->getBasicBlock();
  MachineFunction *F = BB->getParent();
  MachineFunction::iterator It = ++BB->getIterator();

  Register dest = MI.getOperand(i: 0).getReg();
  Register ptrA = MI.getOperand(i: 1).getReg();
  Register ptrB = MI.getOperand(i: 2).getReg();
  Register incr = MI.getOperand(i: 3).getReg();
  DebugLoc dl = MI.getDebugLoc();

  MachineBasicBlock *loopMBB = F->CreateMachineBasicBlock(BB: LLVM_BB);
  MachineBasicBlock *loop2MBB =
      CmpOpcode ? F->CreateMachineBasicBlock(BB: LLVM_BB) : nullptr;
  MachineBasicBlock *exitMBB = F->CreateMachineBasicBlock(BB: LLVM_BB);
  F->insert(MBBI: It, MBB: loopMBB);
  if (CmpOpcode)
    F->insert(MBBI: It, MBB: loop2MBB);
  F->insert(MBBI: It, MBB: exitMBB);
  // Everything after MI moves to the exit block, which also inherits BB's
  // successors.
  exitMBB->splice(Where: exitMBB->begin(), Other: BB,
                  From: std::next(x: MachineBasicBlock::iterator(MI)), To: BB->end());
  exitMBB->transferSuccessorsAndUpdatePHIs(FromMBB: BB);

  MachineRegisterInfo &RegInfo = F->getRegInfo();
  // For a plain swap there is no value to compute: store incr directly.
  Register TmpReg = (!BinOpcode) ? incr :
    RegInfo.createVirtualRegister( RegClass: AtomicSize == 8 ? &PPC::G8RCRegClass
                                           : &PPC::GPRCRegClass);

  // thisMBB:
  // ...
  // fallthrough --> loopMBB
  BB->addSuccessor(Succ: loopMBB);

  // loopMBB:
  // l[wd]arx dest, ptr
  // add r0, dest, incr
  // st[wd]cx. r0, ptr
  // bne- loopMBB
  // fallthrough --> exitMBB

  // For max/min...
  // loopMBB:
  // l[wd]arx dest, ptr
  // cmpl?[wd] dest, incr
  // bgt exitMBB
  // loop2MBB:
  // st[wd]cx. dest, ptr
  // bne- loopMBB
  // fallthrough --> exitMBB

  BB = loopMBB;
  BuildMI(BB, MIMD: dl, MCID: TII->get(Opcode: LoadMnemonic), DestReg: dest)
    .addReg(RegNo: ptrA).addReg(RegNo: ptrB);
  if (BinOpcode)
    BuildMI(BB, MIMD: dl, MCID: TII->get(Opcode: BinOpcode), DestReg: TmpReg).addReg(RegNo: incr).addReg(RegNo: dest);
  if (CmpOpcode) {
    Register CrReg = RegInfo.createVirtualRegister(RegClass: &PPC::CRRCRegClass);
    // Signed comparisons of byte or halfword values must be sign-extended.
    if (CmpOpcode == PPC::CMPW && AtomicSize < 4) {
      Register ExtReg = RegInfo.createVirtualRegister(RegClass: &PPC::GPRCRegClass);
      BuildMI(BB, MIMD: dl, MCID: TII->get(Opcode: AtomicSize == 1 ? PPC::EXTSB : PPC::EXTSH),
              DestReg: ExtReg).addReg(RegNo: dest);
      BuildMI(BB, MIMD: dl, MCID: TII->get(Opcode: CmpOpcode), DestReg: CrReg).addReg(RegNo: ExtReg).addReg(RegNo: incr);
    } else
      BuildMI(BB, MIMD: dl, MCID: TII->get(Opcode: CmpOpcode), DestReg: CrReg).addReg(RegNo: dest).addReg(RegNo: incr);

    // If the comparison says the stored value should be kept, skip the
    // store-conditional entirely.
    BuildMI(BB, MIMD: dl, MCID: TII->get(Opcode: PPC::BCC))
        .addImm(Val: CmpPred)
        .addReg(RegNo: CrReg)
        .addMBB(MBB: exitMBB);
    BB->addSuccessor(Succ: loop2MBB);
    BB->addSuccessor(Succ: exitMBB);
    BB = loop2MBB;
  }
  // Retry the whole sequence if the store-conditional lost the reservation.
  BuildMI(BB, MIMD: dl, MCID: TII->get(Opcode: StoreMnemonic))
    .addReg(RegNo: TmpReg).addReg(RegNo: ptrA).addReg(RegNo: ptrB);
  BuildMI(BB, MIMD: dl, MCID: TII->get(Opcode: PPC::BCC))
      .addImm(Val: PPC::PRED_NE_MINUS)
      .addReg(RegNo: PPC::CR0)
      .addMBB(MBB: loopMBB);
  BB->addSuccessor(Succ: loopMBB);
  BB->addSuccessor(Succ: exitMBB);

  // exitMBB:
  // ...
  BB = exitMBB;
  return BB;
}
13113
13114static bool isSignExtended(MachineInstr &MI, const PPCInstrInfo *TII) {
13115 switch(MI.getOpcode()) {
13116 default:
13117 return false;
13118 case PPC::COPY:
13119 return TII->isSignExtended(Reg: MI.getOperand(i: 1).getReg(),
13120 MRI: &MI.getMF()->getRegInfo());
13121 case PPC::LHA:
13122 case PPC::LHA8:
13123 case PPC::LHAU:
13124 case PPC::LHAU8:
13125 case PPC::LHAUX:
13126 case PPC::LHAUX8:
13127 case PPC::LHAX:
13128 case PPC::LHAX8:
13129 case PPC::LWA:
13130 case PPC::LWAUX:
13131 case PPC::LWAX:
13132 case PPC::LWAX_32:
13133 case PPC::LWA_32:
13134 case PPC::PLHA:
13135 case PPC::PLHA8:
13136 case PPC::PLHA8pc:
13137 case PPC::PLHApc:
13138 case PPC::PLWA:
13139 case PPC::PLWA8:
13140 case PPC::PLWA8pc:
13141 case PPC::PLWApc:
13142 case PPC::EXTSB:
13143 case PPC::EXTSB8:
13144 case PPC::EXTSB8_32_64:
13145 case PPC::EXTSB8_rec:
13146 case PPC::EXTSB_rec:
13147 case PPC::EXTSH:
13148 case PPC::EXTSH8:
13149 case PPC::EXTSH8_32_64:
13150 case PPC::EXTSH8_rec:
13151 case PPC::EXTSH_rec:
13152 case PPC::EXTSW:
13153 case PPC::EXTSWSLI:
13154 case PPC::EXTSWSLI_32_64:
13155 case PPC::EXTSWSLI_32_64_rec:
13156 case PPC::EXTSWSLI_rec:
13157 case PPC::EXTSW_32:
13158 case PPC::EXTSW_32_64:
13159 case PPC::EXTSW_32_64_rec:
13160 case PPC::EXTSW_rec:
13161 case PPC::SRAW:
13162 case PPC::SRAWI:
13163 case PPC::SRAWI_rec:
13164 case PPC::SRAW_rec:
13165 return true;
13166 }
13167 return false;
13168}
13169
// Expand an 8-bit or 16-bit atomic read-modify-write into machine IR.
// BinOpcode selects the arithmetic op (0 means ATOMIC_SWAP); a nonzero
// CmpOpcode/CmpPred turns this into a min/max-style operation with an early
// exit.  When the subtarget lacks native part-word atomics (lbarx/lharx),
// the operation is emitted as a lwarx/stwcx. retry loop over the naturally
// aligned word containing the partword, with the operand shifted and masked
// into the correct byte lane.
MachineBasicBlock *PPCTargetLowering::EmitPartwordAtomicBinary(
    MachineInstr &MI, MachineBasicBlock *BB,
    bool is8bit, // operation
    unsigned BinOpcode, unsigned CmpOpcode, unsigned CmpPred) const {
  // This also handles ATOMIC_SWAP, indicated by BinOpcode==0.
  const PPCInstrInfo *TII = Subtarget.getInstrInfo();

  // If this is a signed comparison and the value being compared is not known
  // to be sign extended, sign extend it here.
  DebugLoc dl = MI.getDebugLoc();
  MachineFunction *F = BB->getParent();
  MachineRegisterInfo &RegInfo = F->getRegInfo();
  Register incr = MI.getOperand(i: 3).getReg();
  bool IsSignExtended =
      incr.isVirtual() && isSignExtended(MI&: *RegInfo.getVRegDef(Reg: incr), TII);

  if (CmpOpcode == PPC::CMPW && !IsSignExtended) {
    // Insert an extsb/extsh before MI and rewrite MI's operand so the signed
    // compare below sees a properly extended value.
    Register ValueReg = RegInfo.createVirtualRegister(RegClass: &PPC::GPRCRegClass);
    BuildMI(BB&: *BB, I&: MI, MIMD: dl, MCID: TII->get(Opcode: is8bit ? PPC::EXTSB : PPC::EXTSH), DestReg: ValueReg)
        .addReg(RegNo: MI.getOperand(i: 3).getReg());
    MI.getOperand(i: 3).setReg(ValueReg);
    incr = ValueReg;
  }
  // If we support part-word atomic mnemonics, just use them
  if (Subtarget.hasPartwordAtomics())
    return EmitAtomicBinary(MI, BB, AtomicSize: is8bit ? 1 : 2, BinOpcode, CmpOpcode,
                            CmpPred);

  // In 64 bit mode we have to use 64 bits for addresses, even though the
  // lwarx/stwcx are 32 bits. With the 32-bit atomics we can use address
  // registers without caring whether they're 32 or 64, but here we're
  // doing actual arithmetic on the addresses.
  bool is64bit = Subtarget.isPPC64();
  bool isLittleEndian = Subtarget.isLittleEndian();
  unsigned ZeroReg = is64bit ? PPC::ZERO8 : PPC::ZERO;

  const BasicBlock *LLVM_BB = BB->getBasicBlock();
  MachineFunction::iterator It = ++BB->getIterator();

  Register dest = MI.getOperand(i: 0).getReg();
  Register ptrA = MI.getOperand(i: 1).getReg();
  Register ptrB = MI.getOperand(i: 2).getReg();

  // Build the loop CFG: BB -> loopMBB [-> loop2MBB] -> exitMBB, and move
  // everything after MI (plus BB's successors) into exitMBB.
  MachineBasicBlock *loopMBB = F->CreateMachineBasicBlock(BB: LLVM_BB);
  MachineBasicBlock *loop2MBB =
      CmpOpcode ? F->CreateMachineBasicBlock(BB: LLVM_BB) : nullptr;
  MachineBasicBlock *exitMBB = F->CreateMachineBasicBlock(BB: LLVM_BB);
  F->insert(MBBI: It, MBB: loopMBB);
  if (CmpOpcode)
    F->insert(MBBI: It, MBB: loop2MBB);
  F->insert(MBBI: It, MBB: exitMBB);
  exitMBB->splice(Where: exitMBB->begin(), Other: BB,
                  From: std::next(x: MachineBasicBlock::iterator(MI)), To: BB->end());
  exitMBB->transferSuccessorsAndUpdatePHIs(FromMBB: BB);

  const TargetRegisterClass *RC =
      is64bit ? &PPC::G8RCRegClass : &PPC::GPRCRegClass;
  const TargetRegisterClass *GPRC = &PPC::GPRCRegClass;

  Register PtrReg = RegInfo.createVirtualRegister(RegClass: RC);
  Register Shift1Reg = RegInfo.createVirtualRegister(RegClass: GPRC);
  // On little-endian no byte-swap shift adjustment is needed, so the raw
  // shift amount is used directly.
  Register ShiftReg =
      isLittleEndian ? Shift1Reg : RegInfo.createVirtualRegister(RegClass: GPRC);
  Register Incr2Reg = RegInfo.createVirtualRegister(RegClass: GPRC);
  Register MaskReg = RegInfo.createVirtualRegister(RegClass: GPRC);
  Register Mask2Reg = RegInfo.createVirtualRegister(RegClass: GPRC);
  Register Mask3Reg = RegInfo.createVirtualRegister(RegClass: GPRC);
  Register Tmp2Reg = RegInfo.createVirtualRegister(RegClass: GPRC);
  Register Tmp3Reg = RegInfo.createVirtualRegister(RegClass: GPRC);
  Register Tmp4Reg = RegInfo.createVirtualRegister(RegClass: GPRC);
  Register TmpDestReg = RegInfo.createVirtualRegister(RegClass: GPRC);
  Register SrwDestReg = RegInfo.createVirtualRegister(RegClass: GPRC);
  Register Ptr1Reg;
  // For swap (no BinOpcode) the shifted increment is stored as-is.
  Register TmpReg =
      (!BinOpcode) ? Incr2Reg : RegInfo.createVirtualRegister(RegClass: GPRC);

  // thisMBB:
  //   ...
  //   fallthrough --> loopMBB
  BB->addSuccessor(Succ: loopMBB);

  // The 4-byte load must be aligned, while a char or short may be
  // anywhere in the word. Hence all this nasty bookkeeping code.
  //   add ptr1, ptrA, ptrB [copy if ptrA==0]
  //   rlwinm shift1, ptr1, 3, 27, 28 [3, 27, 27]
  //   xori shift, shift1, 24 [16]
  //   rlwinm ptr, ptr1, 0, 0, 29
  //   slw incr2, incr, shift
  //   li mask2, 255 [li mask3, 0; ori mask2, mask3, 65535]
  //   slw mask, mask2, shift
  //  loopMBB:
  //   lwarx tmpDest, ptr
  //   add tmp, tmpDest, incr2
  //   andc tmp2, tmpDest, mask
  //   and tmp3, tmp, mask
  //   or tmp4, tmp3, tmp2
  //   stwcx. tmp4, ptr
  //   bne- loopMBB
  //   fallthrough --> exitMBB
  //   srw SrwDest, tmpDest, shift
  //   rlwinm SrwDest, SrwDest, 0, 24 [16], 31
  if (ptrA != ZeroReg) {
    Ptr1Reg = RegInfo.createVirtualRegister(RegClass: RC);
    BuildMI(BB, MIMD: dl, MCID: TII->get(Opcode: is64bit ? PPC::ADD8 : PPC::ADD4), DestReg: Ptr1Reg)
        .addReg(RegNo: ptrA)
        .addReg(RegNo: ptrB);
  } else {
    Ptr1Reg = ptrB;
  }
  // We need use 32-bit subregister to avoid mismatch register class in 64-bit
  // mode.
  BuildMI(BB, MIMD: dl, MCID: TII->get(Opcode: PPC::RLWINM), DestReg: Shift1Reg)
      .addReg(RegNo: Ptr1Reg, Flags: {}, SubReg: is64bit ? PPC::sub_32 : 0)
      .addImm(Val: 3)
      .addImm(Val: 27)
      .addImm(Val: is8bit ? 28 : 27);
  // Big-endian lanes are numbered from the other end of the word, so flip
  // the bit offset within the word.
  if (!isLittleEndian)
    BuildMI(BB, MIMD: dl, MCID: TII->get(Opcode: PPC::XORI), DestReg: ShiftReg)
        .addReg(RegNo: Shift1Reg)
        .addImm(Val: is8bit ? 24 : 16);
  // Clear the low 2 bits of the pointer to get the aligned word address.
  if (is64bit)
    BuildMI(BB, MIMD: dl, MCID: TII->get(Opcode: PPC::RLDICR), DestReg: PtrReg)
        .addReg(RegNo: Ptr1Reg)
        .addImm(Val: 0)
        .addImm(Val: 61);
  else
    BuildMI(BB, MIMD: dl, MCID: TII->get(Opcode: PPC::RLWINM), DestReg: PtrReg)
        .addReg(RegNo: Ptr1Reg)
        .addImm(Val: 0)
        .addImm(Val: 0)
        .addImm(Val: 29);
  BuildMI(BB, MIMD: dl, MCID: TII->get(Opcode: PPC::SLW), DestReg: Incr2Reg).addReg(RegNo: incr).addReg(RegNo: ShiftReg);
  // Build the lane mask: 0xFF for bytes, 0xFFFF for halfwords.
  if (is8bit)
    BuildMI(BB, MIMD: dl, MCID: TII->get(Opcode: PPC::LI), DestReg: Mask2Reg).addImm(Val: 255);
  else {
    BuildMI(BB, MIMD: dl, MCID: TII->get(Opcode: PPC::LI), DestReg: Mask3Reg).addImm(Val: 0);
    BuildMI(BB, MIMD: dl, MCID: TII->get(Opcode: PPC::ORI), DestReg: Mask2Reg)
        .addReg(RegNo: Mask3Reg)
        .addImm(Val: 65535);
  }
  BuildMI(BB, MIMD: dl, MCID: TII->get(Opcode: PPC::SLW), DestReg: MaskReg)
      .addReg(RegNo: Mask2Reg)
      .addReg(RegNo: ShiftReg);

  BB = loopMBB;
  BuildMI(BB, MIMD: dl, MCID: TII->get(Opcode: PPC::LWARX), DestReg: TmpDestReg)
      .addReg(RegNo: ZeroReg)
      .addReg(RegNo: PtrReg);
  if (BinOpcode)
    BuildMI(BB, MIMD: dl, MCID: TII->get(Opcode: BinOpcode), DestReg: TmpReg)
        .addReg(RegNo: Incr2Reg)
        .addReg(RegNo: TmpDestReg);
  BuildMI(BB, MIMD: dl, MCID: TII->get(Opcode: PPC::ANDC), DestReg: Tmp2Reg)
      .addReg(RegNo: TmpDestReg)
      .addReg(RegNo: MaskReg);
  BuildMI(BB, MIMD: dl, MCID: TII->get(Opcode: PPC::AND), DestReg: Tmp3Reg).addReg(RegNo: TmpReg).addReg(RegNo: MaskReg);
  if (CmpOpcode) {
    // For unsigned comparisons, we can directly compare the shifted values.
    // For signed comparisons we shift and sign extend.
    Register SReg = RegInfo.createVirtualRegister(RegClass: GPRC);
    Register CrReg = RegInfo.createVirtualRegister(RegClass: &PPC::CRRCRegClass);
    BuildMI(BB, MIMD: dl, MCID: TII->get(Opcode: PPC::AND), DestReg: SReg)
        .addReg(RegNo: TmpDestReg)
        .addReg(RegNo: MaskReg);
    unsigned ValueReg = SReg;
    unsigned CmpReg = Incr2Reg;
    if (CmpOpcode == PPC::CMPW) {
      // Shift the loaded lane down and sign-extend it, then compare against
      // the original (already sign-extended) incr value.
      ValueReg = RegInfo.createVirtualRegister(RegClass: GPRC);
      BuildMI(BB, MIMD: dl, MCID: TII->get(Opcode: PPC::SRW), DestReg: ValueReg)
          .addReg(RegNo: SReg)
          .addReg(RegNo: ShiftReg);
      Register ValueSReg = RegInfo.createVirtualRegister(RegClass: GPRC);
      BuildMI(BB, MIMD: dl, MCID: TII->get(Opcode: is8bit ? PPC::EXTSB : PPC::EXTSH), DestReg: ValueSReg)
          .addReg(RegNo: ValueReg);
      ValueReg = ValueSReg;
      CmpReg = incr;
    }
    BuildMI(BB, MIMD: dl, MCID: TII->get(Opcode: CmpOpcode), DestReg: CrReg).addReg(RegNo: ValueReg).addReg(RegNo: CmpReg);
    BuildMI(BB, MIMD: dl, MCID: TII->get(Opcode: PPC::BCC))
        .addImm(Val: CmpPred)
        .addReg(RegNo: CrReg)
        .addMBB(MBB: exitMBB);
    BB->addSuccessor(Succ: loop2MBB);
    BB->addSuccessor(Succ: exitMBB);
    BB = loop2MBB;
  }
  BuildMI(BB, MIMD: dl, MCID: TII->get(Opcode: PPC::OR), DestReg: Tmp4Reg).addReg(RegNo: Tmp3Reg).addReg(RegNo: Tmp2Reg);
  BuildMI(BB, MIMD: dl, MCID: TII->get(Opcode: PPC::STWCX))
      .addReg(RegNo: Tmp4Reg)
      .addReg(RegNo: ZeroReg)
      .addReg(RegNo: PtrReg);
  // stwcx. sets CR0; retry the whole loop if the reservation was lost.
  BuildMI(BB, MIMD: dl, MCID: TII->get(Opcode: PPC::BCC))
      .addImm(Val: PPC::PRED_NE_MINUS)
      .addReg(RegNo: PPC::CR0)
      .addMBB(MBB: loopMBB);
  BB->addSuccessor(Succ: loopMBB);
  BB->addSuccessor(Succ: exitMBB);

  // exitMBB:
  //   ...
  BB = exitMBB;
  // Since the shift amount is not a constant, we need to clear
  // the upper bits with a separate RLWINM.
  // NOTE: both instructions below are inserted at exitMBB->begin(), so the
  // second BuildMI (SRW) ends up *before* the first (RLWINM) — the executed
  // order is srw then rlwinm, which is what we need.
  BuildMI(BB&: *BB, I: BB->begin(), MIMD: dl, MCID: TII->get(Opcode: PPC::RLWINM), DestReg: dest)
      .addReg(RegNo: SrwDestReg)
      .addImm(Val: 0)
      .addImm(Val: is8bit ? 24 : 16)
      .addImm(Val: 31);
  BuildMI(BB&: *BB, I: BB->begin(), MIMD: dl, MCID: TII->get(Opcode: PPC::SRW), DestReg: SrwDestReg)
      .addReg(RegNo: TmpDestReg)
      .addReg(RegNo: ShiftReg);
  return BB;
}
13383
// Lower the EH_SjLj_SetJmp32/64 pseudo: build the thisMBB/mainMBB/sinkMBB
// diamond sketched below, storing LR (the resume address), the base pointer,
// and (on 64-bit ELF) the TOC pointer into the jmp_buf, and producing 0 on
// the normal path and 1 on the longjmp-resume path via a PHI in sinkMBB.
llvm::MachineBasicBlock *
PPCTargetLowering::emitEHSjLjSetJmp(MachineInstr &MI,
                                    MachineBasicBlock *MBB) const {
  DebugLoc DL = MI.getDebugLoc();
  const TargetInstrInfo *TII = Subtarget.getInstrInfo();
  const PPCRegisterInfo *TRI = Subtarget.getRegisterInfo();

  MachineFunction *MF = MBB->getParent();
  MachineRegisterInfo &MRI = MF->getRegInfo();

  const BasicBlock *BB = MBB->getBasicBlock();
  MachineFunction::iterator I = ++MBB->getIterator();

  Register DstReg = MI.getOperand(i: 0).getReg();
  const TargetRegisterClass *RC = MRI.getRegClass(Reg: DstReg);
  assert(TRI->isTypeLegalForClass(*RC, MVT::i32) && "Invalid destination!");
  Register mainDstReg = MRI.createVirtualRegister(RegClass: RC);
  Register restoreDstReg = MRI.createVirtualRegister(RegClass: RC);

  MVT PVT = getPointerTy(DL: MF->getDataLayout());
  assert((PVT == MVT::i64 || PVT == MVT::i32) &&
         "Invalid Pointer Size!");
  // For v = setjmp(buf), we generate
  //
  // thisMBB:
  //  SjLjSetup mainMBB
  //  bl mainMBB
  //  v_restore = 1
  //  b sinkMBB
  //
  // mainMBB:
  //  buf[LabelOffset] = LR
  //  v_main = 0
  //
  // sinkMBB:
  //  v = phi(main, restore)
  //

  MachineBasicBlock *thisMBB = MBB;
  MachineBasicBlock *mainMBB = MF->CreateMachineBasicBlock(BB);
  MachineBasicBlock *sinkMBB = MF->CreateMachineBasicBlock(BB);
  MF->insert(MBBI: I, MBB: mainMBB);
  MF->insert(MBBI: I, MBB: sinkMBB);

  MachineInstrBuilder MIB;

  // Transfer the remainder of BB and its successor edges to sinkMBB.
  sinkMBB->splice(Where: sinkMBB->begin(), Other: MBB,
                  From: std::next(x: MachineBasicBlock::iterator(MI)), To: MBB->end());
  sinkMBB->transferSuccessorsAndUpdatePHIs(FromMBB: MBB);

  // Note that the structure of the jmp_buf used here is not compatible
  // with that used by libc, and is not designed to be. Specifically, it
  // stores only those 'reserved' registers that LLVM does not otherwise
  // understand how to spill. Also, by convention, by the time this
  // intrinsic is called, Clang has already stored the frame address in the
  // first slot of the buffer and stack address in the third. Following the
  // X86 target code, we'll store the jump address in the second slot. We also
  // need to save the TOC pointer (R2) to handle jumps between shared
  // libraries, and that will be stored in the fourth slot. The thread
  // identifier (R13) is not affected.

  // thisMBB:
  // Slot offsets within the jmp_buf, in pointer-size units (see layout
  // comment above).
  const int64_t LabelOffset = 1 * PVT.getStoreSize();
  const int64_t TOCOffset = 3 * PVT.getStoreSize();
  const int64_t BPOffset = 4 * PVT.getStoreSize();

  // Prepare the resume address (IP) in a register.
  const TargetRegisterClass *PtrRC = getRegClassFor(VT: PVT);
  Register LabelReg = MRI.createVirtualRegister(RegClass: PtrRC);
  Register BufReg = MI.getOperand(i: 1).getReg();

  // 64-bit ELF: save the TOC pointer (X2) so cross-module longjmps can
  // restore it.
  if (Subtarget.is64BitELFABI()) {
    setUsesTOCBasePtr(*MBB->getParent());
    MIB = BuildMI(BB&: *thisMBB, I&: MI, MIMD: DL, MCID: TII->get(Opcode: PPC::STD))
              .addReg(RegNo: PPC::X2)
              .addImm(Val: TOCOffset)
              .addReg(RegNo: BufReg)
              .cloneMemRefs(OtherMI: MI);
  }

  // Naked functions never have a base pointer, and so we use r1. For all
  // other functions, this decision must be delayed until during PEI.
  unsigned BaseReg;
  if (MF->getFunction().hasFnAttribute(Kind: Attribute::Naked))
    BaseReg = Subtarget.isPPC64() ? PPC::X1 : PPC::R1;
  else
    BaseReg = Subtarget.isPPC64() ? PPC::BP8 : PPC::BP;

  MIB = BuildMI(BB&: *thisMBB, I&: MI, MIMD: DL,
                MCID: TII->get(Opcode: Subtarget.isPPC64() ? PPC::STD : PPC::STW))
            .addReg(RegNo: BaseReg)
            .addImm(Val: BPOffset)
            .addReg(RegNo: BufReg)
            .cloneMemRefs(OtherMI: MI);

  // Setup
  // BCLalways branch-and-links to mainMBB, so LR holds mainMBB's address
  // there (read back via MFLR below). All registers are clobbered across a
  // longjmp, hence the no-preserved regmask.
  MIB = BuildMI(BB&: *thisMBB, I&: MI, MIMD: DL, MCID: TII->get(Opcode: PPC::BCLalways)).addMBB(MBB: mainMBB);
  MIB.addRegMask(Mask: TRI->getNoPreservedMask());

  // Value seen when resuming from a longjmp.
  BuildMI(BB&: *thisMBB, I&: MI, MIMD: DL, MCID: TII->get(Opcode: PPC::LI), DestReg: restoreDstReg).addImm(Val: 1);

  MIB = BuildMI(BB&: *thisMBB, I&: MI, MIMD: DL, MCID: TII->get(Opcode: PPC::EH_SjLj_Setup))
            .addMBB(MBB: mainMBB);
  MIB = BuildMI(BB&: *thisMBB, I&: MI, MIMD: DL, MCID: TII->get(Opcode: PPC::B)).addMBB(MBB: sinkMBB);

  // The mainMBB edge is only reachable via the setup/longjmp machinery,
  // hence probability zero.
  thisMBB->addSuccessor(Succ: mainMBB, Prob: BranchProbability::getZero());
  thisMBB->addSuccessor(Succ: sinkMBB, Prob: BranchProbability::getOne());

  // mainMBB:
  //  mainDstReg = 0
  MIB =
      BuildMI(BB: mainMBB, MIMD: DL,
              MCID: TII->get(Opcode: Subtarget.isPPC64() ? PPC::MFLR8 : PPC::MFLR), DestReg: LabelReg);

  // Store IP
  if (Subtarget.isPPC64()) {
    MIB = BuildMI(BB: mainMBB, MIMD: DL, MCID: TII->get(Opcode: PPC::STD))
              .addReg(RegNo: LabelReg)
              .addImm(Val: LabelOffset)
              .addReg(RegNo: BufReg);
  } else {
    MIB = BuildMI(BB: mainMBB, MIMD: DL, MCID: TII->get(Opcode: PPC::STW))
              .addReg(RegNo: LabelReg)
              .addImm(Val: LabelOffset)
              .addReg(RegNo: BufReg);
  }
  MIB.cloneMemRefs(OtherMI: MI);

  // Value seen on the normal (first-return) path.
  BuildMI(BB: mainMBB, MIMD: DL, MCID: TII->get(Opcode: PPC::LI), DestReg: mainDstReg).addImm(Val: 0);
  mainMBB->addSuccessor(Succ: sinkMBB);

  // sinkMBB:
  // Merge the two result values: 0 from mainMBB, 1 from thisMBB.
  BuildMI(BB&: *sinkMBB, I: sinkMBB->begin(), MIMD: DL,
          MCID: TII->get(Opcode: PPC::PHI), DestReg: DstReg)
      .addReg(RegNo: mainDstReg).addMBB(MBB: mainMBB)
      .addReg(RegNo: restoreDstReg).addMBB(MBB: thisMBB);

  MI.eraseFromParent();
  return sinkMBB;
}
13525
// Lower the EH_SjLj_LongJmp32/64 pseudo: reload FP, the resume address (IP),
// SP, BP, and (64-bit SVR4) the TOC pointer from the jmp_buf written by
// emitEHSjLjSetJmp, then jump to the reloaded IP via CTR.
MachineBasicBlock *
PPCTargetLowering::emitEHSjLjLongJmp(MachineInstr &MI,
                                     MachineBasicBlock *MBB) const {
  DebugLoc DL = MI.getDebugLoc();
  const TargetInstrInfo *TII = Subtarget.getInstrInfo();

  MachineFunction *MF = MBB->getParent();
  MachineRegisterInfo &MRI = MF->getRegInfo();

  MVT PVT = getPointerTy(DL: MF->getDataLayout());
  assert((PVT == MVT::i64 || PVT == MVT::i32) &&
         "Invalid Pointer Size!");

  const TargetRegisterClass *RC =
      (PVT == MVT::i64) ? &PPC::G8RCRegClass : &PPC::GPRCRegClass;
  Register Tmp = MRI.createVirtualRegister(RegClass: RC);
  // Since FP is only updated here but NOT referenced, it's treated as GPR.
  unsigned FP = (PVT == MVT::i64) ? PPC::X31 : PPC::R31;
  unsigned SP = (PVT == MVT::i64) ? PPC::X1 : PPC::R1;
  // 32-bit PIC SVR4 uses R29 as the base pointer; everything else uses
  // X30/R30.
  unsigned BP =
      (PVT == MVT::i64)
          ? PPC::X30
          : (Subtarget.isSVR4ABI() && isPositionIndependent() ? PPC::R29
                                                              : PPC::R30);

  MachineInstrBuilder MIB;

  // jmp_buf slot offsets; must match the layout written by emitEHSjLjSetJmp
  // (slot 0 = frame address, 1 = IP, 2 = stack address, 3 = TOC, 4 = BP).
  const int64_t LabelOffset = 1 * PVT.getStoreSize();
  const int64_t SPOffset = 2 * PVT.getStoreSize();
  const int64_t TOCOffset = 3 * PVT.getStoreSize();
  const int64_t BPOffset = 4 * PVT.getStoreSize();

  Register BufReg = MI.getOperand(i: 0).getReg();

  // Reload FP (the jumped-to function may not have had a
  // frame pointer, and if so, then its r31 will be restored
  // as necessary).
  if (PVT == MVT::i64) {
    MIB = BuildMI(BB&: *MBB, I&: MI, MIMD: DL, MCID: TII->get(Opcode: PPC::LD), DestReg: FP)
            .addImm(Val: 0)
            .addReg(RegNo: BufReg);
  } else {
    MIB = BuildMI(BB&: *MBB, I&: MI, MIMD: DL, MCID: TII->get(Opcode: PPC::LWZ), DestReg: FP)
            .addImm(Val: 0)
            .addReg(RegNo: BufReg);
  }
  MIB.cloneMemRefs(OtherMI: MI);

  // Reload IP
  if (PVT == MVT::i64) {
    MIB = BuildMI(BB&: *MBB, I&: MI, MIMD: DL, MCID: TII->get(Opcode: PPC::LD), DestReg: Tmp)
            .addImm(Val: LabelOffset)
            .addReg(RegNo: BufReg);
  } else {
    MIB = BuildMI(BB&: *MBB, I&: MI, MIMD: DL, MCID: TII->get(Opcode: PPC::LWZ), DestReg: Tmp)
            .addImm(Val: LabelOffset)
            .addReg(RegNo: BufReg);
  }
  MIB.cloneMemRefs(OtherMI: MI);

  // Reload SP
  if (PVT == MVT::i64) {
    MIB = BuildMI(BB&: *MBB, I&: MI, MIMD: DL, MCID: TII->get(Opcode: PPC::LD), DestReg: SP)
            .addImm(Val: SPOffset)
            .addReg(RegNo: BufReg);
  } else {
    MIB = BuildMI(BB&: *MBB, I&: MI, MIMD: DL, MCID: TII->get(Opcode: PPC::LWZ), DestReg: SP)
            .addImm(Val: SPOffset)
            .addReg(RegNo: BufReg);
  }
  MIB.cloneMemRefs(OtherMI: MI);

  // Reload BP
  if (PVT == MVT::i64) {
    MIB = BuildMI(BB&: *MBB, I&: MI, MIMD: DL, MCID: TII->get(Opcode: PPC::LD), DestReg: BP)
            .addImm(Val: BPOffset)
            .addReg(RegNo: BufReg);
  } else {
    MIB = BuildMI(BB&: *MBB, I&: MI, MIMD: DL, MCID: TII->get(Opcode: PPC::LWZ), DestReg: BP)
            .addImm(Val: BPOffset)
            .addReg(RegNo: BufReg);
  }
  MIB.cloneMemRefs(OtherMI: MI);

  // Reload TOC
  if (PVT == MVT::i64 && Subtarget.isSVR4ABI()) {
    setUsesTOCBasePtr(*MBB->getParent());
    MIB = BuildMI(BB&: *MBB, I&: MI, MIMD: DL, MCID: TII->get(Opcode: PPC::LD), DestReg: PPC::X2)
              .addImm(Val: TOCOffset)
              .addReg(RegNo: BufReg)
              .cloneMemRefs(OtherMI: MI);
  }

  // Jump
  // Indirect branch through CTR to the reloaded resume address.
  BuildMI(BB&: *MBB, I&: MI, MIMD: DL,
          MCID: TII->get(Opcode: PVT == MVT::i64 ? PPC::MTCTR8 : PPC::MTCTR)).addReg(RegNo: Tmp);
  BuildMI(BB&: *MBB, I&: MI, MIMD: DL, MCID: TII->get(Opcode: PVT == MVT::i64 ? PPC::BCTR8 : PPC::BCTR));

  MI.eraseFromParent();
  return MBB;
}
13627
13628bool PPCTargetLowering::hasInlineStackProbe(const MachineFunction &MF) const {
13629 // If the function specifically requests inline stack probes, emit them.
13630 if (MF.getFunction().hasFnAttribute(Kind: "probe-stack"))
13631 return MF.getFunction().getFnAttribute(Kind: "probe-stack").getValueAsString() ==
13632 "inline-asm";
13633 return false;
13634}
13635
13636unsigned PPCTargetLowering::getStackProbeSize(const MachineFunction &MF) const {
13637 const TargetFrameLowering *TFI = Subtarget.getFrameLowering();
13638 unsigned StackAlign = TFI->getStackAlignment();
13639 assert(StackAlign >= 1 && isPowerOf2_32(StackAlign) &&
13640 "Unexpected stack alignment");
13641 // The default stack probe size is 4096 if the function has no
13642 // stack-probe-size attribute.
13643 const Function &Fn = MF.getFunction();
13644 unsigned StackProbeSize =
13645 Fn.getFnAttributeAsParsedInteger(Kind: "stack-probe-size", Default: 4096);
13646 // Round down to the stack alignment.
13647 StackProbeSize &= ~(StackAlign - 1);
13648 return StackProbeSize ? StackProbeSize : StackAlign;
13649}
13650
// Lower dynamic stack allocation with probing. `emitProbedAlloca` is split
// into three phases. In the first phase, it uses pseudo instruction
// PREPARE_PROBED_ALLOCA to get the future result of actual FramePointer and
// FinalStackPtr. In the second phase, it generates a loop for probing blocks.
// At last, it uses pseudo instruction DYNAREAOFFSET to get the future result
// of MaxCallFrameSize so that it can calculate correct data area pointer.
MachineBasicBlock *
PPCTargetLowering::emitProbedAlloca(MachineInstr &MI,
                                    MachineBasicBlock *MBB) const {
  const bool isPPC64 = Subtarget.isPPC64();
  MachineFunction *MF = MBB->getParent();
  const TargetInstrInfo *TII = Subtarget.getInstrInfo();
  DebugLoc DL = MI.getDebugLoc();
  const unsigned ProbeSize = getStackProbeSize(MF: *MF);
  const BasicBlock *ProbedBB = MBB->getBasicBlock();
  MachineRegisterInfo &MRI = MF->getRegInfo();
  // The CFG of probing stack looks as
  //          +-----+
  //          | MBB |
  //          +--+--+
  //             |
  //        +----v----+
  //   +--->+ TestMBB +---+
  //   |    +----+----+   |
  //   |         |        |
  //   |    +----v-----+  |
  //   +----+ BlockMBB |  |
  //        +----------+  |
  //                      |
  //        +---------+   |
  //        | TailMBB +<--+
  //        +---------+
  // In MBB, calculate previous frame pointer and final stack pointer.
  // In TestMBB, test if sp is equal to final stack pointer, if so, jump to
  // TailMBB. In BlockMBB, update the sp atomically and jump back to TestMBB.
  // TailMBB is spliced via \p MI.
  MachineBasicBlock *TestMBB = MF->CreateMachineBasicBlock(BB: ProbedBB);
  MachineBasicBlock *TailMBB = MF->CreateMachineBasicBlock(BB: ProbedBB);
  MachineBasicBlock *BlockMBB = MF->CreateMachineBasicBlock(BB: ProbedBB);

  MachineFunction::iterator MBBIter = ++MBB->getIterator();
  MF->insert(MBBI: MBBIter, MBB: TestMBB);
  MF->insert(MBBI: MBBIter, MBB: BlockMBB);
  MF->insert(MBBI: MBBIter, MBB: TailMBB);

  const TargetRegisterClass *G8RC = &PPC::G8RCRegClass;
  const TargetRegisterClass *GPRC = &PPC::GPRCRegClass;

  Register DstReg = MI.getOperand(i: 0).getReg();
  Register NegSizeReg = MI.getOperand(i: 1).getReg();
  Register SPReg = isPPC64 ? PPC::X1 : PPC::R1;
  Register FinalStackPtr = MRI.createVirtualRegister(RegClass: isPPC64 ? G8RC : GPRC);
  Register FramePointer = MRI.createVirtualRegister(RegClass: isPPC64 ? G8RC : GPRC);
  Register ActualNegSizeReg = MRI.createVirtualRegister(RegClass: isPPC64 ? G8RC : GPRC);

  // Since value of NegSizeReg might be realigned in prologue/epilogue
  // insertion, insert a PREPARE_PROBED_ALLOCA pseudo instruction to get
  // actual FramePointer and NegSize.
  unsigned ProbeOpc;
  if (!MRI.hasOneNonDBGUse(RegNo: NegSizeReg))
    ProbeOpc =
        isPPC64 ? PPC::PREPARE_PROBED_ALLOCA_64 : PPC::PREPARE_PROBED_ALLOCA_32;
  else
    // By introducing PREPARE_PROBED_ALLOCA_NEGSIZE_OPT, ActualNegSizeReg
    // and NegSizeReg will be allocated in the same phyreg to avoid
    // redundant copy when NegSizeReg has only one use which is current MI and
    // will be replaced by PREPARE_PROBED_ALLOCA then.
    ProbeOpc = isPPC64 ? PPC::PREPARE_PROBED_ALLOCA_NEGSIZE_SAME_REG_64
                       : PPC::PREPARE_PROBED_ALLOCA_NEGSIZE_SAME_REG_32;
  BuildMI(BB&: *MBB, I&: {MI}, MIMD: DL, MCID: TII->get(Opcode: ProbeOpc), DestReg: FramePointer)
      .addDef(RegNo: ActualNegSizeReg)
      .addReg(RegNo: NegSizeReg)
      .add(MO: MI.getOperand(i: 2))
      .add(MO: MI.getOperand(i: 3));

  // Calculate final stack pointer, which equals to SP + ActualNegSize.
  BuildMI(BB&: *MBB, I&: {MI}, MIMD: DL, MCID: TII->get(Opcode: isPPC64 ? PPC::ADD8 : PPC::ADD4),
          DestReg: FinalStackPtr)
      .addReg(RegNo: SPReg)
      .addReg(RegNo: ActualNegSizeReg);

  // Materialize a scratch register for update.
  // The negative probe size may not fit in a 16-bit immediate, in which case
  // it is built with lis/ori.
  int64_t NegProbeSize = -(int64_t)ProbeSize;
  assert(isInt<32>(NegProbeSize) && "Unhandled probe size!");
  Register ScratchReg = MRI.createVirtualRegister(RegClass: isPPC64 ? G8RC : GPRC);
  if (!isInt<16>(x: NegProbeSize)) {
    Register TempReg = MRI.createVirtualRegister(RegClass: isPPC64 ? G8RC : GPRC);
    BuildMI(BB&: *MBB, I&: {MI}, MIMD: DL, MCID: TII->get(Opcode: isPPC64 ? PPC::LIS8 : PPC::LIS), DestReg: TempReg)
        .addImm(Val: NegProbeSize >> 16);
    BuildMI(BB&: *MBB, I&: {MI}, MIMD: DL, MCID: TII->get(Opcode: isPPC64 ? PPC::ORI8 : PPC::ORI),
            DestReg: ScratchReg)
        .addReg(RegNo: TempReg)
        .addImm(Val: NegProbeSize & 0xFFFF);
  } else
    BuildMI(BB&: *MBB, I&: {MI}, MIMD: DL, MCID: TII->get(Opcode: isPPC64 ? PPC::LI8 : PPC::LI), DestReg: ScratchReg)
        .addImm(Val: NegProbeSize);

  {
    // Probing leading residual part.
    // Compute NegMod = ActualNegSize - (ActualNegSize / ProbeSize) * ProbeSize
    // (the remainder), and probe it with a single stack-pointer update.
    Register Div = MRI.createVirtualRegister(RegClass: isPPC64 ? G8RC : GPRC);
    BuildMI(BB&: *MBB, I&: {MI}, MIMD: DL, MCID: TII->get(Opcode: isPPC64 ? PPC::DIVD : PPC::DIVW), DestReg: Div)
        .addReg(RegNo: ActualNegSizeReg)
        .addReg(RegNo: ScratchReg);
    Register Mul = MRI.createVirtualRegister(RegClass: isPPC64 ? G8RC : GPRC);
    BuildMI(BB&: *MBB, I&: {MI}, MIMD: DL, MCID: TII->get(Opcode: isPPC64 ? PPC::MULLD : PPC::MULLW), DestReg: Mul)
        .addReg(RegNo: Div)
        .addReg(RegNo: ScratchReg);
    Register NegMod = MRI.createVirtualRegister(RegClass: isPPC64 ? G8RC : GPRC);
    BuildMI(BB&: *MBB, I&: {MI}, MIMD: DL, MCID: TII->get(Opcode: isPPC64 ? PPC::SUBF8 : PPC::SUBF), DestReg: NegMod)
        .addReg(RegNo: Mul)
        .addReg(RegNo: ActualNegSizeReg);
    // stdux/stwux updates SP and stores the old frame pointer in one step.
    BuildMI(BB&: *MBB, I&: {MI}, MIMD: DL, MCID: TII->get(Opcode: isPPC64 ? PPC::STDUX : PPC::STWUX), DestReg: SPReg)
        .addReg(RegNo: FramePointer)
        .addReg(RegNo: SPReg)
        .addReg(RegNo: NegMod);
  }

  {
    // Remaining part should be multiple of ProbeSize.
    // Loop exit: SP has reached FinalStackPtr.
    Register CmpResult = MRI.createVirtualRegister(RegClass: &PPC::CRRCRegClass);
    BuildMI(BB: TestMBB, MIMD: DL, MCID: TII->get(Opcode: isPPC64 ? PPC::CMPD : PPC::CMPW), DestReg: CmpResult)
        .addReg(RegNo: SPReg)
        .addReg(RegNo: FinalStackPtr);
    BuildMI(BB: TestMBB, MIMD: DL, MCID: TII->get(Opcode: PPC::BCC))
        .addImm(Val: PPC::PRED_EQ)
        .addReg(RegNo: CmpResult)
        .addMBB(MBB: TailMBB);
    TestMBB->addSuccessor(Succ: BlockMBB);
    TestMBB->addSuccessor(Succ: TailMBB);
  }

  {
    // Touch the block.
    // |P...|P...|P...
    // Advance SP by one probe stride per iteration, storing to touch the page.
    BuildMI(BB: BlockMBB, MIMD: DL, MCID: TII->get(Opcode: isPPC64 ? PPC::STDUX : PPC::STWUX), DestReg: SPReg)
        .addReg(RegNo: FramePointer)
        .addReg(RegNo: SPReg)
        .addReg(RegNo: ScratchReg);
    BuildMI(BB: BlockMBB, MIMD: DL, MCID: TII->get(Opcode: PPC::B)).addMBB(MBB: TestMBB);
    BlockMBB->addSuccessor(Succ: TestMBB);
  }

  // Calculation of MaxCallFrameSize is deferred to prologue/epilogue
  // insertion; use the DYNAREAOFFSET pseudo instruction to get the future
  // result.
  Register MaxCallFrameSizeReg =
      MRI.createVirtualRegister(RegClass: isPPC64 ? G8RC : GPRC);
  BuildMI(BB: TailMBB, MIMD: DL,
          MCID: TII->get(Opcode: isPPC64 ? PPC::DYNAREAOFFSET8 : PPC::DYNAREAOFFSET),
          DestReg: MaxCallFrameSizeReg)
      .add(MO: MI.getOperand(i: 2))
      .add(MO: MI.getOperand(i: 3));
  // The usable data area starts above the (deferred) call frame area.
  BuildMI(BB: TailMBB, MIMD: DL, MCID: TII->get(Opcode: isPPC64 ? PPC::ADD8 : PPC::ADD4), DestReg: DstReg)
      .addReg(RegNo: SPReg)
      .addReg(RegNo: MaxCallFrameSizeReg);

  // Splice instructions after MI to TailMBB.
  TailMBB->splice(Where: TailMBB->end(), Other: MBB,
                  From: std::next(x: MachineBasicBlock::iterator(MI)), To: MBB->end());
  TailMBB->transferSuccessorsAndUpdatePHIs(FromMBB: MBB);
  MBB->addSuccessor(Succ: TestMBB);

  // Delete the pseudo instruction.
  MI.eraseFromParent();

  ++NumDynamicAllocaProbed;
  return TailMBB;
}
13818
13819static bool IsSelectCC(MachineInstr &MI) {
13820 switch (MI.getOpcode()) {
13821 case PPC::SELECT_CC_I4:
13822 case PPC::SELECT_CC_I8:
13823 case PPC::SELECT_CC_F4:
13824 case PPC::SELECT_CC_F8:
13825 case PPC::SELECT_CC_F16:
13826 case PPC::SELECT_CC_VRRC:
13827 case PPC::SELECT_CC_VSFRC:
13828 case PPC::SELECT_CC_VSSRC:
13829 case PPC::SELECT_CC_VSRC:
13830 case PPC::SELECT_CC_SPE4:
13831 case PPC::SELECT_CC_SPE:
13832 return true;
13833 default:
13834 return false;
13835 }
13836}
13837
13838static bool IsSelect(MachineInstr &MI) {
13839 switch (MI.getOpcode()) {
13840 case PPC::SELECT_I4:
13841 case PPC::SELECT_I8:
13842 case PPC::SELECT_F4:
13843 case PPC::SELECT_F8:
13844 case PPC::SELECT_F16:
13845 case PPC::SELECT_SPE:
13846 case PPC::SELECT_SPE4:
13847 case PPC::SELECT_VRRC:
13848 case PPC::SELECT_VSFRC:
13849 case PPC::SELECT_VSSRC:
13850 case PPC::SELECT_VSRC:
13851 return true;
13852 default:
13853 return false;
13854 }
13855}
13856
13857MachineBasicBlock *
13858PPCTargetLowering::EmitInstrWithCustomInserter(MachineInstr &MI,
13859 MachineBasicBlock *BB) const {
13860 if (MI.getOpcode() == TargetOpcode::STACKMAP ||
13861 MI.getOpcode() == TargetOpcode::PATCHPOINT) {
13862 if (Subtarget.is64BitELFABI() &&
13863 MI.getOpcode() == TargetOpcode::PATCHPOINT &&
13864 !Subtarget.isUsingPCRelativeCalls()) {
13865 // Call lowering should have added an r2 operand to indicate a dependence
13866 // on the TOC base pointer value. It can't however, because there is no
13867 // way to mark the dependence as implicit there, and so the stackmap code
13868 // will confuse it with a regular operand. Instead, add the dependence
13869 // here.
13870 MI.addOperand(Op: MachineOperand::CreateReg(Reg: PPC::X2, isDef: false, isImp: true));
13871 }
13872
13873 return emitPatchPoint(MI, MBB: BB);
13874 }
13875
13876 if (MI.getOpcode() == PPC::EH_SjLj_SetJmp32 ||
13877 MI.getOpcode() == PPC::EH_SjLj_SetJmp64) {
13878 return emitEHSjLjSetJmp(MI, MBB: BB);
13879 } else if (MI.getOpcode() == PPC::EH_SjLj_LongJmp32 ||
13880 MI.getOpcode() == PPC::EH_SjLj_LongJmp64) {
13881 return emitEHSjLjLongJmp(MI, MBB: BB);
13882 }
13883
13884 const TargetInstrInfo *TII = Subtarget.getInstrInfo();
13885
13886 // To "insert" these instructions we actually have to insert their
13887 // control-flow patterns.
13888 const BasicBlock *LLVM_BB = BB->getBasicBlock();
13889 MachineFunction::iterator It = ++BB->getIterator();
13890
13891 MachineFunction *F = BB->getParent();
13892 MachineRegisterInfo &MRI = F->getRegInfo();
13893
13894 if (Subtarget.hasISEL() &&
13895 (MI.getOpcode() == PPC::SELECT_CC_I4 ||
13896 MI.getOpcode() == PPC::SELECT_CC_I8 ||
13897 MI.getOpcode() == PPC::SELECT_I4 || MI.getOpcode() == PPC::SELECT_I8)) {
13898 SmallVector<MachineOperand, 2> Cond;
13899 if (MI.getOpcode() == PPC::SELECT_CC_I4 ||
13900 MI.getOpcode() == PPC::SELECT_CC_I8)
13901 Cond.push_back(Elt: MI.getOperand(i: 4));
13902 else
13903 Cond.push_back(Elt: MachineOperand::CreateImm(Val: PPC::PRED_BIT_SET));
13904 Cond.push_back(Elt: MI.getOperand(i: 1));
13905
13906 DebugLoc dl = MI.getDebugLoc();
13907 TII->insertSelect(MBB&: *BB, I: MI, DL: dl, DstReg: MI.getOperand(i: 0).getReg(), Cond,
13908 TrueReg: MI.getOperand(i: 2).getReg(), FalseReg: MI.getOperand(i: 3).getReg());
13909 } else if (IsSelectCC(MI) || IsSelect(MI)) {
13910 // The incoming instruction knows the destination vreg to set, the
13911 // condition code register to branch on, the true/false values to
13912 // select between, and a branch opcode to use.
13913
13914 // thisMBB:
13915 // ...
13916 // TrueVal = ...
13917 // cmpTY ccX, r1, r2
13918 // bCC sinkMBB
13919 // fallthrough --> copy0MBB
13920 MachineBasicBlock *thisMBB = BB;
13921 MachineBasicBlock *copy0MBB = F->CreateMachineBasicBlock(BB: LLVM_BB);
13922 MachineBasicBlock *sinkMBB = F->CreateMachineBasicBlock(BB: LLVM_BB);
13923 DebugLoc dl = MI.getDebugLoc();
13924 F->insert(MBBI: It, MBB: copy0MBB);
13925 F->insert(MBBI: It, MBB: sinkMBB);
13926
13927 if (isPhysRegUsedAfter(Reg: PPC::CARRY, MBI: MI.getIterator())) {
13928 copy0MBB->addLiveIn(PhysReg: PPC::CARRY);
13929 sinkMBB->addLiveIn(PhysReg: PPC::CARRY);
13930 }
13931
13932 // Set the call frame size on entry to the new basic blocks.
13933 // See https://reviews.llvm.org/D156113.
13934 unsigned CallFrameSize = TII->getCallFrameSizeAt(MI);
13935 copy0MBB->setCallFrameSize(CallFrameSize);
13936 sinkMBB->setCallFrameSize(CallFrameSize);
13937
13938 // Transfer the remainder of BB and its successor edges to sinkMBB.
13939 sinkMBB->splice(Where: sinkMBB->begin(), Other: BB,
13940 From: std::next(x: MachineBasicBlock::iterator(MI)), To: BB->end());
13941 sinkMBB->transferSuccessorsAndUpdatePHIs(FromMBB: BB);
13942
13943 // Next, add the true and fallthrough blocks as its successors.
13944 BB->addSuccessor(Succ: copy0MBB);
13945 BB->addSuccessor(Succ: sinkMBB);
13946
13947 if (IsSelect(MI)) {
13948 BuildMI(BB, MIMD: dl, MCID: TII->get(Opcode: PPC::BC))
13949 .addReg(RegNo: MI.getOperand(i: 1).getReg())
13950 .addMBB(MBB: sinkMBB);
13951 } else {
13952 unsigned SelectPred = MI.getOperand(i: 4).getImm();
13953 BuildMI(BB, MIMD: dl, MCID: TII->get(Opcode: PPC::BCC))
13954 .addImm(Val: SelectPred)
13955 .addReg(RegNo: MI.getOperand(i: 1).getReg())
13956 .addMBB(MBB: sinkMBB);
13957 }
13958
13959 // copy0MBB:
13960 // %FalseValue = ...
13961 // # fallthrough to sinkMBB
13962 BB = copy0MBB;
13963
13964 // Update machine-CFG edges
13965 BB->addSuccessor(Succ: sinkMBB);
13966
13967 // sinkMBB:
13968 // %Result = phi [ %FalseValue, copy0MBB ], [ %TrueValue, thisMBB ]
13969 // ...
13970 BB = sinkMBB;
13971 BuildMI(BB&: *BB, I: BB->begin(), MIMD: dl, MCID: TII->get(Opcode: PPC::PHI), DestReg: MI.getOperand(i: 0).getReg())
13972 .addReg(RegNo: MI.getOperand(i: 3).getReg())
13973 .addMBB(MBB: copy0MBB)
13974 .addReg(RegNo: MI.getOperand(i: 2).getReg())
13975 .addMBB(MBB: thisMBB);
13976 } else if (MI.getOpcode() == PPC::ReadTB) {
13977 // To read the 64-bit time-base register on a 32-bit target, we read the
13978 // two halves. Should the counter have wrapped while it was being read, we
13979 // need to try again.
13980 // ...
13981 // readLoop:
13982 // mfspr Rx,TBU # load from TBU
13983 // mfspr Ry,TB # load from TB
13984 // mfspr Rz,TBU # load from TBU
13985 // cmpw crX,Rx,Rz # check if 'old'='new'
13986 // bne readLoop # branch if they're not equal
13987 // ...
13988
13989 MachineBasicBlock *readMBB = F->CreateMachineBasicBlock(BB: LLVM_BB);
13990 MachineBasicBlock *sinkMBB = F->CreateMachineBasicBlock(BB: LLVM_BB);
13991 DebugLoc dl = MI.getDebugLoc();
13992 F->insert(MBBI: It, MBB: readMBB);
13993 F->insert(MBBI: It, MBB: sinkMBB);
13994
13995 // Transfer the remainder of BB and its successor edges to sinkMBB.
13996 sinkMBB->splice(Where: sinkMBB->begin(), Other: BB,
13997 From: std::next(x: MachineBasicBlock::iterator(MI)), To: BB->end());
13998 sinkMBB->transferSuccessorsAndUpdatePHIs(FromMBB: BB);
13999
14000 BB->addSuccessor(Succ: readMBB);
14001 BB = readMBB;
14002
14003 MachineRegisterInfo &RegInfo = F->getRegInfo();
14004 Register ReadAgainReg = RegInfo.createVirtualRegister(RegClass: &PPC::GPRCRegClass);
14005 Register LoReg = MI.getOperand(i: 0).getReg();
14006 Register HiReg = MI.getOperand(i: 1).getReg();
14007
14008 BuildMI(BB, MIMD: dl, MCID: TII->get(Opcode: PPC::MFSPR), DestReg: HiReg).addImm(Val: 269);
14009 BuildMI(BB, MIMD: dl, MCID: TII->get(Opcode: PPC::MFSPR), DestReg: LoReg).addImm(Val: 268);
14010 BuildMI(BB, MIMD: dl, MCID: TII->get(Opcode: PPC::MFSPR), DestReg: ReadAgainReg).addImm(Val: 269);
14011
14012 Register CmpReg = RegInfo.createVirtualRegister(RegClass: &PPC::CRRCRegClass);
14013
14014 BuildMI(BB, MIMD: dl, MCID: TII->get(Opcode: PPC::CMPW), DestReg: CmpReg)
14015 .addReg(RegNo: HiReg)
14016 .addReg(RegNo: ReadAgainReg);
14017 BuildMI(BB, MIMD: dl, MCID: TII->get(Opcode: PPC::BCC))
14018 .addImm(Val: PPC::PRED_NE)
14019 .addReg(RegNo: CmpReg)
14020 .addMBB(MBB: readMBB);
14021
14022 BB->addSuccessor(Succ: readMBB);
14023 BB->addSuccessor(Succ: sinkMBB);
14024 } else if (MI.getOpcode() == PPC::ATOMIC_LOAD_ADD_I8)
14025 BB = EmitPartwordAtomicBinary(MI, BB, is8bit: true, BinOpcode: PPC::ADD4);
14026 else if (MI.getOpcode() == PPC::ATOMIC_LOAD_ADD_I16)
14027 BB = EmitPartwordAtomicBinary(MI, BB, is8bit: false, BinOpcode: PPC::ADD4);
14028 else if (MI.getOpcode() == PPC::ATOMIC_LOAD_ADD_I32)
14029 BB = EmitAtomicBinary(MI, BB, AtomicSize: 4, BinOpcode: PPC::ADD4);
14030 else if (MI.getOpcode() == PPC::ATOMIC_LOAD_ADD_I64)
14031 BB = EmitAtomicBinary(MI, BB, AtomicSize: 8, BinOpcode: PPC::ADD8);
14032
14033 else if (MI.getOpcode() == PPC::ATOMIC_LOAD_AND_I8)
14034 BB = EmitPartwordAtomicBinary(MI, BB, is8bit: true, BinOpcode: PPC::AND);
14035 else if (MI.getOpcode() == PPC::ATOMIC_LOAD_AND_I16)
14036 BB = EmitPartwordAtomicBinary(MI, BB, is8bit: false, BinOpcode: PPC::AND);
14037 else if (MI.getOpcode() == PPC::ATOMIC_LOAD_AND_I32)
14038 BB = EmitAtomicBinary(MI, BB, AtomicSize: 4, BinOpcode: PPC::AND);
14039 else if (MI.getOpcode() == PPC::ATOMIC_LOAD_AND_I64)
14040 BB = EmitAtomicBinary(MI, BB, AtomicSize: 8, BinOpcode: PPC::AND8);
14041
14042 else if (MI.getOpcode() == PPC::ATOMIC_LOAD_OR_I8)
14043 BB = EmitPartwordAtomicBinary(MI, BB, is8bit: true, BinOpcode: PPC::OR);
14044 else if (MI.getOpcode() == PPC::ATOMIC_LOAD_OR_I16)
14045 BB = EmitPartwordAtomicBinary(MI, BB, is8bit: false, BinOpcode: PPC::OR);
14046 else if (MI.getOpcode() == PPC::ATOMIC_LOAD_OR_I32)
14047 BB = EmitAtomicBinary(MI, BB, AtomicSize: 4, BinOpcode: PPC::OR);
14048 else if (MI.getOpcode() == PPC::ATOMIC_LOAD_OR_I64)
14049 BB = EmitAtomicBinary(MI, BB, AtomicSize: 8, BinOpcode: PPC::OR8);
14050
14051 else if (MI.getOpcode() == PPC::ATOMIC_LOAD_XOR_I8)
14052 BB = EmitPartwordAtomicBinary(MI, BB, is8bit: true, BinOpcode: PPC::XOR);
14053 else if (MI.getOpcode() == PPC::ATOMIC_LOAD_XOR_I16)
14054 BB = EmitPartwordAtomicBinary(MI, BB, is8bit: false, BinOpcode: PPC::XOR);
14055 else if (MI.getOpcode() == PPC::ATOMIC_LOAD_XOR_I32)
14056 BB = EmitAtomicBinary(MI, BB, AtomicSize: 4, BinOpcode: PPC::XOR);
14057 else if (MI.getOpcode() == PPC::ATOMIC_LOAD_XOR_I64)
14058 BB = EmitAtomicBinary(MI, BB, AtomicSize: 8, BinOpcode: PPC::XOR8);
14059
14060 else if (MI.getOpcode() == PPC::ATOMIC_LOAD_NAND_I8)
14061 BB = EmitPartwordAtomicBinary(MI, BB, is8bit: true, BinOpcode: PPC::NAND);
14062 else if (MI.getOpcode() == PPC::ATOMIC_LOAD_NAND_I16)
14063 BB = EmitPartwordAtomicBinary(MI, BB, is8bit: false, BinOpcode: PPC::NAND);
14064 else if (MI.getOpcode() == PPC::ATOMIC_LOAD_NAND_I32)
14065 BB = EmitAtomicBinary(MI, BB, AtomicSize: 4, BinOpcode: PPC::NAND);
14066 else if (MI.getOpcode() == PPC::ATOMIC_LOAD_NAND_I64)
14067 BB = EmitAtomicBinary(MI, BB, AtomicSize: 8, BinOpcode: PPC::NAND8);
14068
14069 else if (MI.getOpcode() == PPC::ATOMIC_LOAD_SUB_I8)
14070 BB = EmitPartwordAtomicBinary(MI, BB, is8bit: true, BinOpcode: PPC::SUBF);
14071 else if (MI.getOpcode() == PPC::ATOMIC_LOAD_SUB_I16)
14072 BB = EmitPartwordAtomicBinary(MI, BB, is8bit: false, BinOpcode: PPC::SUBF);
14073 else if (MI.getOpcode() == PPC::ATOMIC_LOAD_SUB_I32)
14074 BB = EmitAtomicBinary(MI, BB, AtomicSize: 4, BinOpcode: PPC::SUBF);
14075 else if (MI.getOpcode() == PPC::ATOMIC_LOAD_SUB_I64)
14076 BB = EmitAtomicBinary(MI, BB, AtomicSize: 8, BinOpcode: PPC::SUBF8);
14077
14078 else if (MI.getOpcode() == PPC::ATOMIC_LOAD_MIN_I8)
14079 BB = EmitPartwordAtomicBinary(MI, BB, is8bit: true, BinOpcode: 0, CmpOpcode: PPC::CMPW, CmpPred: PPC::PRED_LT);
14080 else if (MI.getOpcode() == PPC::ATOMIC_LOAD_MIN_I16)
14081 BB = EmitPartwordAtomicBinary(MI, BB, is8bit: false, BinOpcode: 0, CmpOpcode: PPC::CMPW, CmpPred: PPC::PRED_LT);
14082 else if (MI.getOpcode() == PPC::ATOMIC_LOAD_MIN_I32)
14083 BB = EmitAtomicBinary(MI, BB, AtomicSize: 4, BinOpcode: 0, CmpOpcode: PPC::CMPW, CmpPred: PPC::PRED_LT);
14084 else if (MI.getOpcode() == PPC::ATOMIC_LOAD_MIN_I64)
14085 BB = EmitAtomicBinary(MI, BB, AtomicSize: 8, BinOpcode: 0, CmpOpcode: PPC::CMPD, CmpPred: PPC::PRED_LT);
14086
14087 else if (MI.getOpcode() == PPC::ATOMIC_LOAD_MAX_I8)
14088 BB = EmitPartwordAtomicBinary(MI, BB, is8bit: true, BinOpcode: 0, CmpOpcode: PPC::CMPW, CmpPred: PPC::PRED_GT);
14089 else if (MI.getOpcode() == PPC::ATOMIC_LOAD_MAX_I16)
14090 BB = EmitPartwordAtomicBinary(MI, BB, is8bit: false, BinOpcode: 0, CmpOpcode: PPC::CMPW, CmpPred: PPC::PRED_GT);
14091 else if (MI.getOpcode() == PPC::ATOMIC_LOAD_MAX_I32)
14092 BB = EmitAtomicBinary(MI, BB, AtomicSize: 4, BinOpcode: 0, CmpOpcode: PPC::CMPW, CmpPred: PPC::PRED_GT);
14093 else if (MI.getOpcode() == PPC::ATOMIC_LOAD_MAX_I64)
14094 BB = EmitAtomicBinary(MI, BB, AtomicSize: 8, BinOpcode: 0, CmpOpcode: PPC::CMPD, CmpPred: PPC::PRED_GT);
14095
14096 else if (MI.getOpcode() == PPC::ATOMIC_LOAD_UMIN_I8)
14097 BB = EmitPartwordAtomicBinary(MI, BB, is8bit: true, BinOpcode: 0, CmpOpcode: PPC::CMPLW, CmpPred: PPC::PRED_LT);
14098 else if (MI.getOpcode() == PPC::ATOMIC_LOAD_UMIN_I16)
14099 BB = EmitPartwordAtomicBinary(MI, BB, is8bit: false, BinOpcode: 0, CmpOpcode: PPC::CMPLW, CmpPred: PPC::PRED_LT);
14100 else if (MI.getOpcode() == PPC::ATOMIC_LOAD_UMIN_I32)
14101 BB = EmitAtomicBinary(MI, BB, AtomicSize: 4, BinOpcode: 0, CmpOpcode: PPC::CMPLW, CmpPred: PPC::PRED_LT);
14102 else if (MI.getOpcode() == PPC::ATOMIC_LOAD_UMIN_I64)
14103 BB = EmitAtomicBinary(MI, BB, AtomicSize: 8, BinOpcode: 0, CmpOpcode: PPC::CMPLD, CmpPred: PPC::PRED_LT);
14104
14105 else if (MI.getOpcode() == PPC::ATOMIC_LOAD_UMAX_I8)
14106 BB = EmitPartwordAtomicBinary(MI, BB, is8bit: true, BinOpcode: 0, CmpOpcode: PPC::CMPLW, CmpPred: PPC::PRED_GT);
14107 else if (MI.getOpcode() == PPC::ATOMIC_LOAD_UMAX_I16)
14108 BB = EmitPartwordAtomicBinary(MI, BB, is8bit: false, BinOpcode: 0, CmpOpcode: PPC::CMPLW, CmpPred: PPC::PRED_GT);
14109 else if (MI.getOpcode() == PPC::ATOMIC_LOAD_UMAX_I32)
14110 BB = EmitAtomicBinary(MI, BB, AtomicSize: 4, BinOpcode: 0, CmpOpcode: PPC::CMPLW, CmpPred: PPC::PRED_GT);
14111 else if (MI.getOpcode() == PPC::ATOMIC_LOAD_UMAX_I64)
14112 BB = EmitAtomicBinary(MI, BB, AtomicSize: 8, BinOpcode: 0, CmpOpcode: PPC::CMPLD, CmpPred: PPC::PRED_GT);
14113
14114 else if (MI.getOpcode() == PPC::ATOMIC_SWAP_I8)
14115 BB = EmitPartwordAtomicBinary(MI, BB, is8bit: true, BinOpcode: 0);
14116 else if (MI.getOpcode() == PPC::ATOMIC_SWAP_I16)
14117 BB = EmitPartwordAtomicBinary(MI, BB, is8bit: false, BinOpcode: 0);
14118 else if (MI.getOpcode() == PPC::ATOMIC_SWAP_I32)
14119 BB = EmitAtomicBinary(MI, BB, AtomicSize: 4, BinOpcode: 0);
14120 else if (MI.getOpcode() == PPC::ATOMIC_SWAP_I64)
14121 BB = EmitAtomicBinary(MI, BB, AtomicSize: 8, BinOpcode: 0);
14122 else if (MI.getOpcode() == PPC::ATOMIC_CMP_SWAP_I32 ||
14123 MI.getOpcode() == PPC::ATOMIC_CMP_SWAP_I64 ||
14124 (Subtarget.hasPartwordAtomics() &&
14125 MI.getOpcode() == PPC::ATOMIC_CMP_SWAP_I8) ||
14126 (Subtarget.hasPartwordAtomics() &&
14127 MI.getOpcode() == PPC::ATOMIC_CMP_SWAP_I16)) {
14128 bool is64bit = MI.getOpcode() == PPC::ATOMIC_CMP_SWAP_I64;
14129
14130 auto LoadMnemonic = PPC::LDARX;
14131 auto StoreMnemonic = PPC::STDCX;
14132 switch (MI.getOpcode()) {
14133 default:
14134 llvm_unreachable("Compare and swap of unknown size");
14135 case PPC::ATOMIC_CMP_SWAP_I8:
14136 LoadMnemonic = PPC::LBARX;
14137 StoreMnemonic = PPC::STBCX;
14138 assert(Subtarget.hasPartwordAtomics() && "No support partword atomics.");
14139 break;
14140 case PPC::ATOMIC_CMP_SWAP_I16:
14141 LoadMnemonic = PPC::LHARX;
14142 StoreMnemonic = PPC::STHCX;
14143 assert(Subtarget.hasPartwordAtomics() && "No support partword atomics.");
14144 break;
14145 case PPC::ATOMIC_CMP_SWAP_I32:
14146 LoadMnemonic = PPC::LWARX;
14147 StoreMnemonic = PPC::STWCX;
14148 break;
14149 case PPC::ATOMIC_CMP_SWAP_I64:
14150 LoadMnemonic = PPC::LDARX;
14151 StoreMnemonic = PPC::STDCX;
14152 break;
14153 }
14154 MachineRegisterInfo &RegInfo = F->getRegInfo();
14155 Register dest = MI.getOperand(i: 0).getReg();
14156 Register ptrA = MI.getOperand(i: 1).getReg();
14157 Register ptrB = MI.getOperand(i: 2).getReg();
14158 Register CrReg = RegInfo.createVirtualRegister(RegClass: &PPC::CRRCRegClass);
14159 Register oldval = MI.getOperand(i: 3).getReg();
14160 Register newval = MI.getOperand(i: 4).getReg();
14161 DebugLoc dl = MI.getDebugLoc();
14162
14163 MachineBasicBlock *loop1MBB = F->CreateMachineBasicBlock(BB: LLVM_BB);
14164 MachineBasicBlock *loop2MBB = F->CreateMachineBasicBlock(BB: LLVM_BB);
14165 MachineBasicBlock *exitMBB = F->CreateMachineBasicBlock(BB: LLVM_BB);
14166 F->insert(MBBI: It, MBB: loop1MBB);
14167 F->insert(MBBI: It, MBB: loop2MBB);
14168 F->insert(MBBI: It, MBB: exitMBB);
14169 exitMBB->splice(Where: exitMBB->begin(), Other: BB,
14170 From: std::next(x: MachineBasicBlock::iterator(MI)), To: BB->end());
14171 exitMBB->transferSuccessorsAndUpdatePHIs(FromMBB: BB);
14172
14173 // thisMBB:
14174 // ...
14175 // fallthrough --> loopMBB
14176 BB->addSuccessor(Succ: loop1MBB);
14177
14178 // loop1MBB:
14179 // l[bhwd]arx dest, ptr
14180 // cmp[wd] dest, oldval
14181 // bne- exitBB
14182 // loop2MBB:
14183 // st[bhwd]cx. newval, ptr
14184 // bne- loopMBB
14185 // b exitBB
14186 // exitBB:
14187 BB = loop1MBB;
14188 BuildMI(BB, MIMD: dl, MCID: TII->get(Opcode: LoadMnemonic), DestReg: dest).addReg(RegNo: ptrA).addReg(RegNo: ptrB);
14189 BuildMI(BB, MIMD: dl, MCID: TII->get(Opcode: is64bit ? PPC::CMPD : PPC::CMPW), DestReg: CrReg)
14190 .addReg(RegNo: dest)
14191 .addReg(RegNo: oldval);
14192 BuildMI(BB, MIMD: dl, MCID: TII->get(Opcode: PPC::BCC))
14193 .addImm(Val: PPC::PRED_NE_MINUS)
14194 .addReg(RegNo: CrReg)
14195 .addMBB(MBB: exitMBB);
14196 BB->addSuccessor(Succ: loop2MBB);
14197 BB->addSuccessor(Succ: exitMBB);
14198
14199 BB = loop2MBB;
14200 BuildMI(BB, MIMD: dl, MCID: TII->get(Opcode: StoreMnemonic))
14201 .addReg(RegNo: newval)
14202 .addReg(RegNo: ptrA)
14203 .addReg(RegNo: ptrB);
14204 BuildMI(BB, MIMD: dl, MCID: TII->get(Opcode: PPC::BCC))
14205 .addImm(Val: PPC::PRED_NE_MINUS)
14206 .addReg(RegNo: PPC::CR0)
14207 .addMBB(MBB: loop1MBB);
14208 BuildMI(BB, MIMD: dl, MCID: TII->get(Opcode: PPC::B)).addMBB(MBB: exitMBB);
14209 BB->addSuccessor(Succ: loop1MBB);
14210 BB->addSuccessor(Succ: exitMBB);
14211
14212 // exitMBB:
14213 // ...
14214 BB = exitMBB;
14215 } else if (MI.getOpcode() == PPC::ATOMIC_CMP_SWAP_I8 ||
14216 MI.getOpcode() == PPC::ATOMIC_CMP_SWAP_I16) {
14217 // We must use 64-bit registers for addresses when targeting 64-bit,
14218 // since we're actually doing arithmetic on them. Other registers
14219 // can be 32-bit.
14220 bool is64bit = Subtarget.isPPC64();
14221 bool isLittleEndian = Subtarget.isLittleEndian();
14222 bool is8bit = MI.getOpcode() == PPC::ATOMIC_CMP_SWAP_I8;
14223
14224 Register dest = MI.getOperand(i: 0).getReg();
14225 Register ptrA = MI.getOperand(i: 1).getReg();
14226 Register ptrB = MI.getOperand(i: 2).getReg();
14227 Register oldval = MI.getOperand(i: 3).getReg();
14228 Register newval = MI.getOperand(i: 4).getReg();
14229 DebugLoc dl = MI.getDebugLoc();
14230
14231 MachineBasicBlock *loop1MBB = F->CreateMachineBasicBlock(BB: LLVM_BB);
14232 MachineBasicBlock *loop2MBB = F->CreateMachineBasicBlock(BB: LLVM_BB);
14233 MachineBasicBlock *exitMBB = F->CreateMachineBasicBlock(BB: LLVM_BB);
14234 F->insert(MBBI: It, MBB: loop1MBB);
14235 F->insert(MBBI: It, MBB: loop2MBB);
14236 F->insert(MBBI: It, MBB: exitMBB);
14237 exitMBB->splice(Where: exitMBB->begin(), Other: BB,
14238 From: std::next(x: MachineBasicBlock::iterator(MI)), To: BB->end());
14239 exitMBB->transferSuccessorsAndUpdatePHIs(FromMBB: BB);
14240
14241 MachineRegisterInfo &RegInfo = F->getRegInfo();
14242 const TargetRegisterClass *RC =
14243 is64bit ? &PPC::G8RCRegClass : &PPC::GPRCRegClass;
14244 const TargetRegisterClass *GPRC = &PPC::GPRCRegClass;
14245
14246 Register PtrReg = RegInfo.createVirtualRegister(RegClass: RC);
14247 Register Shift1Reg = RegInfo.createVirtualRegister(RegClass: GPRC);
14248 Register ShiftReg =
14249 isLittleEndian ? Shift1Reg : RegInfo.createVirtualRegister(RegClass: GPRC);
14250 Register NewVal2Reg = RegInfo.createVirtualRegister(RegClass: GPRC);
14251 Register NewVal3Reg = RegInfo.createVirtualRegister(RegClass: GPRC);
14252 Register OldVal2Reg = RegInfo.createVirtualRegister(RegClass: GPRC);
14253 Register OldVal3Reg = RegInfo.createVirtualRegister(RegClass: GPRC);
14254 Register MaskReg = RegInfo.createVirtualRegister(RegClass: GPRC);
14255 Register Mask2Reg = RegInfo.createVirtualRegister(RegClass: GPRC);
14256 Register Mask3Reg = RegInfo.createVirtualRegister(RegClass: GPRC);
14257 Register Tmp2Reg = RegInfo.createVirtualRegister(RegClass: GPRC);
14258 Register Tmp4Reg = RegInfo.createVirtualRegister(RegClass: GPRC);
14259 Register TmpDestReg = RegInfo.createVirtualRegister(RegClass: GPRC);
14260 Register Ptr1Reg;
14261 Register TmpReg = RegInfo.createVirtualRegister(RegClass: GPRC);
14262 Register ZeroReg = is64bit ? PPC::ZERO8 : PPC::ZERO;
14263 Register CrReg = RegInfo.createVirtualRegister(RegClass: &PPC::CRRCRegClass);
14264 // thisMBB:
14265 // ...
14266 // fallthrough --> loopMBB
14267 BB->addSuccessor(Succ: loop1MBB);
14268
14269 // The 4-byte load must be aligned, while a char or short may be
14270 // anywhere in the word. Hence all this nasty bookkeeping code.
14271 // add ptr1, ptrA, ptrB [copy if ptrA==0]
14272 // rlwinm shift1, ptr1, 3, 27, 28 [3, 27, 27]
14273 // xori shift, shift1, 24 [16]
14274 // rlwinm ptr, ptr1, 0, 0, 29
14275 // slw newval2, newval, shift
14276 // slw oldval2, oldval,shift
14277 // li mask2, 255 [li mask3, 0; ori mask2, mask3, 65535]
14278 // slw mask, mask2, shift
14279 // and newval3, newval2, mask
14280 // and oldval3, oldval2, mask
14281 // loop1MBB:
14282 // lwarx tmpDest, ptr
14283 // and tmp, tmpDest, mask
14284 // cmpw tmp, oldval3
14285 // bne- exitBB
14286 // loop2MBB:
14287 // andc tmp2, tmpDest, mask
14288 // or tmp4, tmp2, newval3
14289 // stwcx. tmp4, ptr
14290 // bne- loop1MBB
14291 // b exitBB
14292 // exitBB:
14293 // srw dest, tmpDest, shift
14294 if (ptrA != ZeroReg) {
14295 Ptr1Reg = RegInfo.createVirtualRegister(RegClass: RC);
14296 BuildMI(BB, MIMD: dl, MCID: TII->get(Opcode: is64bit ? PPC::ADD8 : PPC::ADD4), DestReg: Ptr1Reg)
14297 .addReg(RegNo: ptrA)
14298 .addReg(RegNo: ptrB);
14299 } else {
14300 Ptr1Reg = ptrB;
14301 }
14302
14303 // We need use 32-bit subregister to avoid mismatch register class in 64-bit
14304 // mode.
14305 BuildMI(BB, MIMD: dl, MCID: TII->get(Opcode: PPC::RLWINM), DestReg: Shift1Reg)
14306 .addReg(RegNo: Ptr1Reg, Flags: {}, SubReg: is64bit ? PPC::sub_32 : 0)
14307 .addImm(Val: 3)
14308 .addImm(Val: 27)
14309 .addImm(Val: is8bit ? 28 : 27);
14310 if (!isLittleEndian)
14311 BuildMI(BB, MIMD: dl, MCID: TII->get(Opcode: PPC::XORI), DestReg: ShiftReg)
14312 .addReg(RegNo: Shift1Reg)
14313 .addImm(Val: is8bit ? 24 : 16);
14314 if (is64bit)
14315 BuildMI(BB, MIMD: dl, MCID: TII->get(Opcode: PPC::RLDICR), DestReg: PtrReg)
14316 .addReg(RegNo: Ptr1Reg)
14317 .addImm(Val: 0)
14318 .addImm(Val: 61);
14319 else
14320 BuildMI(BB, MIMD: dl, MCID: TII->get(Opcode: PPC::RLWINM), DestReg: PtrReg)
14321 .addReg(RegNo: Ptr1Reg)
14322 .addImm(Val: 0)
14323 .addImm(Val: 0)
14324 .addImm(Val: 29);
14325 BuildMI(BB, MIMD: dl, MCID: TII->get(Opcode: PPC::SLW), DestReg: NewVal2Reg)
14326 .addReg(RegNo: newval)
14327 .addReg(RegNo: ShiftReg);
14328 BuildMI(BB, MIMD: dl, MCID: TII->get(Opcode: PPC::SLW), DestReg: OldVal2Reg)
14329 .addReg(RegNo: oldval)
14330 .addReg(RegNo: ShiftReg);
14331 if (is8bit)
14332 BuildMI(BB, MIMD: dl, MCID: TII->get(Opcode: PPC::LI), DestReg: Mask2Reg).addImm(Val: 255);
14333 else {
14334 BuildMI(BB, MIMD: dl, MCID: TII->get(Opcode: PPC::LI), DestReg: Mask3Reg).addImm(Val: 0);
14335 BuildMI(BB, MIMD: dl, MCID: TII->get(Opcode: PPC::ORI), DestReg: Mask2Reg)
14336 .addReg(RegNo: Mask3Reg)
14337 .addImm(Val: 65535);
14338 }
14339 BuildMI(BB, MIMD: dl, MCID: TII->get(Opcode: PPC::SLW), DestReg: MaskReg)
14340 .addReg(RegNo: Mask2Reg)
14341 .addReg(RegNo: ShiftReg);
14342 BuildMI(BB, MIMD: dl, MCID: TII->get(Opcode: PPC::AND), DestReg: NewVal3Reg)
14343 .addReg(RegNo: NewVal2Reg)
14344 .addReg(RegNo: MaskReg);
14345 BuildMI(BB, MIMD: dl, MCID: TII->get(Opcode: PPC::AND), DestReg: OldVal3Reg)
14346 .addReg(RegNo: OldVal2Reg)
14347 .addReg(RegNo: MaskReg);
14348
14349 BB = loop1MBB;
14350 BuildMI(BB, MIMD: dl, MCID: TII->get(Opcode: PPC::LWARX), DestReg: TmpDestReg)
14351 .addReg(RegNo: ZeroReg)
14352 .addReg(RegNo: PtrReg);
14353 BuildMI(BB, MIMD: dl, MCID: TII->get(Opcode: PPC::AND), DestReg: TmpReg)
14354 .addReg(RegNo: TmpDestReg)
14355 .addReg(RegNo: MaskReg);
14356 BuildMI(BB, MIMD: dl, MCID: TII->get(Opcode: PPC::CMPW), DestReg: CrReg)
14357 .addReg(RegNo: TmpReg)
14358 .addReg(RegNo: OldVal3Reg);
14359 BuildMI(BB, MIMD: dl, MCID: TII->get(Opcode: PPC::BCC))
14360 .addImm(Val: PPC::PRED_NE)
14361 .addReg(RegNo: CrReg)
14362 .addMBB(MBB: exitMBB);
14363 BB->addSuccessor(Succ: loop2MBB);
14364 BB->addSuccessor(Succ: exitMBB);
14365
14366 BB = loop2MBB;
14367 BuildMI(BB, MIMD: dl, MCID: TII->get(Opcode: PPC::ANDC), DestReg: Tmp2Reg)
14368 .addReg(RegNo: TmpDestReg)
14369 .addReg(RegNo: MaskReg);
14370 BuildMI(BB, MIMD: dl, MCID: TII->get(Opcode: PPC::OR), DestReg: Tmp4Reg)
14371 .addReg(RegNo: Tmp2Reg)
14372 .addReg(RegNo: NewVal3Reg);
14373 BuildMI(BB, MIMD: dl, MCID: TII->get(Opcode: PPC::STWCX))
14374 .addReg(RegNo: Tmp4Reg)
14375 .addReg(RegNo: ZeroReg)
14376 .addReg(RegNo: PtrReg);
14377 BuildMI(BB, MIMD: dl, MCID: TII->get(Opcode: PPC::BCC))
14378 .addImm(Val: PPC::PRED_NE)
14379 .addReg(RegNo: PPC::CR0)
14380 .addMBB(MBB: loop1MBB);
14381 BuildMI(BB, MIMD: dl, MCID: TII->get(Opcode: PPC::B)).addMBB(MBB: exitMBB);
14382 BB->addSuccessor(Succ: loop1MBB);
14383 BB->addSuccessor(Succ: exitMBB);
14384
14385 // exitMBB:
14386 // ...
14387 BB = exitMBB;
14388 BuildMI(BB&: *BB, I: BB->begin(), MIMD: dl, MCID: TII->get(Opcode: PPC::SRW), DestReg: dest)
14389 .addReg(RegNo: TmpReg)
14390 .addReg(RegNo: ShiftReg);
14391 } else if (MI.getOpcode() == PPC::FADDrtz) {
14392 // This pseudo performs an FADD with rounding mode temporarily forced
14393 // to round-to-zero. We emit this via custom inserter since the FPSCR
14394 // is not modeled at the SelectionDAG level.
14395 Register Dest = MI.getOperand(i: 0).getReg();
14396 Register Src1 = MI.getOperand(i: 1).getReg();
14397 Register Src2 = MI.getOperand(i: 2).getReg();
14398 DebugLoc dl = MI.getDebugLoc();
14399
14400 MachineRegisterInfo &RegInfo = F->getRegInfo();
14401 Register MFFSReg = RegInfo.createVirtualRegister(RegClass: &PPC::F8RCRegClass);
14402
14403 // Save FPSCR value.
14404 BuildMI(BB&: *BB, I&: MI, MIMD: dl, MCID: TII->get(Opcode: PPC::MFFS), DestReg: MFFSReg);
14405
14406 // Set rounding mode to round-to-zero.
14407 BuildMI(BB&: *BB, I&: MI, MIMD: dl, MCID: TII->get(Opcode: PPC::MTFSB1))
14408 .addImm(Val: 31)
14409 .addReg(RegNo: PPC::RM, Flags: RegState::ImplicitDefine);
14410
14411 BuildMI(BB&: *BB, I&: MI, MIMD: dl, MCID: TII->get(Opcode: PPC::MTFSB0))
14412 .addImm(Val: 30)
14413 .addReg(RegNo: PPC::RM, Flags: RegState::ImplicitDefine);
14414
14415 // Perform addition.
14416 auto MIB = BuildMI(BB&: *BB, I&: MI, MIMD: dl, MCID: TII->get(Opcode: PPC::FADD), DestReg: Dest)
14417 .addReg(RegNo: Src1)
14418 .addReg(RegNo: Src2);
14419 if (MI.getFlag(Flag: MachineInstr::NoFPExcept))
14420 MIB.setMIFlag(MachineInstr::NoFPExcept);
14421
14422 // Restore FPSCR value.
14423 BuildMI(BB&: *BB, I&: MI, MIMD: dl, MCID: TII->get(Opcode: PPC::MTFSFb)).addImm(Val: 1).addReg(RegNo: MFFSReg);
14424 } else if (MI.getOpcode() == PPC::ANDI_rec_1_EQ_BIT ||
14425 MI.getOpcode() == PPC::ANDI_rec_1_GT_BIT ||
14426 MI.getOpcode() == PPC::ANDI_rec_1_EQ_BIT8 ||
14427 MI.getOpcode() == PPC::ANDI_rec_1_GT_BIT8) {
14428 unsigned Opcode = (MI.getOpcode() == PPC::ANDI_rec_1_EQ_BIT8 ||
14429 MI.getOpcode() == PPC::ANDI_rec_1_GT_BIT8)
14430 ? PPC::ANDI8_rec
14431 : PPC::ANDI_rec;
14432 bool IsEQ = (MI.getOpcode() == PPC::ANDI_rec_1_EQ_BIT ||
14433 MI.getOpcode() == PPC::ANDI_rec_1_EQ_BIT8);
14434
14435 MachineRegisterInfo &RegInfo = F->getRegInfo();
14436 Register Dest = RegInfo.createVirtualRegister(
14437 RegClass: Opcode == PPC::ANDI_rec ? &PPC::GPRCRegClass : &PPC::G8RCRegClass);
14438
14439 DebugLoc Dl = MI.getDebugLoc();
14440 BuildMI(BB&: *BB, I&: MI, MIMD: Dl, MCID: TII->get(Opcode), DestReg: Dest)
14441 .addReg(RegNo: MI.getOperand(i: 1).getReg())
14442 .addImm(Val: 1);
14443 BuildMI(BB&: *BB, I&: MI, MIMD: Dl, MCID: TII->get(Opcode: TargetOpcode::COPY),
14444 DestReg: MI.getOperand(i: 0).getReg())
14445 .addReg(RegNo: IsEQ ? PPC::CR0EQ : PPC::CR0GT);
14446 } else if (MI.getOpcode() == PPC::TCHECK_RET) {
14447 DebugLoc Dl = MI.getDebugLoc();
14448 MachineRegisterInfo &RegInfo = F->getRegInfo();
14449 Register CRReg = RegInfo.createVirtualRegister(RegClass: &PPC::CRRCRegClass);
14450 BuildMI(BB&: *BB, I&: MI, MIMD: Dl, MCID: TII->get(Opcode: PPC::TCHECK), DestReg: CRReg);
14451 BuildMI(BB&: *BB, I&: MI, MIMD: Dl, MCID: TII->get(Opcode: TargetOpcode::COPY),
14452 DestReg: MI.getOperand(i: 0).getReg())
14453 .addReg(RegNo: CRReg);
14454 } else if (MI.getOpcode() == PPC::TBEGIN_RET) {
14455 DebugLoc Dl = MI.getDebugLoc();
14456 unsigned Imm = MI.getOperand(i: 1).getImm();
14457 BuildMI(BB&: *BB, I&: MI, MIMD: Dl, MCID: TII->get(Opcode: PPC::TBEGIN)).addImm(Val: Imm);
14458 BuildMI(BB&: *BB, I&: MI, MIMD: Dl, MCID: TII->get(Opcode: TargetOpcode::COPY),
14459 DestReg: MI.getOperand(i: 0).getReg())
14460 .addReg(RegNo: PPC::CR0EQ);
14461 } else if (MI.getOpcode() == PPC::SETRNDi) {
14462 DebugLoc dl = MI.getDebugLoc();
14463 Register OldFPSCRReg = MI.getOperand(i: 0).getReg();
14464
14465 // Save FPSCR value.
14466 if (MRI.use_empty(RegNo: OldFPSCRReg))
14467 BuildMI(BB&: *BB, I&: MI, MIMD: dl, MCID: TII->get(Opcode: TargetOpcode::IMPLICIT_DEF), DestReg: OldFPSCRReg);
14468 else
14469 BuildMI(BB&: *BB, I&: MI, MIMD: dl, MCID: TII->get(Opcode: PPC::MFFS), DestReg: OldFPSCRReg);
14470
14471 // The floating point rounding mode is in the bits 62:63 of FPCSR, and has
14472 // the following settings:
14473 // 00 Round to nearest
14474 // 01 Round to 0
14475 // 10 Round to +inf
14476 // 11 Round to -inf
14477
14478 // When the operand is immediate, using the two least significant bits of
14479 // the immediate to set the bits 62:63 of FPSCR.
14480 unsigned Mode = MI.getOperand(i: 1).getImm();
14481 BuildMI(BB&: *BB, I&: MI, MIMD: dl, MCID: TII->get(Opcode: (Mode & 1) ? PPC::MTFSB1 : PPC::MTFSB0))
14482 .addImm(Val: 31)
14483 .addReg(RegNo: PPC::RM, Flags: RegState::ImplicitDefine);
14484
14485 BuildMI(BB&: *BB, I&: MI, MIMD: dl, MCID: TII->get(Opcode: (Mode & 2) ? PPC::MTFSB1 : PPC::MTFSB0))
14486 .addImm(Val: 30)
14487 .addReg(RegNo: PPC::RM, Flags: RegState::ImplicitDefine);
14488 } else if (MI.getOpcode() == PPC::SETRND) {
14489 DebugLoc dl = MI.getDebugLoc();
14490
14491 // Copy register from F8RCRegClass::SrcReg to G8RCRegClass::DestReg
14492 // or copy register from G8RCRegClass::SrcReg to F8RCRegClass::DestReg.
14493 // If the target doesn't have DirectMove, we should use stack to do the
14494 // conversion, because the target doesn't have the instructions like mtvsrd
14495 // or mfvsrd to do this conversion directly.
14496 auto copyRegFromG8RCOrF8RC = [&] (unsigned DestReg, unsigned SrcReg) {
14497 if (Subtarget.hasDirectMove()) {
14498 BuildMI(BB&: *BB, I&: MI, MIMD: dl, MCID: TII->get(Opcode: TargetOpcode::COPY), DestReg)
14499 .addReg(RegNo: SrcReg);
14500 } else {
14501 // Use stack to do the register copy.
14502 unsigned StoreOp = PPC::STD, LoadOp = PPC::LFD;
14503 MachineRegisterInfo &RegInfo = F->getRegInfo();
14504 const TargetRegisterClass *RC = RegInfo.getRegClass(Reg: SrcReg);
14505 if (RC == &PPC::F8RCRegClass) {
14506 // Copy register from F8RCRegClass to G8RCRegclass.
14507 assert((RegInfo.getRegClass(DestReg) == &PPC::G8RCRegClass) &&
14508 "Unsupported RegClass.");
14509
14510 StoreOp = PPC::STFD;
14511 LoadOp = PPC::LD;
14512 } else {
14513 // Copy register from G8RCRegClass to F8RCRegclass.
14514 assert((RegInfo.getRegClass(SrcReg) == &PPC::G8RCRegClass) &&
14515 (RegInfo.getRegClass(DestReg) == &PPC::F8RCRegClass) &&
14516 "Unsupported RegClass.");
14517 }
14518
14519 MachineFrameInfo &MFI = F->getFrameInfo();
14520 int FrameIdx = MFI.CreateStackObject(Size: 8, Alignment: Align(8), isSpillSlot: false);
14521
14522 MachineMemOperand *MMOStore = F->getMachineMemOperand(
14523 PtrInfo: MachinePointerInfo::getFixedStack(MF&: *F, FI: FrameIdx, Offset: 0),
14524 F: MachineMemOperand::MOStore, Size: MFI.getObjectSize(ObjectIdx: FrameIdx),
14525 BaseAlignment: MFI.getObjectAlign(ObjectIdx: FrameIdx));
14526
14527 // Store the SrcReg into the stack.
14528 BuildMI(BB&: *BB, I&: MI, MIMD: dl, MCID: TII->get(Opcode: StoreOp))
14529 .addReg(RegNo: SrcReg)
14530 .addImm(Val: 0)
14531 .addFrameIndex(Idx: FrameIdx)
14532 .addMemOperand(MMO: MMOStore);
14533
14534 MachineMemOperand *MMOLoad = F->getMachineMemOperand(
14535 PtrInfo: MachinePointerInfo::getFixedStack(MF&: *F, FI: FrameIdx, Offset: 0),
14536 F: MachineMemOperand::MOLoad, Size: MFI.getObjectSize(ObjectIdx: FrameIdx),
14537 BaseAlignment: MFI.getObjectAlign(ObjectIdx: FrameIdx));
14538
14539 // Load from the stack where SrcReg is stored, and save to DestReg,
14540 // so we have done the RegClass conversion from RegClass::SrcReg to
14541 // RegClass::DestReg.
14542 BuildMI(BB&: *BB, I&: MI, MIMD: dl, MCID: TII->get(Opcode: LoadOp), DestReg)
14543 .addImm(Val: 0)
14544 .addFrameIndex(Idx: FrameIdx)
14545 .addMemOperand(MMO: MMOLoad);
14546 }
14547 };
14548
14549 Register OldFPSCRReg = MI.getOperand(i: 0).getReg();
14550
14551 // Save FPSCR value.
14552 BuildMI(BB&: *BB, I&: MI, MIMD: dl, MCID: TII->get(Opcode: PPC::MFFS), DestReg: OldFPSCRReg);
14553
14554 // When the operand is gprc register, use two least significant bits of the
14555 // register and mtfsf instruction to set the bits 62:63 of FPSCR.
14556 //
14557 // copy OldFPSCRTmpReg, OldFPSCRReg
14558 // (INSERT_SUBREG ExtSrcReg, (IMPLICIT_DEF ImDefReg), SrcOp, 1)
14559 // rldimi NewFPSCRTmpReg, ExtSrcReg, OldFPSCRReg, 0, 62
14560 // copy NewFPSCRReg, NewFPSCRTmpReg
14561 // mtfsf 255, NewFPSCRReg
14562 MachineOperand SrcOp = MI.getOperand(i: 1);
14563 MachineRegisterInfo &RegInfo = F->getRegInfo();
14564 Register OldFPSCRTmpReg = RegInfo.createVirtualRegister(RegClass: &PPC::G8RCRegClass);
14565
14566 copyRegFromG8RCOrF8RC(OldFPSCRTmpReg, OldFPSCRReg);
14567
14568 Register ImDefReg = RegInfo.createVirtualRegister(RegClass: &PPC::G8RCRegClass);
14569 Register ExtSrcReg = RegInfo.createVirtualRegister(RegClass: &PPC::G8RCRegClass);
14570
14571 // The first operand of INSERT_SUBREG should be a register which has
14572 // subregisters, we only care about its RegClass, so we should use an
14573 // IMPLICIT_DEF register.
14574 BuildMI(BB&: *BB, I&: MI, MIMD: dl, MCID: TII->get(Opcode: TargetOpcode::IMPLICIT_DEF), DestReg: ImDefReg);
14575 BuildMI(BB&: *BB, I&: MI, MIMD: dl, MCID: TII->get(Opcode: PPC::INSERT_SUBREG), DestReg: ExtSrcReg)
14576 .addReg(RegNo: ImDefReg)
14577 .add(MO: SrcOp)
14578 .addImm(Val: 1);
14579
14580 Register NewFPSCRTmpReg = RegInfo.createVirtualRegister(RegClass: &PPC::G8RCRegClass);
14581 BuildMI(BB&: *BB, I&: MI, MIMD: dl, MCID: TII->get(Opcode: PPC::RLDIMI), DestReg: NewFPSCRTmpReg)
14582 .addReg(RegNo: OldFPSCRTmpReg)
14583 .addReg(RegNo: ExtSrcReg)
14584 .addImm(Val: 0)
14585 .addImm(Val: 62);
14586
14587 Register NewFPSCRReg = RegInfo.createVirtualRegister(RegClass: &PPC::F8RCRegClass);
14588 copyRegFromG8RCOrF8RC(NewFPSCRReg, NewFPSCRTmpReg);
14589
14590 // The mask 255 means that put the 32:63 bits of NewFPSCRReg to the 32:63
14591 // bits of FPSCR.
14592 BuildMI(BB&: *BB, I&: MI, MIMD: dl, MCID: TII->get(Opcode: PPC::MTFSF))
14593 .addImm(Val: 255)
14594 .addReg(RegNo: NewFPSCRReg)
14595 .addImm(Val: 0)
14596 .addImm(Val: 0);
14597 } else if (MI.getOpcode() == PPC::SETFLM) {
14598 DebugLoc Dl = MI.getDebugLoc();
14599
14600 // Result of setflm is previous FPSCR content, so we need to save it first.
14601 Register OldFPSCRReg = MI.getOperand(i: 0).getReg();
14602 if (MRI.use_empty(RegNo: OldFPSCRReg))
14603 BuildMI(BB&: *BB, I&: MI, MIMD: Dl, MCID: TII->get(Opcode: TargetOpcode::IMPLICIT_DEF), DestReg: OldFPSCRReg);
14604 else
14605 BuildMI(BB&: *BB, I&: MI, MIMD: Dl, MCID: TII->get(Opcode: PPC::MFFS), DestReg: OldFPSCRReg);
14606
14607 // Put bits in 32:63 to FPSCR.
14608 Register NewFPSCRReg = MI.getOperand(i: 1).getReg();
14609 BuildMI(BB&: *BB, I&: MI, MIMD: Dl, MCID: TII->get(Opcode: PPC::MTFSF))
14610 .addImm(Val: 255)
14611 .addReg(RegNo: NewFPSCRReg)
14612 .addImm(Val: 0)
14613 .addImm(Val: 0);
14614 } else if (MI.getOpcode() == PPC::PROBED_ALLOCA_32 ||
14615 MI.getOpcode() == PPC::PROBED_ALLOCA_64) {
14616 return emitProbedAlloca(MI, MBB: BB);
14617 } else if (MI.getOpcode() == PPC::SPLIT_QUADWORD) {
14618 DebugLoc DL = MI.getDebugLoc();
14619 Register Src = MI.getOperand(i: 2).getReg();
14620 Register Lo = MI.getOperand(i: 0).getReg();
14621 Register Hi = MI.getOperand(i: 1).getReg();
14622 BuildMI(BB&: *BB, I&: MI, MIMD: DL, MCID: TII->get(Opcode: TargetOpcode::COPY))
14623 .addDef(RegNo: Lo)
14624 .addUse(RegNo: Src, Flags: {}, SubReg: PPC::sub_gp8_x1);
14625 BuildMI(BB&: *BB, I&: MI, MIMD: DL, MCID: TII->get(Opcode: TargetOpcode::COPY))
14626 .addDef(RegNo: Hi)
14627 .addUse(RegNo: Src, Flags: {}, SubReg: PPC::sub_gp8_x0);
14628 } else if (MI.getOpcode() == PPC::LQX_PSEUDO ||
14629 MI.getOpcode() == PPC::STQX_PSEUDO) {
14630 DebugLoc DL = MI.getDebugLoc();
14631 // Ptr is used as the ptr_rc_no_r0 part
14632 // of LQ/STQ's memory operand and adding result of RA and RB,
14633 // so it has to be g8rc_and_g8rc_nox0.
14634 Register Ptr =
14635 F->getRegInfo().createVirtualRegister(RegClass: &PPC::G8RC_and_G8RC_NOX0RegClass);
14636 Register Val = MI.getOperand(i: 0).getReg();
14637 Register RA = MI.getOperand(i: 1).getReg();
14638 Register RB = MI.getOperand(i: 2).getReg();
14639 BuildMI(BB&: *BB, I&: MI, MIMD: DL, MCID: TII->get(Opcode: PPC::ADD8), DestReg: Ptr).addReg(RegNo: RA).addReg(RegNo: RB);
14640 BuildMI(BB&: *BB, I&: MI, MIMD: DL,
14641 MCID: MI.getOpcode() == PPC::LQX_PSEUDO ? TII->get(Opcode: PPC::LQ)
14642 : TII->get(Opcode: PPC::STQ))
14643 .addReg(RegNo: Val, Flags: getDefRegState(B: MI.getOpcode() == PPC::LQX_PSEUDO))
14644 .addImm(Val: 0)
14645 .addReg(RegNo: Ptr);
14646 } else if (MI.getOpcode() == PPC::LWAT_PSEUDO ||
14647 MI.getOpcode() == PPC::LDAT_PSEUDO) {
14648 DebugLoc DL = MI.getDebugLoc();
14649 Register DstReg = MI.getOperand(i: 0).getReg();
14650 Register PtrReg = MI.getOperand(i: 1).getReg();
14651 Register ValReg = MI.getOperand(i: 2).getReg();
14652 unsigned FC = MI.getOperand(i: 3).getImm();
14653 bool IsLwat = MI.getOpcode() == PPC::LWAT_PSEUDO;
14654 Register Val64 = MRI.createVirtualRegister(RegClass: &PPC::G8RCRegClass);
14655 if (IsLwat)
14656 BuildMI(BB&: *BB, I&: MI, MIMD: DL, MCID: TII->get(Opcode: TargetOpcode::SUBREG_TO_REG), DestReg: Val64)
14657 .addReg(RegNo: ValReg)
14658 .addImm(Val: PPC::sub_32);
14659 else
14660 Val64 = ValReg;
14661
14662 Register G8rPair = MRI.createVirtualRegister(RegClass: &PPC::G8pRCRegClass);
14663 Register UndefG8r = MRI.createVirtualRegister(RegClass: &PPC::G8RCRegClass);
14664 BuildMI(BB&: *BB, I&: MI, MIMD: DL, MCID: TII->get(Opcode: TargetOpcode::IMPLICIT_DEF), DestReg: UndefG8r);
14665 BuildMI(BB&: *BB, I&: MI, MIMD: DL, MCID: TII->get(Opcode: PPC::REG_SEQUENCE), DestReg: G8rPair)
14666 .addReg(RegNo: UndefG8r)
14667 .addImm(Val: PPC::sub_gp8_x0)
14668 .addReg(RegNo: Val64)
14669 .addImm(Val: PPC::sub_gp8_x1);
14670
14671 Register PairResult = MRI.createVirtualRegister(RegClass: &PPC::G8pRCRegClass);
14672 BuildMI(BB&: *BB, I&: MI, MIMD: DL, MCID: TII->get(Opcode: IsLwat ? PPC::LWAT : PPC::LDAT), DestReg: PairResult)
14673 .addReg(RegNo: G8rPair)
14674 .addReg(RegNo: PtrReg)
14675 .addImm(Val: FC);
14676 Register Result64 = MRI.createVirtualRegister(RegClass: &PPC::G8RCRegClass);
14677 BuildMI(BB&: *BB, I&: MI, MIMD: DL, MCID: TII->get(Opcode: TargetOpcode::COPY), DestReg: Result64)
14678 .addReg(RegNo: PairResult, Flags: {}, SubReg: PPC::sub_gp8_x0);
14679 if (IsLwat)
14680 BuildMI(BB&: *BB, I&: MI, MIMD: DL, MCID: TII->get(Opcode: TargetOpcode::COPY), DestReg: DstReg)
14681 .addReg(RegNo: Result64, Flags: {}, SubReg: PPC::sub_32);
14682 else
14683 BuildMI(BB&: *BB, I&: MI, MIMD: DL, MCID: TII->get(Opcode: TargetOpcode::COPY), DestReg: DstReg)
14684 .addReg(RegNo: Result64);
14685 } else if (MI.getOpcode() == PPC::LWAT_COND_PSEUDO ||
14686 MI.getOpcode() == PPC::LDAT_COND_PSEUDO) {
14687 DebugLoc DL = MI.getDebugLoc();
14688 Register DstReg = MI.getOperand(i: 0).getReg();
14689 Register PtrReg = MI.getOperand(i: 1).getReg();
14690 unsigned FC = MI.getOperand(i: 2).getImm();
14691 bool IsLwat_Cond = MI.getOpcode() == PPC::LWAT_COND_PSEUDO;
14692
14693 Register Pair = MRI.createVirtualRegister(RegClass: &PPC::G8pRCRegClass);
14694 BuildMI(BB&: *BB, I&: MI, MIMD: DL, MCID: TII->get(Opcode: TargetOpcode::IMPLICIT_DEF), DestReg: Pair);
14695
14696 Register PairResult = MRI.createVirtualRegister(RegClass: &PPC::G8pRCRegClass);
14697 BuildMI(BB&: *BB, I&: MI, MIMD: DL, MCID: TII->get(Opcode: IsLwat_Cond ? PPC::LWAT : PPC::LDAT),
14698 DestReg: PairResult)
14699 .addReg(RegNo: Pair)
14700 .addReg(RegNo: PtrReg)
14701 .addImm(Val: FC);
14702 Register Result64 = MRI.createVirtualRegister(RegClass: &PPC::G8RCRegClass);
14703 BuildMI(BB&: *BB, I&: MI, MIMD: DL, MCID: TII->get(Opcode: TargetOpcode::COPY), DestReg: Result64)
14704 .addReg(RegNo: PairResult, Flags: {}, SubReg: PPC::sub_gp8_x0);
14705 if (IsLwat_Cond)
14706 BuildMI(BB&: *BB, I&: MI, MIMD: DL, MCID: TII->get(Opcode: TargetOpcode::COPY), DestReg: DstReg)
14707 .addReg(RegNo: Result64, Flags: {}, SubReg: PPC::sub_32);
14708 else
14709 BuildMI(BB&: *BB, I&: MI, MIMD: DL, MCID: TII->get(Opcode: TargetOpcode::COPY), DestReg: DstReg)
14710 .addReg(RegNo: Result64);
14711 } else {
14712 llvm_unreachable("Unexpected instr type to insert");
14713 }
14714
14715 MI.eraseFromParent(); // The pseudo instruction is gone now.
14716 return BB;
14717}
14718
14719//===----------------------------------------------------------------------===//
14720// Target Optimization Hooks
14721//===----------------------------------------------------------------------===//
14722
14723static int getEstimateRefinementSteps(EVT VT, const PPCSubtarget &Subtarget) {
14724 // For the estimates, convergence is quadratic, so we essentially double the
14725 // number of digits correct after every iteration. For both FRE and FRSQRTE,
14726 // the minimum architected relative accuracy is 2^-5. When hasRecipPrec(),
14727 // this is 2^-14. IEEE float has 23 digits and double has 52 digits.
14728 int RefinementSteps = Subtarget.hasRecipPrec() ? 1 : 3;
14729 if (VT.getScalarType() == MVT::f64)
14730 RefinementSteps++;
14731 return RefinementSteps;
14732}
14733
14734SDValue PPCTargetLowering::getSqrtInputTest(SDValue Op, SelectionDAG &DAG,
14735 const DenormalMode &Mode,
14736 SDNodeFlags Flags) const {
14737 // We only have VSX Vector Test for software Square Root.
14738 EVT VT = Op.getValueType();
14739 if (!isTypeLegal(VT: MVT::i1) ||
14740 (VT != MVT::f64 &&
14741 ((VT != MVT::v2f64 && VT != MVT::v4f32) || !Subtarget.hasVSX())))
14742 return TargetLowering::getSqrtInputTest(Operand: Op, DAG, Mode, Flags);
14743
14744 SDLoc DL(Op);
14745 // The output register of FTSQRT is CR field.
14746 SDValue FTSQRT = DAG.getNode(Opcode: PPCISD::FTSQRT, DL, VT: MVT::i32, Operand: Op, Flags);
14747 // ftsqrt BF,FRB
14748 // Let e_b be the unbiased exponent of the double-precision
14749 // floating-point operand in register FRB.
14750 // fe_flag is set to 1 if either of the following conditions occurs.
14751 // - The double-precision floating-point operand in register FRB is a zero,
14752 // a NaN, or an infinity, or a negative value.
14753 // - e_b is less than or equal to -970.
14754 // Otherwise fe_flag is set to 0.
14755 // Both VSX and non-VSX versions would set EQ bit in the CR if the number is
14756 // not eligible for iteration. (zero/negative/infinity/nan or unbiased
14757 // exponent is less than -970)
14758 SDValue SRIdxVal = DAG.getTargetConstant(Val: PPC::sub_eq, DL, VT: MVT::i32);
14759 return SDValue(DAG.getMachineNode(Opcode: TargetOpcode::EXTRACT_SUBREG, dl: DL, VT: MVT::i1,
14760 Op1: FTSQRT, Op2: SRIdxVal),
14761 0);
14762}
14763
14764SDValue
14765PPCTargetLowering::getSqrtResultForDenormInput(SDValue Op,
14766 SelectionDAG &DAG) const {
14767 // We only have VSX Vector Square Root.
14768 EVT VT = Op.getValueType();
14769 if (VT != MVT::f64 &&
14770 ((VT != MVT::v2f64 && VT != MVT::v4f32) || !Subtarget.hasVSX()))
14771 return TargetLowering::getSqrtResultForDenormInput(Operand: Op, DAG);
14772
14773 return DAG.getNode(Opcode: PPCISD::FSQRT, DL: SDLoc(Op), VT, Operand: Op);
14774}
14775
14776SDValue PPCTargetLowering::getSqrtEstimate(SDValue Operand, SelectionDAG &DAG,
14777 int Enabled, int &RefinementSteps,
14778 bool &UseOneConstNR,
14779 bool Reciprocal) const {
14780 EVT VT = Operand.getValueType();
14781 if ((VT == MVT::f32 && Subtarget.hasFRSQRTES()) ||
14782 (VT == MVT::f64 && Subtarget.hasFRSQRTE()) ||
14783 (VT == MVT::v4f32 && Subtarget.hasAltivec()) ||
14784 (VT == MVT::v2f64 && Subtarget.hasVSX())) {
14785 if (RefinementSteps == ReciprocalEstimate::Unspecified)
14786 RefinementSteps = getEstimateRefinementSteps(VT, Subtarget);
14787
14788 // The Newton-Raphson computation with a single constant does not provide
14789 // enough accuracy on some CPUs.
14790 UseOneConstNR = !Subtarget.needsTwoConstNR();
14791 return DAG.getNode(Opcode: PPCISD::FRSQRTE, DL: SDLoc(Operand), VT, Operand);
14792 }
14793 return SDValue();
14794}
14795
14796SDValue PPCTargetLowering::getRecipEstimate(SDValue Operand, SelectionDAG &DAG,
14797 int Enabled,
14798 int &RefinementSteps) const {
14799 EVT VT = Operand.getValueType();
14800 if ((VT == MVT::f32 && Subtarget.hasFRES()) ||
14801 (VT == MVT::f64 && Subtarget.hasFRE()) ||
14802 (VT == MVT::v4f32 && Subtarget.hasAltivec()) ||
14803 (VT == MVT::v2f64 && Subtarget.hasVSX())) {
14804 if (RefinementSteps == ReciprocalEstimate::Unspecified)
14805 RefinementSteps = getEstimateRefinementSteps(VT, Subtarget);
14806 return DAG.getNode(Opcode: PPCISD::FRE, DL: SDLoc(Operand), VT, Operand);
14807 }
14808 return SDValue();
14809}
14810
14811unsigned PPCTargetLowering::combineRepeatedFPDivisors() const {
14812 // Note: This functionality is used only when arcp is enabled, and
14813 // on cores with reciprocal estimates (which are used when arcp is
14814 // enabled for division), this functionality is redundant with the default
14815 // combiner logic (once the division -> reciprocal/multiply transformation
14816 // has taken place). As a result, this matters more for older cores than for
14817 // newer ones.
14818
14819 // Combine multiple FDIVs with the same divisor into multiple FMULs by the
14820 // reciprocal if there are two or more FDIVs (for embedded cores with only
14821 // one FP pipeline) for three or more FDIVs (for generic OOO cores).
14822 switch (Subtarget.getCPUDirective()) {
14823 default:
14824 return 3;
14825 case PPC::DIR_440:
14826 case PPC::DIR_A2:
14827 case PPC::DIR_E500:
14828 case PPC::DIR_E500mc:
14829 case PPC::DIR_E5500:
14830 return 2;
14831 }
14832}
14833
14834// isConsecutiveLSLoc needs to work even if all adds have not yet been
14835// collapsed, and so we need to look through chains of them.
14836static void getBaseWithConstantOffset(SDValue Loc, SDValue &Base,
14837 int64_t& Offset, SelectionDAG &DAG) {
14838 if (DAG.isBaseWithConstantOffset(Op: Loc)) {
14839 Base = Loc.getOperand(i: 0);
14840 Offset += cast<ConstantSDNode>(Val: Loc.getOperand(i: 1))->getSExtValue();
14841
14842 // The base might itself be a base plus an offset, and if so, accumulate
14843 // that as well.
14844 getBaseWithConstantOffset(Loc: Loc.getOperand(i: 0), Base, Offset, DAG);
14845 }
14846}
14847
14848static bool isConsecutiveLSLoc(SDValue Loc, EVT VT, LSBaseSDNode *Base,
14849 unsigned Bytes, int Dist,
14850 SelectionDAG &DAG) {
14851 if (VT.getSizeInBits() / 8 != Bytes)
14852 return false;
14853
14854 SDValue BaseLoc = Base->getBasePtr();
14855 if (Loc.getOpcode() == ISD::FrameIndex) {
14856 if (BaseLoc.getOpcode() != ISD::FrameIndex)
14857 return false;
14858 const MachineFrameInfo &MFI = DAG.getMachineFunction().getFrameInfo();
14859 int FI = cast<FrameIndexSDNode>(Val&: Loc)->getIndex();
14860 int BFI = cast<FrameIndexSDNode>(Val&: BaseLoc)->getIndex();
14861 int FS = MFI.getObjectSize(ObjectIdx: FI);
14862 int BFS = MFI.getObjectSize(ObjectIdx: BFI);
14863 if (FS != BFS || FS != (int)Bytes) return false;
14864 return MFI.getObjectOffset(ObjectIdx: FI) == (MFI.getObjectOffset(ObjectIdx: BFI) + Dist*Bytes);
14865 }
14866
14867 SDValue Base1 = Loc, Base2 = BaseLoc;
14868 int64_t Offset1 = 0, Offset2 = 0;
14869 getBaseWithConstantOffset(Loc, Base&: Base1, Offset&: Offset1, DAG);
14870 getBaseWithConstantOffset(Loc: BaseLoc, Base&: Base2, Offset&: Offset2, DAG);
14871 if (Base1 == Base2 && Offset1 == (Offset2 + Dist * Bytes))
14872 return true;
14873
14874 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
14875 const GlobalValue *GV1 = nullptr;
14876 const GlobalValue *GV2 = nullptr;
14877 Offset1 = 0;
14878 Offset2 = 0;
14879 bool isGA1 = TLI.isGAPlusOffset(N: Loc.getNode(), GA&: GV1, Offset&: Offset1);
14880 bool isGA2 = TLI.isGAPlusOffset(N: BaseLoc.getNode(), GA&: GV2, Offset&: Offset2);
14881 if (isGA1 && isGA2 && GV1 == GV2)
14882 return Offset1 == (Offset2 + Dist*Bytes);
14883 return false;
14884}
14885
14886// Like SelectionDAG::isConsecutiveLoad, but also works for stores, and does
14887// not enforce equality of the chain operands.
14888static bool isConsecutiveLS(SDNode *N, LSBaseSDNode *Base,
14889 unsigned Bytes, int Dist,
14890 SelectionDAG &DAG) {
14891 if (LSBaseSDNode *LS = dyn_cast<LSBaseSDNode>(Val: N)) {
14892 EVT VT = LS->getMemoryVT();
14893 SDValue Loc = LS->getBasePtr();
14894 return isConsecutiveLSLoc(Loc, VT, Base, Bytes, Dist, DAG);
14895 }
14896
14897 if (N->getOpcode() == ISD::INTRINSIC_W_CHAIN) {
14898 EVT VT;
14899 switch (N->getConstantOperandVal(Num: 1)) {
14900 default: return false;
14901 case Intrinsic::ppc_altivec_lvx:
14902 case Intrinsic::ppc_altivec_lvxl:
14903 case Intrinsic::ppc_vsx_lxvw4x:
14904 case Intrinsic::ppc_vsx_lxvw4x_be:
14905 VT = MVT::v4i32;
14906 break;
14907 case Intrinsic::ppc_vsx_lxvd2x:
14908 case Intrinsic::ppc_vsx_lxvd2x_be:
14909 VT = MVT::v2f64;
14910 break;
14911 case Intrinsic::ppc_altivec_lvebx:
14912 VT = MVT::i8;
14913 break;
14914 case Intrinsic::ppc_altivec_lvehx:
14915 VT = MVT::i16;
14916 break;
14917 case Intrinsic::ppc_altivec_lvewx:
14918 VT = MVT::i32;
14919 break;
14920 }
14921
14922 return isConsecutiveLSLoc(Loc: N->getOperand(Num: 2), VT, Base, Bytes, Dist, DAG);
14923 }
14924
14925 if (N->getOpcode() == ISD::INTRINSIC_VOID) {
14926 EVT VT;
14927 switch (N->getConstantOperandVal(Num: 1)) {
14928 default: return false;
14929 case Intrinsic::ppc_altivec_stvx:
14930 case Intrinsic::ppc_altivec_stvxl:
14931 case Intrinsic::ppc_vsx_stxvw4x:
14932 VT = MVT::v4i32;
14933 break;
14934 case Intrinsic::ppc_vsx_stxvd2x:
14935 VT = MVT::v2f64;
14936 break;
14937 case Intrinsic::ppc_vsx_stxvw4x_be:
14938 VT = MVT::v4i32;
14939 break;
14940 case Intrinsic::ppc_vsx_stxvd2x_be:
14941 VT = MVT::v2f64;
14942 break;
14943 case Intrinsic::ppc_altivec_stvebx:
14944 VT = MVT::i8;
14945 break;
14946 case Intrinsic::ppc_altivec_stvehx:
14947 VT = MVT::i16;
14948 break;
14949 case Intrinsic::ppc_altivec_stvewx:
14950 VT = MVT::i32;
14951 break;
14952 }
14953
14954 return isConsecutiveLSLoc(Loc: N->getOperand(Num: 3), VT, Base, Bytes, Dist, DAG);
14955 }
14956
14957 return false;
14958}
14959
// Return true if there is a nearby consecutive load to the one provided
// (regardless of alignment). We search up and down the chain, looking through
// token factors and other loads (but nothing else). As a result, a true result
// indicates that it is safe to create a new consecutive load adjacent to the
// load provided.
static bool findConsecutiveLoad(LoadSDNode *LD, SelectionDAG &DAG) {
  SDValue Chain = LD->getChain();
  EVT VT = LD->getMemoryVT();

  // Roots of the upward search: nodes just above the top-level loads and
  // token factors, used as starting points for the downward phase.
  SmallPtrSet<SDNode *, 16> LoadRoots;
  SmallVector<SDNode *, 8> Queue(1, Chain.getNode());
  SmallPtrSet<SDNode *, 16> Visited;

  // First, search up the chain, branching to follow all token-factor operands.
  // If we find a consecutive load, then we're done, otherwise, record all
  // nodes just above the top-level loads and token factors.
  while (!Queue.empty()) {
    SDNode *ChainNext = Queue.pop_back_val();
    if (!Visited.insert(Ptr: ChainNext).second)
      continue;

    if (MemSDNode *ChainLD = dyn_cast<MemSDNode>(Val: ChainNext)) {
      // A memory access at distance +1 from LD is the adjacency we seek.
      if (isConsecutiveLS(N: ChainLD, Base: LD, Bytes: VT.getStoreSize(), Dist: 1, DAG))
        return true;

      // Keep walking up through this access's own chain.
      if (!Visited.count(Ptr: ChainLD->getChain().getNode()))
        Queue.push_back(Elt: ChainLD->getChain().getNode());
    } else if (ChainNext->getOpcode() == ISD::TokenFactor) {
      // Token factors merge several chains; follow every operand.
      for (const SDUse &O : ChainNext->ops())
        if (!Visited.count(Ptr: O.getNode()))
          Queue.push_back(Elt: O.getNode());
    } else
      // Anything else terminates the upward walk; remember it as a root.
      LoadRoots.insert(Ptr: ChainNext);
  }

  // Second, search down the chain, starting from the top-level nodes recorded
  // in the first phase. These top-level nodes are the nodes just above all
  // loads and token factors. Starting with their uses, recursively look though
  // all loads (just the chain uses) and token factors to find a consecutive
  // load.
  Visited.clear();
  Queue.clear();

  for (SDNode *I : LoadRoots) {
    Queue.push_back(Elt: I);

    while (!Queue.empty()) {
      SDNode *LoadRoot = Queue.pop_back_val();
      if (!Visited.insert(Ptr: LoadRoot).second)
        continue;

      if (MemSDNode *ChainLD = dyn_cast<MemSDNode>(Val: LoadRoot))
        if (isConsecutiveLS(N: ChainLD, Base: LD, Bytes: VT.getStoreSize(), Dist: 1, DAG))
          return true;

      // Descend only into users that consume LoadRoot as their chain (memory
      // accesses) or are token factors; other users are unrelated value uses.
      for (SDNode *U : LoadRoot->users())
        if (((isa<MemSDNode>(Val: U) &&
              cast<MemSDNode>(Val: U)->getChain().getNode() == LoadRoot) ||
             U->getOpcode() == ISD::TokenFactor) &&
            !Visited.count(Ptr: U))
          Queue.push_back(Elt: U);
    }
  }

  return false;
}
15026
15027/// This function is called when we have proved that a SETCC node can be replaced
15028/// by subtraction (and other supporting instructions) so that the result of
15029/// comparison is kept in a GPR instead of CR. This function is purely for
15030/// codegen purposes and has some flags to guide the codegen process.
15031static SDValue generateEquivalentSub(SDNode *N, int Size, bool Complement,
15032 bool Swap, SDLoc &DL, SelectionDAG &DAG) {
15033 assert(N->getOpcode() == ISD::SETCC && "ISD::SETCC Expected.");
15034
15035 // Zero extend the operands to the largest legal integer. Originally, they
15036 // must be of a strictly smaller size.
15037 auto Op0 = DAG.getNode(Opcode: ISD::ZERO_EXTEND, DL, VT: MVT::i64, N1: N->getOperand(Num: 0),
15038 N2: DAG.getConstant(Val: Size, DL, VT: MVT::i32));
15039 auto Op1 = DAG.getNode(Opcode: ISD::ZERO_EXTEND, DL, VT: MVT::i64, N1: N->getOperand(Num: 1),
15040 N2: DAG.getConstant(Val: Size, DL, VT: MVT::i32));
15041
15042 // Swap if needed. Depends on the condition code.
15043 if (Swap)
15044 std::swap(a&: Op0, b&: Op1);
15045
15046 // Subtract extended integers.
15047 auto SubNode = DAG.getNode(Opcode: ISD::SUB, DL, VT: MVT::i64, N1: Op0, N2: Op1);
15048
15049 // Move the sign bit to the least significant position and zero out the rest.
15050 // Now the least significant bit carries the result of original comparison.
15051 auto Shifted = DAG.getNode(Opcode: ISD::SRL, DL, VT: MVT::i64, N1: SubNode,
15052 N2: DAG.getConstant(Val: Size - 1, DL, VT: MVT::i32));
15053 auto Final = Shifted;
15054
15055 // Complement the result if needed. Based on the condition code.
15056 if (Complement)
15057 Final = DAG.getNode(Opcode: ISD::XOR, DL, VT: MVT::i64, N1: Shifted,
15058 N2: DAG.getConstant(Val: 1, DL, VT: MVT::i64));
15059
15060 return DAG.getNode(Opcode: ISD::TRUNCATE, DL, VT: MVT::i1, Operand: Final);
15061}
15062
15063SDValue PPCTargetLowering::ConvertSETCCToSubtract(SDNode *N,
15064 DAGCombinerInfo &DCI) const {
15065 assert(N->getOpcode() == ISD::SETCC && "ISD::SETCC Expected.");
15066
15067 SelectionDAG &DAG = DCI.DAG;
15068 SDLoc DL(N);
15069
15070 // Size of integers being compared has a critical role in the following
15071 // analysis, so we prefer to do this when all types are legal.
15072 if (!DCI.isAfterLegalizeDAG())
15073 return SDValue();
15074
15075 // If all users of SETCC extend its value to a legal integer type
15076 // then we replace SETCC with a subtraction
15077 for (const SDNode *U : N->users())
15078 if (U->getOpcode() != ISD::ZERO_EXTEND)
15079 return SDValue();
15080
15081 ISD::CondCode CC = cast<CondCodeSDNode>(Val: N->getOperand(Num: 2))->get();
15082 auto OpSize = N->getOperand(Num: 0).getValueSizeInBits();
15083
15084 unsigned Size = DAG.getDataLayout().getLargestLegalIntTypeSizeInBits();
15085
15086 if (OpSize < Size) {
15087 switch (CC) {
15088 default: break;
15089 case ISD::SETULT:
15090 return generateEquivalentSub(N, Size, Complement: false, Swap: false, DL, DAG);
15091 case ISD::SETULE:
15092 return generateEquivalentSub(N, Size, Complement: true, Swap: true, DL, DAG);
15093 case ISD::SETUGT:
15094 return generateEquivalentSub(N, Size, Complement: false, Swap: true, DL, DAG);
15095 case ISD::SETUGE:
15096 return generateEquivalentSub(N, Size, Complement: true, Swap: false, DL, DAG);
15097 }
15098 }
15099
15100 return SDValue();
15101}
15102
15103SDValue PPCTargetLowering::DAGCombineTruncBoolExt(SDNode *N,
15104 DAGCombinerInfo &DCI) const {
15105 SelectionDAG &DAG = DCI.DAG;
15106 SDLoc dl(N);
15107
15108 assert(Subtarget.useCRBits() && "Expecting to be tracking CR bits");
15109 // If we're tracking CR bits, we need to be careful that we don't have:
15110 // trunc(binary-ops(zext(x), zext(y)))
15111 // or
15112 // trunc(binary-ops(binary-ops(zext(x), zext(y)), ...)
15113 // such that we're unnecessarily moving things into GPRs when it would be
15114 // better to keep them in CR bits.
15115
15116 // Note that trunc here can be an actual i1 trunc, or can be the effective
15117 // truncation that comes from a setcc or select_cc.
15118 if (N->getOpcode() == ISD::TRUNCATE &&
15119 N->getValueType(ResNo: 0) != MVT::i1)
15120 return SDValue();
15121
15122 if (N->getOperand(Num: 0).getValueType() != MVT::i32 &&
15123 N->getOperand(Num: 0).getValueType() != MVT::i64)
15124 return SDValue();
15125
15126 if (N->getOpcode() == ISD::SETCC ||
15127 N->getOpcode() == ISD::SELECT_CC) {
15128 // If we're looking at a comparison, then we need to make sure that the
15129 // high bits (all except for the first) don't matter the result.
15130 ISD::CondCode CC =
15131 cast<CondCodeSDNode>(Val: N->getOperand(
15132 Num: N->getOpcode() == ISD::SETCC ? 2 : 4))->get();
15133 unsigned OpBits = N->getOperand(Num: 0).getValueSizeInBits();
15134
15135 if (ISD::isSignedIntSetCC(Code: CC)) {
15136 if (DAG.ComputeNumSignBits(Op: N->getOperand(Num: 0)) != OpBits ||
15137 DAG.ComputeNumSignBits(Op: N->getOperand(Num: 1)) != OpBits)
15138 return SDValue();
15139 } else if (ISD::isUnsignedIntSetCC(Code: CC)) {
15140 if (!DAG.MaskedValueIsZero(Op: N->getOperand(Num: 0),
15141 Mask: APInt::getHighBitsSet(numBits: OpBits, hiBitsSet: OpBits-1)) ||
15142 !DAG.MaskedValueIsZero(Op: N->getOperand(Num: 1),
15143 Mask: APInt::getHighBitsSet(numBits: OpBits, hiBitsSet: OpBits-1)))
15144 return (N->getOpcode() == ISD::SETCC ? ConvertSETCCToSubtract(N, DCI)
15145 : SDValue());
15146 } else {
15147 // This is neither a signed nor an unsigned comparison, just make sure
15148 // that the high bits are equal.
15149 KnownBits Op1Known = DAG.computeKnownBits(Op: N->getOperand(Num: 0));
15150 KnownBits Op2Known = DAG.computeKnownBits(Op: N->getOperand(Num: 1));
15151
15152 // We don't really care about what is known about the first bit (if
15153 // anything), so pretend that it is known zero for both to ensure they can
15154 // be compared as constants.
15155 Op1Known.Zero.setBit(0); Op1Known.One.clearBit(BitPosition: 0);
15156 Op2Known.Zero.setBit(0); Op2Known.One.clearBit(BitPosition: 0);
15157
15158 if (!Op1Known.isConstant() || !Op2Known.isConstant() ||
15159 Op1Known.getConstant() != Op2Known.getConstant())
15160 return SDValue();
15161 }
15162 }
15163
15164 // We now know that the higher-order bits are irrelevant, we just need to
15165 // make sure that all of the intermediate operations are bit operations, and
15166 // all inputs are extensions.
15167 if (N->getOperand(Num: 0).getOpcode() != ISD::AND &&
15168 N->getOperand(Num: 0).getOpcode() != ISD::OR &&
15169 N->getOperand(Num: 0).getOpcode() != ISD::XOR &&
15170 N->getOperand(Num: 0).getOpcode() != ISD::SELECT &&
15171 N->getOperand(Num: 0).getOpcode() != ISD::SELECT_CC &&
15172 N->getOperand(Num: 0).getOpcode() != ISD::TRUNCATE &&
15173 N->getOperand(Num: 0).getOpcode() != ISD::SIGN_EXTEND &&
15174 N->getOperand(Num: 0).getOpcode() != ISD::ZERO_EXTEND &&
15175 N->getOperand(Num: 0).getOpcode() != ISD::ANY_EXTEND)
15176 return SDValue();
15177
15178 if ((N->getOpcode() == ISD::SETCC || N->getOpcode() == ISD::SELECT_CC) &&
15179 N->getOperand(Num: 1).getOpcode() != ISD::AND &&
15180 N->getOperand(Num: 1).getOpcode() != ISD::OR &&
15181 N->getOperand(Num: 1).getOpcode() != ISD::XOR &&
15182 N->getOperand(Num: 1).getOpcode() != ISD::SELECT &&
15183 N->getOperand(Num: 1).getOpcode() != ISD::SELECT_CC &&
15184 N->getOperand(Num: 1).getOpcode() != ISD::TRUNCATE &&
15185 N->getOperand(Num: 1).getOpcode() != ISD::SIGN_EXTEND &&
15186 N->getOperand(Num: 1).getOpcode() != ISD::ZERO_EXTEND &&
15187 N->getOperand(Num: 1).getOpcode() != ISD::ANY_EXTEND)
15188 return SDValue();
15189
15190 SmallVector<SDValue, 4> Inputs;
15191 SmallVector<SDValue, 8> BinOps, PromOps;
15192 SmallPtrSet<SDNode *, 16> Visited;
15193
15194 for (unsigned i = 0; i < 2; ++i) {
15195 if (((N->getOperand(Num: i).getOpcode() == ISD::SIGN_EXTEND ||
15196 N->getOperand(Num: i).getOpcode() == ISD::ZERO_EXTEND ||
15197 N->getOperand(Num: i).getOpcode() == ISD::ANY_EXTEND) &&
15198 N->getOperand(Num: i).getOperand(i: 0).getValueType() == MVT::i1) ||
15199 isa<ConstantSDNode>(Val: N->getOperand(Num: i)))
15200 Inputs.push_back(Elt: N->getOperand(Num: i));
15201 else
15202 BinOps.push_back(Elt: N->getOperand(Num: i));
15203
15204 if (N->getOpcode() == ISD::TRUNCATE)
15205 break;
15206 }
15207
15208 // Visit all inputs, collect all binary operations (and, or, xor and
15209 // select) that are all fed by extensions.
15210 while (!BinOps.empty()) {
15211 SDValue BinOp = BinOps.pop_back_val();
15212
15213 if (!Visited.insert(Ptr: BinOp.getNode()).second)
15214 continue;
15215
15216 PromOps.push_back(Elt: BinOp);
15217
15218 for (unsigned i = 0, ie = BinOp.getNumOperands(); i != ie; ++i) {
15219 // The condition of the select is not promoted.
15220 if (BinOp.getOpcode() == ISD::SELECT && i == 0)
15221 continue;
15222 if (BinOp.getOpcode() == ISD::SELECT_CC && i != 2 && i != 3)
15223 continue;
15224
15225 if (((BinOp.getOperand(i).getOpcode() == ISD::SIGN_EXTEND ||
15226 BinOp.getOperand(i).getOpcode() == ISD::ZERO_EXTEND ||
15227 BinOp.getOperand(i).getOpcode() == ISD::ANY_EXTEND) &&
15228 BinOp.getOperand(i).getOperand(i: 0).getValueType() == MVT::i1) ||
15229 isa<ConstantSDNode>(Val: BinOp.getOperand(i))) {
15230 Inputs.push_back(Elt: BinOp.getOperand(i));
15231 } else if (BinOp.getOperand(i).getOpcode() == ISD::AND ||
15232 BinOp.getOperand(i).getOpcode() == ISD::OR ||
15233 BinOp.getOperand(i).getOpcode() == ISD::XOR ||
15234 BinOp.getOperand(i).getOpcode() == ISD::SELECT ||
15235 BinOp.getOperand(i).getOpcode() == ISD::SELECT_CC ||
15236 BinOp.getOperand(i).getOpcode() == ISD::TRUNCATE ||
15237 BinOp.getOperand(i).getOpcode() == ISD::SIGN_EXTEND ||
15238 BinOp.getOperand(i).getOpcode() == ISD::ZERO_EXTEND ||
15239 BinOp.getOperand(i).getOpcode() == ISD::ANY_EXTEND) {
15240 BinOps.push_back(Elt: BinOp.getOperand(i));
15241 } else {
15242 // We have an input that is not an extension or another binary
15243 // operation; we'll abort this transformation.
15244 return SDValue();
15245 }
15246 }
15247 }
15248
15249 // Make sure that this is a self-contained cluster of operations (which
15250 // is not quite the same thing as saying that everything has only one
15251 // use).
15252 for (unsigned i = 0, ie = Inputs.size(); i != ie; ++i) {
15253 if (isa<ConstantSDNode>(Val: Inputs[i]))
15254 continue;
15255
15256 for (const SDNode *User : Inputs[i].getNode()->users()) {
15257 if (User != N && !Visited.count(Ptr: User))
15258 return SDValue();
15259
15260 // Make sure that we're not going to promote the non-output-value
15261 // operand(s) or SELECT or SELECT_CC.
15262 // FIXME: Although we could sometimes handle this, and it does occur in
15263 // practice that one of the condition inputs to the select is also one of
15264 // the outputs, we currently can't deal with this.
15265 if (User->getOpcode() == ISD::SELECT) {
15266 if (User->getOperand(Num: 0) == Inputs[i])
15267 return SDValue();
15268 } else if (User->getOpcode() == ISD::SELECT_CC) {
15269 if (User->getOperand(Num: 0) == Inputs[i] ||
15270 User->getOperand(Num: 1) == Inputs[i])
15271 return SDValue();
15272 }
15273 }
15274 }
15275
15276 for (unsigned i = 0, ie = PromOps.size(); i != ie; ++i) {
15277 for (const SDNode *User : PromOps[i].getNode()->users()) {
15278 if (User != N && !Visited.count(Ptr: User))
15279 return SDValue();
15280
15281 // Make sure that we're not going to promote the non-output-value
15282 // operand(s) or SELECT or SELECT_CC.
15283 // FIXME: Although we could sometimes handle this, and it does occur in
15284 // practice that one of the condition inputs to the select is also one of
15285 // the outputs, we currently can't deal with this.
15286 if (User->getOpcode() == ISD::SELECT) {
15287 if (User->getOperand(Num: 0) == PromOps[i])
15288 return SDValue();
15289 } else if (User->getOpcode() == ISD::SELECT_CC) {
15290 if (User->getOperand(Num: 0) == PromOps[i] ||
15291 User->getOperand(Num: 1) == PromOps[i])
15292 return SDValue();
15293 }
15294 }
15295 }
15296
15297 // Replace all inputs with the extension operand.
15298 for (unsigned i = 0, ie = Inputs.size(); i != ie; ++i) {
15299 // Constants may have users outside the cluster of to-be-promoted nodes,
15300 // and so we need to replace those as we do the promotions.
15301 if (isa<ConstantSDNode>(Val: Inputs[i]))
15302 continue;
15303 else
15304 DAG.ReplaceAllUsesOfValueWith(From: Inputs[i], To: Inputs[i].getOperand(i: 0));
15305 }
15306
15307 std::list<HandleSDNode> PromOpHandles;
15308 for (auto &PromOp : PromOps)
15309 PromOpHandles.emplace_back(args&: PromOp);
15310
15311 // Replace all operations (these are all the same, but have a different
15312 // (i1) return type). DAG.getNode will validate that the types of
15313 // a binary operator match, so go through the list in reverse so that
15314 // we've likely promoted both operands first. Any intermediate truncations or
15315 // extensions disappear.
15316 while (!PromOpHandles.empty()) {
15317 SDValue PromOp = PromOpHandles.back().getValue();
15318 PromOpHandles.pop_back();
15319
15320 if (PromOp.getOpcode() == ISD::TRUNCATE ||
15321 PromOp.getOpcode() == ISD::SIGN_EXTEND ||
15322 PromOp.getOpcode() == ISD::ZERO_EXTEND ||
15323 PromOp.getOpcode() == ISD::ANY_EXTEND) {
15324 if (!isa<ConstantSDNode>(Val: PromOp.getOperand(i: 0)) &&
15325 PromOp.getOperand(i: 0).getValueType() != MVT::i1) {
15326 // The operand is not yet ready (see comment below).
15327 PromOpHandles.emplace_front(args&: PromOp);
15328 continue;
15329 }
15330
15331 SDValue RepValue = PromOp.getOperand(i: 0);
15332 if (isa<ConstantSDNode>(Val: RepValue))
15333 RepValue = DAG.getNode(Opcode: ISD::TRUNCATE, DL: dl, VT: MVT::i1, Operand: RepValue);
15334
15335 DAG.ReplaceAllUsesOfValueWith(From: PromOp, To: RepValue);
15336 continue;
15337 }
15338
15339 unsigned C;
15340 switch (PromOp.getOpcode()) {
15341 default: C = 0; break;
15342 case ISD::SELECT: C = 1; break;
15343 case ISD::SELECT_CC: C = 2; break;
15344 }
15345
15346 if ((!isa<ConstantSDNode>(Val: PromOp.getOperand(i: C)) &&
15347 PromOp.getOperand(i: C).getValueType() != MVT::i1) ||
15348 (!isa<ConstantSDNode>(Val: PromOp.getOperand(i: C+1)) &&
15349 PromOp.getOperand(i: C+1).getValueType() != MVT::i1)) {
15350 // The to-be-promoted operands of this node have not yet been
15351 // promoted (this should be rare because we're going through the
15352 // list backward, but if one of the operands has several users in
15353 // this cluster of to-be-promoted nodes, it is possible).
15354 PromOpHandles.emplace_front(args&: PromOp);
15355 continue;
15356 }
15357
15358 SmallVector<SDValue, 3> Ops(PromOp.getNode()->ops());
15359
15360 // If there are any constant inputs, make sure they're replaced now.
15361 for (unsigned i = 0; i < 2; ++i)
15362 if (isa<ConstantSDNode>(Val: Ops[C+i]))
15363 Ops[C+i] = DAG.getNode(Opcode: ISD::TRUNCATE, DL: dl, VT: MVT::i1, Operand: Ops[C+i]);
15364
15365 DAG.ReplaceAllUsesOfValueWith(From: PromOp,
15366 To: DAG.getNode(Opcode: PromOp.getOpcode(), DL: dl, VT: MVT::i1, Ops));
15367 }
15368
15369 // Now we're left with the initial truncation itself.
15370 if (N->getOpcode() == ISD::TRUNCATE)
15371 return N->getOperand(Num: 0);
15372
15373 // Otherwise, this is a comparison. The operands to be compared have just
15374 // changed type (to i1), but everything else is the same.
15375 return SDValue(N, 0);
15376}
15377
// Attempt to elide a sign/zero/any-extension of a cluster of logical
// operations (and/or/xor/select/select_cc fed by truncations) by promoting
// the whole cluster to the extended type. Returns the replacement value, or
// an empty SDValue if the transformation does not apply.
SDValue PPCTargetLowering::DAGCombineExtBoolTrunc(SDNode *N,
                                                  DAGCombinerInfo &DCI) const {
  SelectionDAG &DAG = DCI.DAG;
  SDLoc dl(N);

  // If we're tracking CR bits, we need to be careful that we don't have:
  // zext(binary-ops(trunc(x), trunc(y)))
  // or
  // zext(binary-ops(binary-ops(trunc(x), trunc(y)), ...)
  // such that we're unnecessarily moving things into CR bits that can more
  // efficiently stay in GPRs. Note that if we're not certain that the high
  // bits are set as required by the final extension, we still may need to do
  // some masking to get the proper behavior.

  // This same functionality is important on PPC64 when dealing with
  // 32-to-64-bit extensions; these occur often when 32-bit values are used as
  // the return values of functions. Because it is so similar, it is handled
  // here as well.

  // Only i32/i64 extension results are handled.
  if (N->getValueType(ResNo: 0) != MVT::i32 &&
      N->getValueType(ResNo: 0) != MVT::i64)
    return SDValue();

  // The source must be either an i1 (with CR-bit tracking enabled) or an
  // i32 on a 64-bit target.
  if (!((N->getOperand(Num: 0).getValueType() == MVT::i1 && Subtarget.useCRBits()) ||
        (N->getOperand(Num: 0).getValueType() == MVT::i32 && Subtarget.isPPC64())))
    return SDValue();

  // The extended value must be produced by one of the promotable logical
  // operations.
  if (N->getOperand(Num: 0).getOpcode() != ISD::AND &&
      N->getOperand(Num: 0).getOpcode() != ISD::OR &&
      N->getOperand(Num: 0).getOpcode() != ISD::XOR &&
      N->getOperand(Num: 0).getOpcode() != ISD::SELECT &&
      N->getOperand(Num: 0).getOpcode() != ISD::SELECT_CC)
    return SDValue();

  SmallVector<SDValue, 4> Inputs;
  SmallVector<SDValue, 8> BinOps(1, N->getOperand(Num: 0)), PromOps;
  SmallPtrSet<SDNode *, 16> Visited;

  // Visit all inputs, collect all binary operations (and, or, xor and
  // select) that are all fed by truncations.
  while (!BinOps.empty()) {
    SDValue BinOp = BinOps.pop_back_val();

    if (!Visited.insert(Ptr: BinOp.getNode()).second)
      continue;

    PromOps.push_back(Elt: BinOp);

    for (unsigned i = 0, ie = BinOp.getNumOperands(); i != ie; ++i) {
      // The condition of the select is not promoted.
      if (BinOp.getOpcode() == ISD::SELECT && i == 0)
        continue;
      if (BinOp.getOpcode() == ISD::SELECT_CC && i != 2 && i != 3)
        continue;

      // Truncations and constants terminate the search (they are the cluster
      // inputs); nested logical operations are queued for further traversal.
      if (BinOp.getOperand(i).getOpcode() == ISD::TRUNCATE ||
          isa<ConstantSDNode>(Val: BinOp.getOperand(i))) {
        Inputs.push_back(Elt: BinOp.getOperand(i));
      } else if (BinOp.getOperand(i).getOpcode() == ISD::AND ||
                 BinOp.getOperand(i).getOpcode() == ISD::OR ||
                 BinOp.getOperand(i).getOpcode() == ISD::XOR ||
                 BinOp.getOperand(i).getOpcode() == ISD::SELECT ||
                 BinOp.getOperand(i).getOpcode() == ISD::SELECT_CC) {
        BinOps.push_back(Elt: BinOp.getOperand(i));
      } else {
        // We have an input that is not a truncation or another binary
        // operation; we'll abort this transformation.
        return SDValue();
      }
    }
  }

  // The operands of a select that must be truncated when the select is
  // promoted because the operand is actually part of the to-be-promoted set.
  DenseMap<SDNode *, EVT> SelectTruncOp[2];

  // Make sure that this is a self-contained cluster of operations (which
  // is not quite the same thing as saying that everything has only one
  // use).
  for (unsigned i = 0, ie = Inputs.size(); i != ie; ++i) {
    if (isa<ConstantSDNode>(Val: Inputs[i]))
      continue;

    for (SDNode *User : Inputs[i].getNode()->users()) {
      if (User != N && !Visited.count(Ptr: User))
        return SDValue();

      // If we're going to promote the non-output-value operand(s) or SELECT or
      // SELECT_CC, record them for truncation.
      if (User->getOpcode() == ISD::SELECT) {
        if (User->getOperand(Num: 0) == Inputs[i])
          SelectTruncOp[0].insert(KV: std::make_pair(x&: User,
                                  y: User->getOperand(Num: 0).getValueType()));
      } else if (User->getOpcode() == ISD::SELECT_CC) {
        if (User->getOperand(Num: 0) == Inputs[i])
          SelectTruncOp[0].insert(KV: std::make_pair(x&: User,
                                  y: User->getOperand(Num: 0).getValueType()));
        if (User->getOperand(Num: 1) == Inputs[i])
          SelectTruncOp[1].insert(KV: std::make_pair(x&: User,
                                  y: User->getOperand(Num: 1).getValueType()));
      }
    }
  }

  for (unsigned i = 0, ie = PromOps.size(); i != ie; ++i) {
    for (SDNode *User : PromOps[i].getNode()->users()) {
      if (User != N && !Visited.count(Ptr: User))
        return SDValue();

      // If we're going to promote the non-output-value operand(s) or SELECT or
      // SELECT_CC, record them for truncation.
      if (User->getOpcode() == ISD::SELECT) {
        if (User->getOperand(Num: 0) == PromOps[i])
          SelectTruncOp[0].insert(KV: std::make_pair(x&: User,
                                  y: User->getOperand(Num: 0).getValueType()));
      } else if (User->getOpcode() == ISD::SELECT_CC) {
        if (User->getOperand(Num: 0) == PromOps[i])
          SelectTruncOp[0].insert(KV: std::make_pair(x&: User,
                                  y: User->getOperand(Num: 0).getValueType()));
        if (User->getOperand(Num: 1) == PromOps[i])
          SelectTruncOp[1].insert(KV: std::make_pair(x&: User,
                                  y: User->getOperand(Num: 1).getValueType()));
      }
    }
  }

  // PromBits is the width of the pre-extension value being promoted.
  unsigned PromBits = N->getOperand(Num: 0).getValueSizeInBits();
  bool ReallyNeedsExt = false;
  if (N->getOpcode() != ISD::ANY_EXTEND) {
    // If all of the inputs are not already sign/zero extended, then
    // we'll still need to do that at the end.
    for (unsigned i = 0, ie = Inputs.size(); i != ie; ++i) {
      if (isa<ConstantSDNode>(Val: Inputs[i]))
        continue;

      unsigned OpBits =
        Inputs[i].getOperand(i: 0).getValueSizeInBits();
      assert(PromBits < OpBits && "Truncation not to a smaller bit count?");

      if ((N->getOpcode() == ISD::ZERO_EXTEND &&
           !DAG.MaskedValueIsZero(Op: Inputs[i].getOperand(i: 0),
                                  Mask: APInt::getHighBitsSet(numBits: OpBits,
                                                        hiBitsSet: OpBits-PromBits))) ||
          (N->getOpcode() == ISD::SIGN_EXTEND &&
           DAG.ComputeNumSignBits(Op: Inputs[i].getOperand(i: 0)) <
             (OpBits-(PromBits-1)))) {
        ReallyNeedsExt = true;
        break;
      }
    }
  }

  // Convert PromOps to handles before doing any RAUW operations, as these
  // may CSE with existing nodes, deleting the originals.
  std::list<HandleSDNode> PromOpHandles;
  for (auto &PromOp : PromOps)
    PromOpHandles.emplace_back(args&: PromOp);

  // Replace all inputs, either with the truncation operand, or a
  // truncation or extension to the final output type.
  for (unsigned i = 0, ie = Inputs.size(); i != ie; ++i) {
    // Constant inputs need to be replaced with the to-be-promoted nodes that
    // use them because they might have users outside of the cluster of
    // promoted nodes.
    if (isa<ConstantSDNode>(Val: Inputs[i]))
      continue;

    SDValue InSrc = Inputs[i].getOperand(i: 0);
    if (Inputs[i].getValueType() == N->getValueType(ResNo: 0))
      DAG.ReplaceAllUsesOfValueWith(From: Inputs[i], To: InSrc);
    else if (N->getOpcode() == ISD::SIGN_EXTEND)
      DAG.ReplaceAllUsesOfValueWith(From: Inputs[i],
        To: DAG.getSExtOrTrunc(Op: InSrc, DL: dl, VT: N->getValueType(ResNo: 0)));
    else if (N->getOpcode() == ISD::ZERO_EXTEND)
      DAG.ReplaceAllUsesOfValueWith(From: Inputs[i],
        To: DAG.getZExtOrTrunc(Op: InSrc, DL: dl, VT: N->getValueType(ResNo: 0)));
    else
      DAG.ReplaceAllUsesOfValueWith(From: Inputs[i],
        To: DAG.getAnyExtOrTrunc(Op: InSrc, DL: dl, VT: N->getValueType(ResNo: 0)));
  }

  // Replace all operations (these are all the same, but have a different
  // (promoted) return type). DAG.getNode will validate that the types of
  // a binary operator match, so go through the list in reverse so that
  // we've likely promoted both operands first.
  while (!PromOpHandles.empty()) {
    SDValue PromOp = PromOpHandles.back().getValue();
    PromOpHandles.pop_back();

    // C is the index of the first to-be-promoted value operand: selects
    // carry their (unpromoted) condition/compare operands first.
    unsigned C;
    switch (PromOp.getOpcode()) {
    default: C = 0; break;
    case ISD::SELECT: C = 1; break;
    case ISD::SELECT_CC: C = 2; break;
    }

    if ((!isa<ConstantSDNode>(Val: PromOp.getOperand(i: C)) &&
         PromOp.getOperand(i: C).getValueType() != N->getValueType(ResNo: 0)) ||
        (!isa<ConstantSDNode>(Val: PromOp.getOperand(i: C+1)) &&
         PromOp.getOperand(i: C+1).getValueType() != N->getValueType(ResNo: 0))) {
      // The to-be-promoted operands of this node have not yet been
      // promoted (this should be rare because we're going through the
      // list backward, but if one of the operands has several users in
      // this cluster of to-be-promoted nodes, it is possible).
      PromOpHandles.emplace_front(args&: PromOp);
      continue;
    }

    // For SELECT and SELECT_CC nodes, we do a similar check for any
    // to-be-promoted comparison inputs.
    if (PromOp.getOpcode() == ISD::SELECT ||
        PromOp.getOpcode() == ISD::SELECT_CC) {
      if ((SelectTruncOp[0].count(Val: PromOp.getNode()) &&
           PromOp.getOperand(i: 0).getValueType() != N->getValueType(ResNo: 0)) ||
          (SelectTruncOp[1].count(Val: PromOp.getNode()) &&
           PromOp.getOperand(i: 1).getValueType() != N->getValueType(ResNo: 0))) {
        PromOpHandles.emplace_front(args&: PromOp);
        continue;
      }
    }

    SmallVector<SDValue, 3> Ops(PromOp.getNode()->ops());

    // If this node has constant inputs, then they'll need to be promoted here.
    for (unsigned i = 0; i < 2; ++i) {
      if (!isa<ConstantSDNode>(Val: Ops[C+i]))
        continue;
      if (Ops[C+i].getValueType() == N->getValueType(ResNo: 0))
        continue;

      if (N->getOpcode() == ISD::SIGN_EXTEND)
        Ops[C+i] = DAG.getSExtOrTrunc(Op: Ops[C+i], DL: dl, VT: N->getValueType(ResNo: 0));
      else if (N->getOpcode() == ISD::ZERO_EXTEND)
        Ops[C+i] = DAG.getZExtOrTrunc(Op: Ops[C+i], DL: dl, VT: N->getValueType(ResNo: 0));
      else
        Ops[C+i] = DAG.getAnyExtOrTrunc(Op: Ops[C+i], DL: dl, VT: N->getValueType(ResNo: 0));
    }

    // If we've promoted the comparison inputs of a SELECT or SELECT_CC,
    // truncate them again to the original value type.
    if (PromOp.getOpcode() == ISD::SELECT ||
        PromOp.getOpcode() == ISD::SELECT_CC) {
      auto SI0 = SelectTruncOp[0].find(Val: PromOp.getNode());
      if (SI0 != SelectTruncOp[0].end())
        Ops[0] = DAG.getNode(Opcode: ISD::TRUNCATE, DL: dl, VT: SI0->second, Operand: Ops[0]);
      auto SI1 = SelectTruncOp[1].find(Val: PromOp.getNode());
      if (SI1 != SelectTruncOp[1].end())
        Ops[1] = DAG.getNode(Opcode: ISD::TRUNCATE, DL: dl, VT: SI1->second, Operand: Ops[1]);
    }

    DAG.ReplaceAllUsesOfValueWith(From: PromOp,
      To: DAG.getNode(Opcode: PromOp.getOpcode(), DL: dl, VT: N->getValueType(ResNo: 0), Ops));
  }

  // Now we're left with the initial extension itself.
  if (!ReallyNeedsExt)
    return N->getOperand(Num: 0);

  // To zero extend, just mask off everything except for the first bit (in the
  // i1 case).
  if (N->getOpcode() == ISD::ZERO_EXTEND)
    return DAG.getNode(Opcode: ISD::AND, DL: dl, VT: N->getValueType(ResNo: 0), N1: N->getOperand(Num: 0),
                       N2: DAG.getConstant(Val: APInt::getLowBitsSet(
                                            numBits: N->getValueSizeInBits(ResNo: 0), loBitsSet: PromBits),
                                          DL: dl, VT: N->getValueType(ResNo: 0)));

  // For sign extension, shift the value left and arithmetically shift it back
  // so the high bits replicate the (promoted) sign bit.
  assert(N->getOpcode() == ISD::SIGN_EXTEND &&
         "Invalid extension type");
  EVT ShiftAmountTy = getShiftAmountTy(LHSTy: N->getValueType(ResNo: 0), DL: DAG.getDataLayout());
  SDValue ShiftCst =
      DAG.getConstant(Val: N->getValueSizeInBits(ResNo: 0) - PromBits, DL: dl, VT: ShiftAmountTy);
  return DAG.getNode(
      Opcode: ISD::SRA, DL: dl, VT: N->getValueType(ResNo: 0),
      N1: DAG.getNode(Opcode: ISD::SHL, DL: dl, VT: N->getValueType(ResNo: 0), N1: N->getOperand(Num: 0), N2: ShiftCst),
      N2: ShiftCst);
}
15654
15655// The function check a i128 load can convert to 16i8 load for Vcmpequb.
15656static bool canConvertToVcmpequb(SDValue &LHS, SDValue &RHS) {
15657
15658 auto isValidForConvert = [](SDValue &Operand) {
15659 if (!Operand.hasOneUse())
15660 return false;
15661
15662 if (Operand.getValueType() != MVT::i128)
15663 return false;
15664
15665 if (Operand.getOpcode() == ISD::Constant)
15666 return true;
15667
15668 auto *LoadNode = dyn_cast<LoadSDNode>(Val&: Operand);
15669 if (!LoadNode)
15670 return false;
15671
15672 // If memory operation is volatile, do not perform any
15673 // optimization or transformation. Volatile operations must be preserved
15674 // as written to ensure correct program behavior, so we return an empty
15675 // SDValue to indicate no action.
15676
15677 if (LoadNode->isVolatile())
15678 return false;
15679
15680 // Only combine loads if both use the unindexed addressing mode.
15681 // PowerPC AltiVec/VMX does not support vector loads or stores with
15682 // pre/post-increment addressing. Indexed modes may imply implicit
15683 // pointer updates, which are not compatible with AltiVec vector
15684 // instructions.
15685 if (LoadNode->getAddressingMode() != ISD::UNINDEXED)
15686 return false;
15687
15688 // Only combine loads if both are non-extending loads
15689 // (ISD::NON_EXTLOAD). Extending loads (such as ISD::ZEXTLOAD or
15690 // ISD::SEXTLOAD) perform zero or sign extension, which may change the
15691 // loaded value's semantics and are not compatible with vector loads.
15692 if (LoadNode->getExtensionType() != ISD::NON_EXTLOAD)
15693 return false;
15694
15695 return true;
15696 };
15697
15698 return (isValidForConvert(LHS) && isValidForConvert(RHS));
15699}
15700
15701SDValue convertTwoLoadsAndCmpToVCMPEQUB(SelectionDAG &DAG, SDNode *N,
15702 const SDLoc &DL) {
15703
15704 assert(N->getOpcode() == ISD::SETCC && "Should be called with a SETCC node");
15705
15706 ISD::CondCode CC = cast<CondCodeSDNode>(Val: N->getOperand(Num: 2))->get();
15707 assert((CC == ISD::SETNE || CC == ISD::SETEQ) &&
15708 "CC mus be ISD::SETNE or ISD::SETEQ");
15709
15710 auto getV16i8Load = [&](const SDValue &Operand) {
15711 if (Operand.getOpcode() == ISD::Constant)
15712 return DAG.getBitcast(VT: MVT::v16i8, V: Operand);
15713
15714 assert(Operand.getOpcode() == ISD::LOAD && "Must be LoadSDNode here.");
15715
15716 auto *LoadNode = cast<LoadSDNode>(Val: Operand);
15717 return DAG.getLoad(VT: MVT::v16i8, dl: DL, Chain: LoadNode->getChain(),
15718 Ptr: LoadNode->getBasePtr(), MMO: LoadNode->getMemOperand());
15719 };
15720
15721 // Following code transforms the DAG
15722 // t0: ch,glue = EntryToken
15723 // t2: i64,ch = CopyFromReg t0, Register:i64 %0
15724 // t3: i128,ch = load<(load (s128) from %ir.a, align 1)> t0, t2,
15725 // undef:i64
15726 // t4: i64,ch = CopyFromReg t0, Register:i64 %1
15727 // t5: i128,ch =
15728 // load<(load (s128) from %ir.b, align 1)> t0, t4, undef:i64 t6: i1 =
15729 // setcc t3, t5, setne:ch
15730 //
15731 // ---->
15732 //
15733 // t0: ch,glue = EntryToken
15734 // t2: i64,ch = CopyFromReg t0, Register:i64 %0
15735 // t3: v16i8,ch = load<(load (s128) from %ir.a, align 1)> t0, t2,
15736 // undef:i64
15737 // t4: i64,ch = CopyFromReg t0, Register:i64 %1
15738 // t5: v16i8,ch =
15739 // load<(load (s128) from %ir.b, align 1)> t0, t4, undef:i64
15740 // t6: i32 =
15741 // llvm.ppc.altivec.vcmpequb.p TargetConstant:i32<10505>,
15742 // Constant:i32<2>, t3, t5
15743 // t7: i1 = setcc t6, Constant:i32<0>, seteq:ch
15744
15745 // Or transforms the DAG
15746 // t5: i128,ch = load<(load (s128) from %ir.X, align 1)> t0, t2, undef:i64
15747 // t8: i1 =
15748 // setcc Constant:i128<237684487579686500932345921536>, t5, setne:ch
15749 //
15750 // --->
15751 //
15752 // t5: v16i8,ch = load<(load (s128) from %ir.X, align 1)> t0, t2, undef:i64
15753 // t6: v16i8 = bitcast Constant:i128<237684487579686500932345921536>
15754 // t7: i32 =
15755 // llvm.ppc.altivec.vcmpequb.p Constant:i32<10962>, Constant:i32<2>, t5, t2
15756
15757 SDValue LHSVec = getV16i8Load(N->getOperand(Num: 0));
15758 SDValue RHSVec = getV16i8Load(N->getOperand(Num: 1));
15759
15760 SDValue IntrID =
15761 DAG.getConstant(Val: Intrinsic::ppc_altivec_vcmpequb_p, DL, VT: MVT::i32);
15762 SDValue CRSel = DAG.getConstant(Val: 2, DL, VT: MVT::i32); // which CR6 predicate field
15763 SDValue PredResult = DAG.getNode(Opcode: ISD::INTRINSIC_WO_CHAIN, DL, VT: MVT::i32,
15764 N1: IntrID, N2: CRSel, N3: LHSVec, N4: RHSVec);
15765 // ppc_altivec_vcmpequb_p returns 1 when two vectors are the same,
15766 // so we need to invert the CC opcode.
15767 return DAG.getSetCC(DL, VT: N->getValueType(ResNo: 0), LHS: PredResult,
15768 RHS: DAG.getConstant(Val: 0, DL, VT: MVT::i32),
15769 Cond: CC == ISD::SETNE ? ISD::SETEQ : ISD::SETNE);
15770}
15771
15772// Detect whether there is a pattern like (setcc (and X, 1), 0, eq).
15773// If it is , return true; otherwise return false.
15774static bool canConvertSETCCToXori(SDNode *N) {
15775 assert(N->getOpcode() == ISD::SETCC && "Should be SETCC SDNode here.");
15776
15777 ISD::CondCode CC = cast<CondCodeSDNode>(Val: N->getOperand(Num: 2))->get();
15778 if (CC != ISD::SETEQ)
15779 return false;
15780
15781 SDValue LHS = N->getOperand(Num: 0);
15782 SDValue RHS = N->getOperand(Num: 1);
15783
15784 // Check the `SDValue &V` is from `and` with `1`.
15785 auto IsAndWithOne = [](SDValue &V) {
15786 if (V.getOpcode() == ISD::AND) {
15787 for (const SDValue &Op : V->ops())
15788 if (auto *C = dyn_cast<ConstantSDNode>(Val: Op))
15789 if (C->isOne())
15790 return true;
15791 }
15792 return false;
15793 };
15794
15795 // Check whether the SETCC compare with zero.
15796 auto IsCompareWithZero = [](SDValue &V) {
15797 if (auto *C = dyn_cast<ConstantSDNode>(Val&: V))
15798 if (C->isZero())
15799 return true;
15800 return false;
15801 };
15802
15803 return (IsAndWithOne(LHS) && IsCompareWithZero(RHS)) ||
15804 (IsAndWithOne(RHS) && IsCompareWithZero(LHS));
15805}
15806
15807// You must check whether the `SDNode* N` can be converted to Xori using
15808// the function `static bool canConvertSETCCToXori(SDNode *N)`
15809// before calling the function; otherwise, it may produce incorrect results.
15810static SDValue ConvertSETCCToXori(SDNode *N, SelectionDAG &DAG) {
15811
15812 assert(N->getOpcode() == ISD::SETCC && "Should be SETCC SDNode here.");
15813 SDValue LHS = N->getOperand(Num: 0);
15814 SDValue RHS = N->getOperand(Num: 1);
15815 SDLoc DL(N);
15816
15817 [[maybe_unused]] ISD::CondCode CC =
15818 cast<CondCodeSDNode>(Val: N->getOperand(Num: 2))->get();
15819 assert((CC == ISD::SETEQ) && "CC must be ISD::SETEQ.");
15820 // Rewrite it as XORI (and X, 1), 1.
15821 auto MakeXor1 = [&](SDValue V) {
15822 EVT VT = V.getValueType();
15823 SDValue One = DAG.getConstant(Val: 1, DL, VT);
15824 SDValue Xor = DAG.getNode(Opcode: ISD::XOR, DL, VT, N1: V, N2: One);
15825 return DAG.getNode(Opcode: ISD::TRUNCATE, DL, VT: MVT::i1, Operand: Xor);
15826 };
15827
15828 if (LHS.getOpcode() == ISD::AND && RHS.getOpcode() != ISD::AND)
15829 return MakeXor1(LHS);
15830
15831 if (RHS.getOpcode() == ISD::AND && LHS.getOpcode() != ISD::AND)
15832 return MakeXor1(RHS);
15833
15834 llvm_unreachable("Should not reach here.");
15835}
15836
// DAG combine for ISD::SETCC: rewrites selected equality patterns into
// cheaper forms (xori, add-against-zero, or a vectorized i128 compare), and
// otherwise defers to DAGCombineTruncBoolExt.
SDValue PPCTargetLowering::combineSetCC(SDNode *N,
                                        DAGCombinerInfo &DCI) const {
  assert(N->getOpcode() == ISD::SETCC &&
         "Should be called with a SETCC node");

  // Check if the pattern (setcc (and X, 1), 0, eq) is present.
  // If it is, rewrite it as XORI (and X, 1), 1.
  if (canConvertSETCCToXori(N))
    return ConvertSETCCToXori(N, DAG&: DCI.DAG);

  ISD::CondCode CC = cast<CondCodeSDNode>(Val: N->getOperand(Num: 2))->get();
  if (CC == ISD::SETNE || CC == ISD::SETEQ) {
    SDValue LHS = N->getOperand(Num: 0);
    SDValue RHS = N->getOperand(Num: 1);

    // If there is a '0 - y' pattern, canonicalize the pattern to the RHS.
    if (LHS.getOpcode() == ISD::SUB && isNullConstant(V: LHS.getOperand(i: 0)) &&
        LHS.hasOneUse())
      std::swap(a&: LHS, b&: RHS);

    // x == 0-y --> x+y == 0
    // x != 0-y --> x+y != 0
    if (RHS.getOpcode() == ISD::SUB && isNullConstant(V: RHS.getOperand(i: 0)) &&
        RHS.hasOneUse()) {
      SDLoc DL(N);
      SelectionDAG &DAG = DCI.DAG;
      EVT VT = N->getValueType(ResNo: 0);
      EVT OpVT = LHS.getValueType();
      SDValue Add = DAG.getNode(Opcode: ISD::ADD, DL, VT: OpVT, N1: LHS, N2: RHS.getOperand(i: 1));
      return DAG.getSetCC(DL, VT, LHS: Add, RHS: DAG.getConstant(Val: 0, DL, VT: OpVT), Cond: CC);
    }

    // Optimization: Fold i128 equality/inequality compares of two loads into a
    // vectorized compare using vcmpequb.p when AltiVec is available.
    //
    // Rationale:
    //   A scalar i128 SETCC (eq/ne) normally lowers to multiple scalar ops.
    //   On subtargets with AltiVec, we can instead reinterpret the i128 loads
    //   as v16i8 vectors and use the AltiVec vcmpequb.p instruction to
    //   perform a full 128-bit equality check in a single vector compare.
    //
    // Example Result:
    //   This transformation replaces memcmp(a, b, 16) with two vector loads
    //   and one vector compare instruction.

    if (Subtarget.hasAltivec() && canConvertToVcmpequb(LHS, RHS))
      return convertTwoLoadsAndCmpToVCMPEQUB(DAG&: DCI.DAG, N, DL: SDLoc(N));
  }

  return DAGCombineTruncBoolExt(N, DCI);
}
15888
15889// Is this an extending load from an f32 to an f64?
15890static bool isFPExtLoad(SDValue Op) {
15891 if (LoadSDNode *LD = dyn_cast<LoadSDNode>(Val: Op.getNode()))
15892 return LD->getExtensionType() == ISD::EXTLOAD &&
15893 Op.getValueType() == MVT::f64;
15894 return false;
15895}
15896
/// Reduces the number of fp-to-int conversions when building a vector.
///
/// If this vector is built out of floating to integer conversions,
/// transform it to a vector built out of floating point values followed by a
/// single floating to integer conversion of the vector.
/// Namely (build_vector (fptosi $A), (fptosi $B), ...)
/// becomes (fptosi (build_vector ($A, $B, ...)))
SDValue PPCTargetLowering::
combineElementTruncationToVectorTruncation(SDNode *N,
                                           DAGCombinerInfo &DCI) const {
  assert(N->getOpcode() == ISD::BUILD_VECTOR &&
         "Should be called with a BUILD_VECTOR node");

  SelectionDAG &DAG = DCI.DAG;
  SDLoc dl(N);

  SDValue FirstInput = N->getOperand(Num: 0);
  assert(FirstInput.getOpcode() == PPCISD::MFVSR &&
         "The input operand must be an fp-to-int conversion.");

  // This combine happens after legalization so the fp_to_[su]i nodes are
  // already converted to PPCISD nodes.
  unsigned FirstConversion = FirstInput.getOperand(i: 0).getOpcode();
  if (FirstConversion == PPCISD::FCTIDZ ||
      FirstConversion == PPCISD::FCTIDUZ ||
      FirstConversion == PPCISD::FCTIWZ ||
      FirstConversion == PPCISD::FCTIWUZ) {
    bool IsSplat = true;
    // FCTIWZ/FCTIWUZ produce 32-bit integers; the 64-bit forms are FCTIDZ
    // and FCTIDUZ.
    bool Is32Bit = FirstConversion == PPCISD::FCTIWZ ||
      FirstConversion == PPCISD::FCTIWUZ;
    EVT SrcVT = FirstInput.getOperand(i: 0).getValueType();
    SmallVector<SDValue, 4> Ops;
    EVT TargetVT = N->getValueType(ResNo: 0);
    // First pass: verify every element is the same kind of conversion and
    // (for 32-bit targets) that each source is an extending f32 load.
    for (int i = 0, e = N->getNumOperands(); i < e; ++i) {
      SDValue NextOp = N->getOperand(Num: i);
      if (NextOp.getOpcode() != PPCISD::MFVSR)
        return SDValue();
      unsigned NextConversion = NextOp.getOperand(i: 0).getOpcode();
      if (NextConversion != FirstConversion)
        return SDValue();
      // If we are converting to 32-bit integers, we need to add an FP_ROUND.
      // This is not valid if the input was originally double precision. It is
      // also not profitable to do unless this is an extending load in which
      // case doing this combine will allow us to combine consecutive loads.
      if (Is32Bit && !isFPExtLoad(Op: NextOp.getOperand(i: 0).getOperand(i: 0)))
        return SDValue();
      if (N->getOperand(Num: i) != FirstInput)
        IsSplat = false;
    }

    // If this is a splat, we leave it as-is since there will be only a single
    // fp-to-int conversion followed by a splat of the integer. This is better
    // for 32-bit and smaller ints and neutral for 64-bit ints.
    if (IsSplat)
      return SDValue();

    // Now that we know we have the right type of node, get its operands
    for (int i = 0, e = N->getNumOperands(); i < e; ++i) {
      SDValue In = N->getOperand(Num: i).getOperand(i: 0);
      if (Is32Bit) {
        // For 32-bit values, we need to add an FP_ROUND node (if we made it
        // here, we know that all inputs are extending loads so this is safe).
        if (In.isUndef())
          Ops.push_back(Elt: DAG.getUNDEF(VT: SrcVT));
        else {
          // The trailing constant 1 marks the FP_ROUND as exact (no
          // information lost) since the source was an extended f32 load.
          SDValue Trunc =
            DAG.getNode(Opcode: ISD::FP_ROUND, DL: dl, VT: MVT::f32, N1: In.getOperand(i: 0),
                        N2: DAG.getIntPtrConstant(Val: 1, DL: dl, /*isTarget=*/true));
          Ops.push_back(Elt: Trunc);
        }
      } else
        Ops.push_back(Elt: In.isUndef() ? DAG.getUNDEF(VT: SrcVT) : In.getOperand(i: 0));
    }

    unsigned Opcode;
    if (FirstConversion == PPCISD::FCTIDZ ||
        FirstConversion == PPCISD::FCTIWZ)
      Opcode = ISD::FP_TO_SINT;
    else
      Opcode = ISD::FP_TO_UINT;

    EVT NewVT = TargetVT == MVT::v2i64 ? MVT::v2f64 : MVT::v4f32;
    SDValue BV = DAG.getBuildVector(VT: NewVT, DL: dl, Ops);
    return DAG.getNode(Opcode, DL: dl, VT: TargetVT, Operand: BV);
  }
  return SDValue();
}
15984
// The LXVKQ instruction loads a VSX vector with a special quadword value
// selected by an immediate. This helper returns the details of a match as a
// tuple of {LXVKQ unsigned IMM value, right-shift amount}, used to generate
// the LXVKQ instruction and the subsequent shift instruction required to
// match the original build-vector pattern.

// LXVKQPattern: {LXVKQ unsigned IMM Value, right_shift_amount}
15992using LXVKQPattern = std::tuple<uint32_t, uint8_t>;
15993
15994static std::optional<LXVKQPattern> getPatternInfo(const APInt &FullVal) {
15995
15996 // LXVKQ instruction loads the Quadword value:
15997 // 0x8000_0000_0000_0000_0000_0000_0000_0000 when imm = 0b10000
15998 static const APInt BasePattern = APInt(128, 0x8000000000000000ULL) << 64;
15999 static const uint32_t Uim = 16;
16000
16001 // Check for direct LXVKQ match (no shift needed)
16002 if (FullVal == BasePattern)
16003 return std::make_tuple(args: Uim, args: uint8_t{0});
16004
16005 // Check if FullValue is 1 (the result of the base pattern >> 127)
16006 if (FullVal == APInt(128, 1))
16007 return std::make_tuple(args: Uim, args: uint8_t{127});
16008
16009 return std::nullopt;
16010}
16011
/// Combine vector loads to a single load (using lxvkq) or splat with shift of a
/// constant (xxspltib + vsrq) by recognising patterns in the Build Vector.
/// The LXVKQ instruction loads a VSX vector with a special quadword value based
/// on an immediate value. If UIM=0b10000 then LXVKQ loads VSR[32×TX+T] with
/// value 0x8000_0000_0000_0000_0000_0000_0000_0000.
/// This can be used to inline the build vector constants that have the
/// following patterns:
///
/// 0x8000_0000_0000_0000_0000_0000_0000_0000 (MSB set pattern)
/// 0x0000_0000_0000_0000_0000_0000_0000_0001 (LSB set pattern)
/// The MSB pattern can be loaded directly using LXVKQ, while the LSB pattern is
/// loaded using a combination of splatting and right shift instructions.
16024
SDValue PPCTargetLowering::combineBVLoadsSpecialValue(SDValue Op,
                                                      SelectionDAG &DAG) const {

  assert((Op.getNode() && Op.getOpcode() == ISD::BUILD_VECTOR) &&
         "Expected a BuildVectorSDNode in combineBVLoadsSpecialValue");

  // This transformation is only supported if we are loading either a byte,
  // halfword, word, or doubleword.
  EVT VT = Op.getValueType();
  if (!(VT == MVT::v8i16 || VT == MVT::v16i8 || VT == MVT::v4i32 ||
        VT == MVT::v2i64))
    return SDValue();

  LLVM_DEBUG(llvm::dbgs() << "\ncombineBVLoadsSpecialValue: Build vector ("
                          << VT.getEVTString() << "): ";
             Op->dump());

  unsigned NumElems = VT.getVectorNumElements();
  unsigned ElemBits = VT.getScalarSizeInBits();

  bool IsLittleEndian = DAG.getDataLayout().isLittleEndian();

  // Check for Non-constant operand in the build vector.
  // This combine only applies to all-constant build vectors.
  for (const SDValue &Operand : Op.getNode()->op_values()) {
    if (!isa<ConstantSDNode>(Val: Operand))
      return SDValue();
  }

  // Assemble build vector operands as a 128-bit register value
  // We need to reconstruct what the 128-bit register pattern would be
  // that produces this vector when interpreted with the current endianness
  APInt FullVal = APInt::getZero(numBits: 128);

  for (unsigned Index = 0; Index < NumElems; ++Index) {
    auto *C = cast<ConstantSDNode>(Val: Op.getOperand(i: Index));

    // Get element value as raw bits (zero-extended)
    uint64_t ElemValue = C->getZExtValue();

    // Mask to element size to ensure we only get the relevant bits
    if (ElemBits < 64)
      ElemValue &= ((1ULL << ElemBits) - 1);

    // Calculate bit position for this element in the 128-bit register.
    // For LE, element 0 occupies the lowest bits; for BE, the highest.
    unsigned BitPos =
        (IsLittleEndian) ? (Index * ElemBits) : (128 - (Index + 1) * ElemBits);

    // Create APInt for the element value and shift it to correct position
    APInt ElemAPInt(128, ElemValue);
    ElemAPInt <<= BitPos;

    // Place the element value at the correct bit position
    FullVal |= ElemAPInt;
  }

  // All-zeros and all-ones constants are not LXVKQ patterns; leave them to
  // other build-vector lowerings.
  if (FullVal.isZero() || FullVal.isAllOnes())
    return SDValue();

  if (auto UIMOpt = getPatternInfo(FullVal)) {
    const auto &[Uim, ShiftAmount] = *UIMOpt;
    SDLoc Dl(Op);

    // Generate LXVKQ instruction if the shift amount is zero.
    if (ShiftAmount == 0) {
      SDValue UimVal = DAG.getTargetConstant(Val: Uim, DL: Dl, VT: MVT::i32);
      SDValue LxvkqInstr =
          SDValue(DAG.getMachineNode(Opcode: PPC::LXVKQ, dl: Dl, VT, Op1: UimVal), 0);
      LLVM_DEBUG(llvm::dbgs()
                     << "combineBVLoadsSpecialValue: Instruction Emitted ";
                 LxvkqInstr.dump());
      return LxvkqInstr;
    }

    assert(ShiftAmount == 127 && "Unexpected lxvkq shift amount value");

    // The right shifted pattern can be constructed using a combination of
    // XXSPLTIB and VSRQ instruction. VSRQ uses the shift amount from the lower
    // 7 bits of byte 15. This can be specified using XXSPLTIB with immediate
    // value 255.
    // Note: the all-ones splat is used both as the value being shifted and as
    // the shift-amount vector (byte 15 = 0xFF supplies a shift of 127), so a
    // single splat suffices.
    SDValue ShiftAmountVec =
        SDValue(DAG.getMachineNode(Opcode: PPC::XXSPLTIB, dl: Dl, VT: MVT::v4i32,
                                   Op1: DAG.getTargetConstant(Val: 255, DL: Dl, VT: MVT::i32)),
                0);
    // Generate appropriate right shift instruction
    SDValue ShiftVec = SDValue(
        DAG.getMachineNode(Opcode: PPC::VSRQ, dl: Dl, VT, Op1: ShiftAmountVec, Op2: ShiftAmountVec),
        0);
    LLVM_DEBUG(llvm::dbgs()
                   << "\n combineBVLoadsSpecialValue: Instruction Emitted ";
               ShiftVec.dump());
    return ShiftVec;
  }
  // No patterns matched for build vectors.
  return SDValue();
}
16120
16121/// Reduce the number of loads when building a vector.
16122///
16123/// Building a vector out of multiple loads can be converted to a load
16124/// of the vector type if the loads are consecutive. If the loads are
16125/// consecutive but in descending order, a shuffle is added at the end
16126/// to reorder the vector.
static SDValue combineBVOfConsecutiveLoads(SDNode *N, SelectionDAG &DAG) {
  assert(N->getOpcode() == ISD::BUILD_VECTOR &&
         "Should be called with a BUILD_VECTOR node");

  SDLoc dl(N);

  // Return early for non byte-sized type, as they can't be consecutive.
  if (!N->getValueType(ResNo: 0).getVectorElementType().isByteSized())
    return SDValue();

  bool InputsAreConsecutiveLoads = true;
  bool InputsAreReverseConsecutive = true;
  unsigned ElemSize = N->getValueType(ResNo: 0).getScalarType().getStoreSize();
  SDValue FirstInput = N->getOperand(Num: 0);
  bool IsRoundOfExtLoad = false;
  LoadSDNode *FirstLoad = nullptr;

  // Detect the fp_round(extload) form; if present, all other operands must
  // match it (checked in the loop below).
  if (FirstInput.getOpcode() == ISD::FP_ROUND &&
      FirstInput.getOperand(i: 0).getOpcode() == ISD::LOAD) {
    FirstLoad = cast<LoadSDNode>(Val: FirstInput.getOperand(i: 0));
    IsRoundOfExtLoad = FirstLoad->getExtensionType() == ISD::EXTLOAD;
  }
  // Not a build vector of (possibly fp_rounded) loads.
  if ((!IsRoundOfExtLoad && FirstInput.getOpcode() != ISD::LOAD) ||
      N->getNumOperands() == 1)
    return SDValue();

  if (!IsRoundOfExtLoad)
    FirstLoad = cast<LoadSDNode>(Val&: FirstInput);

  // Collect every input load so the new wide load can be chained after them.
  SmallVector<LoadSDNode *, 4> InputLoads;
  InputLoads.push_back(Elt: FirstLoad);
  for (int i = 1, e = N->getNumOperands(); i < e; ++i) {
    // If any inputs are fp_round(extload), they all must be.
    if (IsRoundOfExtLoad && N->getOperand(Num: i).getOpcode() != ISD::FP_ROUND)
      return SDValue();

    SDValue NextInput = IsRoundOfExtLoad ? N->getOperand(Num: i).getOperand(i: 0) :
                                           N->getOperand(Num: i);
    if (NextInput.getOpcode() != ISD::LOAD)
      return SDValue();

    SDValue PreviousInput =
        IsRoundOfExtLoad ? N->getOperand(Num: i-1).getOperand(i: 0) : N->getOperand(Num: i-1);
    LoadSDNode *LD1 = cast<LoadSDNode>(Val&: PreviousInput);
    LoadSDNode *LD2 = cast<LoadSDNode>(Val&: NextInput);

    // If any inputs are fp_round(extload), they all must be.
    if (IsRoundOfExtLoad && LD2->getExtensionType() != ISD::EXTLOAD)
      return SDValue();

    // We only care about regular loads. The PPC-specific load intrinsics
    // will not lead to a merge opportunity.
    if (!DAG.areNonVolatileConsecutiveLoads(LD: LD2, Base: LD1, Bytes: ElemSize, Dist: 1))
      InputsAreConsecutiveLoads = false;
    if (!DAG.areNonVolatileConsecutiveLoads(LD: LD1, Base: LD2, Bytes: ElemSize, Dist: 1))
      InputsAreReverseConsecutive = false;

    // Exit early if the loads are neither consecutive nor reverse consecutive.
    if (!InputsAreConsecutiveLoads && !InputsAreReverseConsecutive)
      return SDValue();
    InputLoads.push_back(Elt: LD2);
  }

  assert(!(InputsAreConsecutiveLoads && InputsAreReverseConsecutive) &&
         "The loads cannot be both consecutive and reverse consecutive.");

  SDValue WideLoad;
  SDValue ReturnSDVal;
  if (InputsAreConsecutiveLoads) {
    // Ascending order: one wide load from the first element's address.
    assert(FirstLoad && "Input needs to be a LoadSDNode.");
    WideLoad = DAG.getLoad(VT: N->getValueType(ResNo: 0), dl, Chain: FirstLoad->getChain(),
                           Ptr: FirstLoad->getBasePtr(), PtrInfo: FirstLoad->getPointerInfo(),
                           Alignment: FirstLoad->getAlign());
    ReturnSDVal = WideLoad;
  } else if (InputsAreReverseConsecutive) {
    // Descending order: wide load from the last element's address (the lowest
    // one), then reverse the elements with a shuffle.
    LoadSDNode *LastLoad = InputLoads.back();
    assert(LastLoad && "Input needs to be a LoadSDNode.");
    WideLoad = DAG.getLoad(VT: N->getValueType(ResNo: 0), dl, Chain: LastLoad->getChain(),
                           Ptr: LastLoad->getBasePtr(), PtrInfo: LastLoad->getPointerInfo(),
                           Alignment: LastLoad->getAlign());
    // Reversal mask <N-1, N-2, ..., 0>.
    SmallVector<int, 16> Ops;
    for (int i = N->getNumOperands() - 1; i >= 0; i--)
      Ops.push_back(Elt: i);

    ReturnSDVal = DAG.getVectorShuffle(VT: N->getValueType(ResNo: 0), dl, N1: WideLoad,
                                       N2: DAG.getUNDEF(VT: N->getValueType(ResNo: 0)), Mask: Ops);
  } else
    return SDValue();

  // Preserve the memory ordering of the replaced narrow loads.
  for (auto *LD : InputLoads)
    DAG.makeEquivalentMemoryOrdering(OldLoad: LD, NewMemOp: WideLoad);
  return ReturnSDVal;
}
16221
16222// This function adds the required vector_shuffle needed to get
16223// the elements of the vector extract in the correct position
16224// as specified by the CorrectElems encoding.
static SDValue addShuffleForVecExtend(SDNode *N, SelectionDAG &DAG,
                                      SDValue Input, uint64_t Elems,
                                      uint64_t CorrectElems) {
  SDLoc dl(N);

  unsigned NumElems = Input.getValueType().getVectorNumElements();
  SmallVector<int, 16> ShuffleMask(NumElems, -1);

  // Knowing the element indices being extracted from the original
  // vector and the order in which they're being inserted, just put
  // them at element indices required for the instruction.
  // Both Elems and CorrectElems pack one index per byte; on LE the index is
  // in the low nibble, on BE in the high nibble (see combineBVOfVecSExt).
  for (unsigned i = 0; i < N->getNumOperands(); i++) {
    if (DAG.getDataLayout().isLittleEndian())
      ShuffleMask[CorrectElems & 0xF] = Elems & 0xF;
    else
      ShuffleMask[(CorrectElems & 0xF0) >> 4] = (Elems & 0xF0) >> 4;
    // Advance to the next packed byte in both encodings.
    CorrectElems = CorrectElems >> 8;
    Elems = Elems >> 8;
  }

  SDValue Shuffle =
      DAG.getVectorShuffle(VT: Input.getValueType(), dl, N1: Input,
                           N2: DAG.getUNDEF(VT: Input.getValueType()), Mask: ShuffleMask);

  EVT VT = N->getValueType(ResNo: 0);
  SDValue Conv = DAG.getBitcast(VT, V: Shuffle);

  // Sign-extend in-register from the narrow element type of the input vector
  // to the element type of the build vector result.
  EVT ExtVT = EVT::getVectorVT(Context&: *DAG.getContext(),
                               VT: Input.getValueType().getVectorElementType(),
                               NumElements: VT.getVectorNumElements());
  return DAG.getNode(Opcode: ISD::SIGN_EXTEND_INREG, DL: dl, VT, N1: Conv,
                     N2: DAG.getValueType(ExtVT));
}
16258
16259// Look for build vector patterns where input operands come from sign
16260// extended vector_extract elements of specific indices. If the correct indices
16261// aren't used, add a vector shuffle to fix up the indices and create
16262// SIGN_EXTEND_INREG node which selects the vector sign extend instructions
16263// during instruction selection.
static SDValue combineBVOfVecSExt(SDNode *N, SelectionDAG &DAG) {
  // This array encodes the indices that the vector sign extend instructions
  // extract from when extending from one type to another for both BE and LE.
  // The right nibble of each byte corresponds to the LE indices.
  // and the left nibble of each byte corresponds to the BE indices.
  // For example: 0x3074B8FC byte->word
  // For LE: the allowed indices are: 0x0,0x4,0x8,0xC
  // For BE: the allowed indices are: 0x3,0x7,0xB,0xF
  // For example: 0x000070F8 byte->double word
  // For LE: the allowed indices are: 0x0,0x8
  // For BE: the allowed indices are: 0x7,0xF
  uint64_t TargetElems[] = {
      0x3074B8FC, // b->w
      0x000070F8, // b->d
      0x10325476, // h->w
      0x00003074, // h->d
      0x00001032, // w->d
  };

  uint64_t Elems = 0;
  int Index;
  SDValue Input;

  // Returns true if Op is a sign-extended extract_vector_elt from the common
  // Input vector; accumulates the extract index into Elems (one byte per
  // operand, using the LE/BE nibble encoding described above).
  auto isSExtOfVecExtract = [&](SDValue Op) -> bool {
    if (!Op)
      return false;
    if (Op.getOpcode() != ISD::SIGN_EXTEND &&
        Op.getOpcode() != ISD::SIGN_EXTEND_INREG)
      return false;

    // A SIGN_EXTEND_INREG might be fed by an ANY_EXTEND to produce a value
    // of the right width.
    SDValue Extract = Op.getOperand(i: 0);
    if (Extract.getOpcode() == ISD::ANY_EXTEND)
      Extract = Extract.getOperand(i: 0);
    if (Extract.getOpcode() != ISD::EXTRACT_VECTOR_ELT)
      return false;

    // Only constant extract indices can be matched against the table.
    ConstantSDNode *ExtOp = dyn_cast<ConstantSDNode>(Val: Extract.getOperand(i: 1));
    if (!ExtOp)
      return false;

    Index = ExtOp->getZExtValue();
    if (Input && Input != Extract.getOperand(i: 0))
      return false;

    if (!Input)
      Input = Extract.getOperand(i: 0);

    // Pack this index into the next byte: LE uses the low nibble, BE the
    // high nibble, matching the TargetElems encoding.
    Elems = Elems << 8;
    Index = DAG.getDataLayout().isLittleEndian() ? Index : Index << 4;
    Elems |= Index;

    return true;
  };

  // If the build vector operands aren't sign extended vector extracts,
  // of the same input vector, then return.
  for (unsigned i = 0; i < N->getNumOperands(); i++) {
    if (!isSExtOfVecExtract(N->getOperand(Num: i))) {
      return SDValue();
    }
  }

  // If the vector extract indices are not correct, add the appropriate
  // vector_shuffle.
  // Select the table row by the combined input/output element widths
  // (e.g. 8 + 32 == 40 is byte->word).
  int TgtElemArrayIdx;
  int InputSize = Input.getValueType().getScalarSizeInBits();
  int OutputSize = N->getValueType(ResNo: 0).getScalarSizeInBits();
  if (InputSize + OutputSize == 40)
    TgtElemArrayIdx = 0;
  else if (InputSize + OutputSize == 72)
    TgtElemArrayIdx = 1;
  else if (InputSize + OutputSize == 48)
    TgtElemArrayIdx = 2;
  else if (InputSize + OutputSize == 80)
    TgtElemArrayIdx = 3;
  else if (InputSize + OutputSize == 96)
    TgtElemArrayIdx = 4;
  else
    return SDValue();

  // Keep only the nibbles relevant to the current endianness before
  // comparing against the accumulated indices.
  uint64_t CorrectElems = TargetElems[TgtElemArrayIdx];
  CorrectElems = DAG.getDataLayout().isLittleEndian()
                     ? CorrectElems & 0x0F0F0F0F0F0F0F0F
                     : CorrectElems & 0xF0F0F0F0F0F0F0F0;
  if (Elems != CorrectElems) {
    return addShuffleForVecExtend(N, DAG, Input, Elems, CorrectElems);
  }

  // Regular lowering will catch cases where a shuffle is not needed.
  return SDValue();
}
16357
16358// Look for the pattern of a load from a narrow width to i128, feeding
16359// into a BUILD_VECTOR of v1i128. Replace this sequence with a PPCISD node
16360// (LXVRZX). This node represents a zero extending load that will be matched
16361// to the Load VSX Vector Rightmost instructions.
static SDValue combineBVZEXTLOAD(SDNode *N, SelectionDAG &DAG) {
  SDLoc DL(N);

  // This combine is only eligible for a BUILD_VECTOR of v1i128.
  if (N->getValueType(ResNo: 0) != MVT::v1i128)
    return SDValue();

  SDValue Operand = N->getOperand(Num: 0);
  // Proceed with the transformation if the operand to the BUILD_VECTOR
  // is a load instruction.
  if (Operand.getOpcode() != ISD::LOAD)
    return SDValue();

  auto *LD = cast<LoadSDNode>(Val&: Operand);
  EVT MemoryType = LD->getMemoryVT();

  // This transformation is only valid if we are loading either a byte,
  // halfword, word, or doubleword.
  bool ValidLDType = MemoryType == MVT::i8 || MemoryType == MVT::i16 ||
                     MemoryType == MVT::i32 || MemoryType == MVT::i64;

  // Ensure that the load from the narrow width is being zero extended to i128.
  // EXTLOAD is acceptable here since the high bits are unspecified and the
  // zero-extending LXVRZX matches that contract.
  if (!ValidLDType ||
      (LD->getExtensionType() != ISD::ZEXTLOAD &&
       LD->getExtensionType() != ISD::EXTLOAD))
    return SDValue();

  // Operands: chain, base pointer, and the memory width in bits.
  SDValue LoadOps[] = {
      LD->getChain(), LD->getBasePtr(),
      DAG.getIntPtrConstant(Val: MemoryType.getScalarSizeInBits(), DL)};

  return DAG.getMemIntrinsicNode(Opcode: PPCISD::LXVRZX, dl: DL,
                                 VTList: DAG.getVTList(VT1: MVT::v1i128, VT2: MVT::Other),
                                 Ops: LoadOps, MemVT: MemoryType, MMO: LD->getMemOperand());
}
16397
// DAG combine entry point for ISD::BUILD_VECTOR. Tries, in order: turning a
// vector of float->int conversions into a vector conversion, merging
// consecutive loads into one wide load, P9 vector sign-extension patterns,
// P10 rightmost-element loads, and finally the v2f64 [su]int_to_fp(extractelt)
// pattern. Requires VSX.
SDValue PPCTargetLowering::DAGCombineBuildVector(SDNode *N,
                                                 DAGCombinerInfo &DCI) const {
  assert(N->getOpcode() == ISD::BUILD_VECTOR &&
         "Should be called with a BUILD_VECTOR node");

  SelectionDAG &DAG = DCI.DAG;
  SDLoc dl(N);

  if (!Subtarget.hasVSX())
    return SDValue();

  // The target independent DAG combiner will leave a build_vector of
  // float-to-int conversions intact. We can generate MUCH better code for
  // a float-to-int conversion of a vector of floats.
  SDValue FirstInput = N->getOperand(Num: 0);
  if (FirstInput.getOpcode() == PPCISD::MFVSR) {
    SDValue Reduced = combineElementTruncationToVectorTruncation(N, DCI);
    if (Reduced)
      return Reduced;
  }

  // If we're building a vector out of consecutive loads, just load that
  // vector type.
  SDValue Reduced = combineBVOfConsecutiveLoads(N, DAG);
  if (Reduced)
    return Reduced;

  // If we're building a vector out of extended elements from another vector
  // we have P9 vector integer extend instructions. The code assumes legal
  // input types (i.e. it can't handle things like v4i16) so do not run before
  // legalization.
  if (Subtarget.hasP9Altivec() && !DCI.isBeforeLegalize()) {
    Reduced = combineBVOfVecSExt(N, DAG);
    if (Reduced)
      return Reduced;
  }

  // On Power10, the Load VSX Vector Rightmost instructions can be utilized
  // if this is a BUILD_VECTOR of v1i128, and if the operand to the BUILD_VECTOR
  // is a load from <valid narrow width> to i128.
  if (Subtarget.isISA3_1()) {
    SDValue BVOfZLoad = combineBVZEXTLOAD(N, DAG);
    if (BVOfZLoad)
      return BVOfZLoad;
  }

  // The remaining pattern only produces v2f64 results.
  if (N->getValueType(ResNo: 0) != MVT::v2f64)
    return SDValue();

  // Looking for:
  // (build_vector ([su]int_to_fp (extractelt 0)), [su]int_to_fp (extractelt 1))
  if (FirstInput.getOpcode() != ISD::SINT_TO_FP &&
      FirstInput.getOpcode() != ISD::UINT_TO_FP)
    return SDValue();
  if (N->getOperand(Num: 1).getOpcode() != ISD::SINT_TO_FP &&
      N->getOperand(Num: 1).getOpcode() != ISD::UINT_TO_FP)
    return SDValue();
  // Both conversions must have the same signedness.
  if (FirstInput.getOpcode() != N->getOperand(Num: 1).getOpcode())
    return SDValue();

  SDValue Ext1 = FirstInput.getOperand(i: 0);
  SDValue Ext2 = N->getOperand(Num: 1).getOperand(i: 0);
  if(Ext1.getOpcode() != ISD::EXTRACT_VECTOR_ELT ||
     Ext2.getOpcode() != ISD::EXTRACT_VECTOR_ELT)
    return SDValue();

  // Both extracts must use constant indices from the same v4i32 source.
  ConstantSDNode *Ext1Op = dyn_cast<ConstantSDNode>(Val: Ext1.getOperand(i: 1));
  ConstantSDNode *Ext2Op = dyn_cast<ConstantSDNode>(Val: Ext2.getOperand(i: 1));
  if (!Ext1Op || !Ext2Op)
    return SDValue();
  if (Ext1.getOperand(i: 0).getValueType() != MVT::v4i32 ||
      Ext1.getOperand(i: 0) != Ext2.getOperand(i: 0))
    return SDValue();

  // Only adjacent element pairs {0,1} or {2,3} map to a subvector half; the
  // half index depends on endianness.
  int FirstElem = Ext1Op->getZExtValue();
  int SecondElem = Ext2Op->getZExtValue();
  int SubvecIdx;
  if (FirstElem == 0 && SecondElem == 1)
    SubvecIdx = Subtarget.isLittleEndian() ? 1 : 0;
  else if (FirstElem == 2 && SecondElem == 3)
    SubvecIdx = Subtarget.isLittleEndian() ? 0 : 1;
  else
    return SDValue();

  SDValue SrcVec = Ext1.getOperand(i: 0);
  auto NodeType = (N->getOperand(Num: 1).getOpcode() == ISD::SINT_TO_FP) ?
    PPCISD::SINT_VEC_TO_FP : PPCISD::UINT_VEC_TO_FP;
  return DAG.getNode(Opcode: NodeType, DL: dl, VT: MVT::v2f64,
                     N1: SrcVec, N2: DAG.getIntPtrConstant(Val: SubvecIdx, DL: dl));
}
16488
// DAG combine for [SU]INT_TO_FP. Handles two patterns: (1) direct conversion
// of sub-word (i8/i16) loads via LXSIZX + FCFID* on P9, and (2) eliding the
// store/load round trip in fp -> int -> fp sequences via FCTID*Z + FCFID*.
SDValue PPCTargetLowering::combineFPToIntToFP(SDNode *N,
                                              DAGCombinerInfo &DCI) const {
  assert((N->getOpcode() == ISD::SINT_TO_FP ||
          N->getOpcode() == ISD::UINT_TO_FP) &&
         "Need an int -> FP conversion node here");

  if (useSoftFloat() || !Subtarget.has64BitSupport())
    return SDValue();

  SelectionDAG &DAG = DCI.DAG;
  SDLoc dl(N);
  SDValue Op(N, 0);

  // Don't handle ppc_fp128 here or conversions that are out-of-range capable
  // from the hardware.
  if (Op.getValueType() != MVT::f32 && Op.getValueType() != MVT::f64)
    return SDValue();
  if (!Op.getOperand(i: 0).getValueType().isSimple())
    return SDValue();
  // Only integer sources wider than i1 and no wider than i64 are supported.
  if (Op.getOperand(i: 0).getValueType().getSimpleVT() <= MVT(MVT::i1) ||
      Op.getOperand(i: 0).getValueType().getSimpleVT() > MVT(MVT::i64))
    return SDValue();

  SDValue FirstOperand(Op.getOperand(i: 0));
  bool SubWordLoad = FirstOperand.getOpcode() == ISD::LOAD &&
    (FirstOperand.getValueType() == MVT::i8 ||
     FirstOperand.getValueType() == MVT::i16);
  if (Subtarget.hasP9Vector() && Subtarget.hasP9Altivec() && SubWordLoad) {
    bool Signed = N->getOpcode() == ISD::SINT_TO_FP;
    bool DstDouble = Op.getValueType() == MVT::f64;
    unsigned ConvOp = Signed ?
      (DstDouble ? PPCISD::FCFID : PPCISD::FCFIDS) :
      (DstDouble ? PPCISD::FCFIDU : PPCISD::FCFIDUS);
    // Number of bytes to load: 1 for i8, 2 for i16.
    SDValue WidthConst =
      DAG.getIntPtrConstant(Val: FirstOperand.getValueType() == MVT::i8 ? 1 : 2,
                            DL: dl, isTarget: false);
    LoadSDNode *LDN = cast<LoadSDNode>(Val: FirstOperand.getNode());
    SDValue Ops[] = { LDN->getChain(), LDN->getBasePtr(), WidthConst };
    SDValue Ld = DAG.getMemIntrinsicNode(Opcode: PPCISD::LXSIZX, dl,
                                         VTList: DAG.getVTList(VT1: MVT::f64, VT2: MVT::Other),
                                         Ops, MemVT: MVT::i8, MMO: LDN->getMemOperand());
    DAG.makeEquivalentMemoryOrdering(OldLoad: LDN, NewMemOp: Ld);

    // For signed conversion, we need to sign-extend the value in the VSR
    if (Signed) {
      SDValue ExtOps[] = { Ld, WidthConst };
      SDValue Ext = DAG.getNode(Opcode: PPCISD::VEXTS, DL: dl, VT: MVT::f64, Ops: ExtOps);
      return DAG.getNode(Opcode: ConvOp, DL: dl, VT: DstDouble ? MVT::f64 : MVT::f32, Operand: Ext);
    } else
      return DAG.getNode(Opcode: ConvOp, DL: dl, VT: DstDouble ? MVT::f64 : MVT::f32, Operand: Ld);
  }


  // For i32 intermediate values, unfortunately, the conversion functions
  // leave the upper 32 bits of the value undefined. Within the set of
  // scalar instructions, we have no method for zero- or sign-extending the
  // value. Thus, we cannot handle i32 intermediate values here.
  if (Op.getOperand(i: 0).getValueType() == MVT::i32)
    return SDValue();

  assert((Op.getOpcode() == ISD::SINT_TO_FP || Subtarget.hasFPCVT()) &&
         "UINT_TO_FP is supported only with FPCVT");

  // If we have FCFIDS, then use it when converting to single-precision.
  // Otherwise, convert to double-precision and then round.
  unsigned FCFOp = (Subtarget.hasFPCVT() && Op.getValueType() == MVT::f32)
                       ? (Op.getOpcode() == ISD::UINT_TO_FP ? PPCISD::FCFIDUS
                                                            : PPCISD::FCFIDS)
                       : (Op.getOpcode() == ISD::UINT_TO_FP ? PPCISD::FCFIDU
                                                            : PPCISD::FCFID);
  MVT FCFTy = (Subtarget.hasFPCVT() && Op.getValueType() == MVT::f32)
                  ? MVT::f32
                  : MVT::f64;

  // If we're converting from a float, to an int, and back to a float again,
  // then we don't need the store/load pair at all.
  if ((Op.getOperand(i: 0).getOpcode() == ISD::FP_TO_UINT &&
       Subtarget.hasFPCVT()) ||
      (Op.getOperand(i: 0).getOpcode() == ISD::FP_TO_SINT)) {
    SDValue Src = Op.getOperand(i: 0).getOperand(i: 0);
    if (Src.getValueType() == MVT::f32) {
      Src = DAG.getNode(Opcode: ISD::FP_EXTEND, DL: dl, VT: MVT::f64, Operand: Src);
      DCI.AddToWorklist(N: Src.getNode());
    } else if (Src.getValueType() != MVT::f64) {
      // Make sure that we don't pick up a ppc_fp128 source value.
      return SDValue();
    }

    unsigned FCTOp =
      Op.getOperand(i: 0).getOpcode() == ISD::FP_TO_SINT ? PPCISD::FCTIDZ :
                                                         PPCISD::FCTIDUZ;

    SDValue Tmp = DAG.getNode(Opcode: FCTOp, DL: dl, VT: MVT::f64, Operand: Src);
    SDValue FP = DAG.getNode(Opcode: FCFOp, DL: dl, VT: FCFTy, Operand: Tmp);

    // Without FCFIDS we converted in double precision; round down to f32.
    if (Op.getValueType() == MVT::f32 && !Subtarget.hasFPCVT()) {
      FP = DAG.getNode(Opcode: ISD::FP_ROUND, DL: dl, VT: MVT::f32, N1: FP,
                       N2: DAG.getIntPtrConstant(Val: 0, DL: dl, /*isTarget=*/true));
      DCI.AddToWorklist(N: FP.getNode());
    }

    return FP;
  }

  return SDValue();
}
16595
16596// expandVSXLoadForLE - Convert VSX loads (which may be intrinsics for
16597// builtins) into loads with swaps.
SDValue PPCTargetLowering::expandVSXLoadForLE(SDNode *N,
                                              DAGCombinerInfo &DCI) const {
  // Delay VSX load for LE combine until after LegalizeOps to prioritize other
  // load combines.
  if (DCI.isBeforeLegalizeOps())
    return SDValue();

  SelectionDAG &DAG = DCI.DAG;
  SDLoc dl(N);
  SDValue Chain;
  SDValue Base;
  MachineMemOperand *MMO;

  switch (N->getOpcode()) {
  default:
    llvm_unreachable("Unexpected opcode for little endian VSX load");
  case ISD::LOAD: {
    LoadSDNode *LD = cast<LoadSDNode>(Val: N);
    Chain = LD->getChain();
    Base = LD->getBasePtr();
    MMO = LD->getMemOperand();
    // If the MMO suggests this isn't a load of a full vector, leave
    // things alone. For a built-in, we have to make the change for
    // correctness, so if there is a size problem that will be a bug.
    if (!MMO->getSize().hasValue() || MMO->getSize().getValue() < 16)
      return SDValue();
    break;
  }
  case ISD::INTRINSIC_W_CHAIN: {
    MemIntrinsicSDNode *Intrin = cast<MemIntrinsicSDNode>(Val: N);
    Chain = Intrin->getChain();
    // Similarly to the store case below, Intrin->getBasePtr() doesn't get
    // us what we want. Get operand 2 instead.
    Base = Intrin->getOperand(Num: 2);
    MMO = Intrin->getMemOperand();
    break;
  }
  }

  MVT VecTy = N->getValueType(ResNo: 0).getSimpleVT();

  // Load as v2f64 via lxvd2x, then swap the two doublewords.
  SDValue LoadOps[] = { Chain, Base };
  SDValue Load = DAG.getMemIntrinsicNode(Opcode: PPCISD::LXVD2X, dl,
                                         VTList: DAG.getVTList(VT1: MVT::v2f64, VT2: MVT::Other),
                                         Ops: LoadOps, MemVT: MVT::v2f64, MMO);

  DCI.AddToWorklist(N: Load.getNode());
  Chain = Load.getValue(R: 1);
  SDValue Swap = DAG.getNode(
      Opcode: PPCISD::XXSWAPD, DL: dl, VTList: DAG.getVTList(VT1: MVT::v2f64, VT2: MVT::Other), N1: Chain, N2: Load);
  DCI.AddToWorklist(N: Swap.getNode());

  // Add a bitcast if the resulting load type doesn't match v2f64.
  if (VecTy != MVT::v2f64) {
    SDValue N = DAG.getNode(Opcode: ISD::BITCAST, DL: dl, VT: VecTy, Operand: Swap);
    DCI.AddToWorklist(N: N.getNode());
    // Package {bitcast value, swap's chain} to match Load's shape.
    return DAG.getNode(Opcode: ISD::MERGE_VALUES, DL: dl, VTList: DAG.getVTList(VT1: VecTy, VT2: MVT::Other),
                       N1: N, N2: Swap.getValue(R: 1));
  }

  return Swap;
}
16661
16662// expandVSXStoreForLE - Convert VSX stores (which may be intrinsics for
16663// builtins) into stores with swaps.
SDValue PPCTargetLowering::expandVSXStoreForLE(SDNode *N,
                                               DAGCombinerInfo &DCI) const {
  // Delay VSX store for LE combine until after LegalizeOps to prioritize other
  // store combines.
  if (DCI.isBeforeLegalizeOps())
    return SDValue();

  SelectionDAG &DAG = DCI.DAG;
  SDLoc dl(N);
  SDValue Chain;
  SDValue Base;
  unsigned SrcOpnd;
  MachineMemOperand *MMO;

  switch (N->getOpcode()) {
  default:
    llvm_unreachable("Unexpected opcode for little endian VSX store");
  case ISD::STORE: {
    StoreSDNode *ST = cast<StoreSDNode>(Val: N);
    Chain = ST->getChain();
    Base = ST->getBasePtr();
    MMO = ST->getMemOperand();
    SrcOpnd = 1;
    // If the MMO suggests this isn't a store of a full vector, leave
    // things alone. For a built-in, we have to make the change for
    // correctness, so if there is a size problem that will be a bug.
    if (!MMO->getSize().hasValue() || MMO->getSize().getValue() < 16)
      return SDValue();
    break;
  }
  case ISD::INTRINSIC_VOID: {
    MemIntrinsicSDNode *Intrin = cast<MemIntrinsicSDNode>(Val: N);
    Chain = Intrin->getChain();
    // Intrin->getBasePtr() oddly does not get what we want.
    Base = Intrin->getOperand(Num: 3);
    MMO = Intrin->getMemOperand();
    SrcOpnd = 2;
    break;
  }
  }

  SDValue Src = N->getOperand(Num: SrcOpnd);
  MVT VecTy = Src.getValueType().getSimpleVT();

  // All stores are done as v2f64 and possible bit cast.
  if (VecTy != MVT::v2f64) {
    Src = DAG.getNode(Opcode: ISD::BITCAST, DL: dl, VT: MVT::v2f64, Operand: Src);
    DCI.AddToWorklist(N: Src.getNode());
  }

  // Swap the doublewords, then store via stxvd2x.
  SDValue Swap = DAG.getNode(Opcode: PPCISD::XXSWAPD, DL: dl,
                             VTList: DAG.getVTList(VT1: MVT::v2f64, VT2: MVT::Other), N1: Chain, N2: Src);
  DCI.AddToWorklist(N: Swap.getNode());
  Chain = Swap.getValue(R: 1);
  SDValue StoreOps[] = { Chain, Swap, Base };
  SDValue Store = DAG.getMemIntrinsicNode(Opcode: PPCISD::STXVD2X, dl,
                                          VTList: DAG.getVTList(VT: MVT::Other),
                                          Ops: StoreOps, MemVT: VecTy, MMO);
  DCI.AddToWorklist(N: Store.getNode());
  return Store;
}
16725
16726// Handle DAG combine for STORE (FP_TO_INT F).
SDValue PPCTargetLowering::combineStoreFPToInt(SDNode *N,
                                               DAGCombinerInfo &DCI) const {
  SelectionDAG &DAG = DCI.DAG;
  SDLoc dl(N);
  unsigned Opcode = N->getOperand(Num: 1).getOpcode();
  (void)Opcode;
  bool Strict = N->getOperand(Num: 1)->isStrictFPOpcode();

  assert((Opcode == ISD::FP_TO_SINT || Opcode == ISD::FP_TO_UINT ||
          Opcode == ISD::STRICT_FP_TO_SINT || Opcode == ISD::STRICT_FP_TO_UINT)
         && "Not a FP_TO_INT Instruction!");

  // For strict nodes, operand 0 is the chain; the FP value is operand 1.
  SDValue Val = N->getOperand(Num: 1).getOperand(i: Strict ? 1 : 0);
  EVT Op1VT = N->getOperand(Num: 1).getValueType();
  EVT ResVT = Val.getValueType();

  if (!Subtarget.hasVSX() || !Subtarget.hasFPCVT() || !isTypeLegal(VT: ResVT))
    return SDValue();

  // Only perform combine for conversion to i64/i32 or power9 i16/i8.
  bool ValidTypeForStoreFltAsInt =
        (Op1VT == MVT::i32 || (Op1VT == MVT::i64 && Subtarget.isPPC64()) ||
         (Subtarget.hasP9Vector() && (Op1VT == MVT::i16 || Op1VT == MVT::i8)));

  // TODO: Lower conversion from f128 on all VSX targets
  if (ResVT == MVT::ppcf128 || (ResVT == MVT::f128 && !Subtarget.hasP9Vector()))
    return SDValue();

  if ((Op1VT != MVT::i64 && !Subtarget.hasP8Vector()) ||
      cast<StoreSDNode>(Val: N)->isTruncatingStore() || !ValidTypeForStoreFltAsInt)
    return SDValue();

  Val = convertFPToInt(Op: N->getOperand(Num: 1), DAG, Subtarget);

  // Set number of bytes being converted.
  unsigned ByteSize = Op1VT.getScalarSizeInBits() / 8;
  // Operands: store chain, converted value, address, byte count, and the
  // original integer type being stored.
  SDValue Ops[] = {N->getOperand(Num: 0), Val, N->getOperand(Num: 2),
                   DAG.getIntPtrConstant(Val: ByteSize, DL: dl, isTarget: false),
                   DAG.getValueType(Op1VT)};

  Val = DAG.getMemIntrinsicNode(Opcode: PPCISD::ST_VSR_SCAL_INT, dl,
                                VTList: DAG.getVTList(VT: MVT::Other), Ops,
                                MemVT: cast<StoreSDNode>(Val: N)->getMemoryVT(),
                                MMO: cast<StoreSDNode>(Val: N)->getMemOperand());

  return Val;
}
16774
16775static bool isAlternatingShuffMask(const ArrayRef<int> &Mask, int NumElts) {
16776 // Check that the source of the element keeps flipping
16777 // (i.e. Mask[i] < NumElts -> Mask[i+i] >= NumElts).
16778 bool PrevElemFromFirstVec = Mask[0] < NumElts;
16779 for (int i = 1, e = Mask.size(); i < e; i++) {
16780 if (PrevElemFromFirstVec && Mask[i] < NumElts)
16781 return false;
16782 if (!PrevElemFromFirstVec && Mask[i] >= NumElts)
16783 return false;
16784 PrevElemFromFirstVec = !PrevElemFromFirstVec;
16785 }
16786 return true;
16787}
16788
16789static bool isSplatBV(SDValue Op) {
16790 if (Op.getOpcode() != ISD::BUILD_VECTOR)
16791 return false;
16792 SDValue FirstOp;
16793
16794 // Find first non-undef input.
16795 for (int i = 0, e = Op.getNumOperands(); i < e; i++) {
16796 FirstOp = Op.getOperand(i);
16797 if (!FirstOp.isUndef())
16798 break;
16799 }
16800
16801 // All inputs are undef or the same as the first non-undef input.
16802 for (int i = 1, e = Op.getNumOperands(); i < e; i++)
16803 if (Op.getOperand(i) != FirstOp && !Op.getOperand(i).isUndef())
16804 return false;
16805 return true;
16806}
16807
16808static SDValue isScalarToVec(SDValue Op) {
16809 if (Op.getOpcode() == ISD::SCALAR_TO_VECTOR)
16810 return Op;
16811 if (Op.getOpcode() != ISD::BITCAST)
16812 return SDValue();
16813 Op = Op.getOperand(i: 0);
16814 if (Op.getOpcode() == ISD::SCALAR_TO_VECTOR)
16815 return Op;
16816 return SDValue();
16817}
16818
16819// Fix up the shuffle mask to account for the fact that the result of
16820// scalar_to_vector is not in lane zero. This just takes all values in
16821// the ranges specified by the min/max indices and adds the number of
16822// elements required to ensure each element comes from the respective
16823// position in the valid lane.
16824// On little endian, that's just the corresponding element in the other
16825// half of the vector. On big endian, it is in the same half but right
16826// justified rather than left justified in that half.
16827static void fixupShuffleMaskForPermutedSToV(
16828 SmallVectorImpl<int> &ShuffV, int LHSFirstElt, int LHSLastElt,
16829 int RHSFirstElt, int RHSLastElt, int HalfVec, unsigned LHSNumValidElts,
16830 unsigned RHSNumValidElts, const PPCSubtarget &Subtarget) {
16831 int LHSEltFixup =
16832 Subtarget.isLittleEndian() ? HalfVec : HalfVec - LHSNumValidElts;
16833 int RHSEltFixup =
16834 Subtarget.isLittleEndian() ? HalfVec : HalfVec - RHSNumValidElts;
16835 for (int I = 0, E = ShuffV.size(); I < E; ++I) {
16836 int Idx = ShuffV[I];
16837 if (Idx >= LHSFirstElt && Idx <= LHSLastElt)
16838 ShuffV[I] += LHSEltFixup;
16839 else if (Idx >= RHSFirstElt && Idx <= RHSLastElt)
16840 ShuffV[I] += RHSEltFixup;
16841 }
16842}
16843
// Replace a SCALAR_TO_VECTOR with a SCALAR_TO_VECTOR_PERMUTED except if
// the original is:
// (<n x Ty> (scalar_to_vector (Ty (extract_elt <n x Ty> %a, C))))
// In such a case, just change the shuffle mask to extract the element
// from the permuted index.
static SDValue getSToVPermuted(SDValue OrigSToV, SelectionDAG &DAG,
                               const PPCSubtarget &Subtarget) {
  SDLoc dl(OrigSToV);
  EVT VT = OrigSToV.getValueType();
  assert(OrigSToV.getOpcode() == ISD::SCALAR_TO_VECTOR &&
         "Expecting a SCALAR_TO_VECTOR here");
  SDValue Input = OrigSToV.getOperand(i: 0);

  // Special case: the scalar came straight out of a vector of the same type.
  // Instead of moving it through a permuted scalar_to_vector, shuffle it
  // directly from the original vector into the lane the permuted form would
  // have used.
  if (Input.getOpcode() == ISD::EXTRACT_VECTOR_ELT) {
    ConstantSDNode *Idx = dyn_cast<ConstantSDNode>(Val: Input.getOperand(i: 1));
    SDValue OrigVector = Input.getOperand(i: 0);

    // Can't handle non-const element indices or different vector types
    // for the input to the extract and the output of the scalar_to_vector.
    if (Idx && VT == OrigVector.getValueType()) {
      unsigned NumElts = VT.getVectorNumElements();
      assert(
          NumElts > 1 &&
          "Cannot produce a permuted scalar_to_vector for one element vector");
      SmallVector<int, 16> NewMask(NumElts, -1);
      // The valid lane: first element of the second half on little endian,
      // last element of the first half on big endian.
      unsigned ResultInElt = NumElts / 2;
      ResultInElt -= Subtarget.isLittleEndian() ? 0 : 1;
      NewMask[ResultInElt] = Idx->getZExtValue();
      return DAG.getVectorShuffle(VT, dl, N1: OrigVector, N2: OrigVector, Mask: NewMask);
    }
  }
  // General case: emit the target node that keeps the value in permuted form
  // (avoids the swap that a canonical scalar_to_vector would require).
  return DAG.getNode(Opcode: PPCISD::SCALAR_TO_VECTOR_PERMUTED, DL: dl, VT,
                     Operand: OrigSToV.getOperand(i: 0));
}
16878
16879static bool isShuffleMaskInRange(const SmallVectorImpl<int> &ShuffV,
16880 int HalfVec, int LHSLastElementDefined,
16881 int RHSLastElementDefined) {
16882 for (int Index : ShuffV) {
16883 if (Index < 0) // Skip explicitly undefined mask indices.
16884 continue;
16885 // Handle first input vector of the vector_shuffle.
16886 if ((LHSLastElementDefined >= 0) && (Index < HalfVec) &&
16887 (Index > LHSLastElementDefined))
16888 return false;
16889 // Handle second input vector of the vector_shuffle.
16890 if ((RHSLastElementDefined >= 0) &&
16891 (Index > HalfVec + RHSLastElementDefined))
16892 return false;
16893 }
16894 return true;
16895}
16896
// Produce the permuted scalar_to_vector node for one shuffle input and
// compute the bookkeeping the caller needs to fix up its mask:
// - NumValidElts (out): width of the valid lane expressed in shuffle
//   elements (ratio of the scalar size to the shuffle element width).
// - LastElt (out): index of the last shuffle element that holds defined
//   data from this input; FirstElt is the input's first mask index (0 for
//   the LHS, the element count for the RHS).
// The returned node is bitcast back to the shuffle operand's type if needed.
static SDValue generateSToVPermutedForVecShuffle(
    int ScalarSize, uint64_t ShuffleEltWidth, unsigned &NumValidElts,
    int FirstElt, int &LastElt, SDValue VecShuffOperand, SDValue SToVNode,
    SelectionDAG &DAG, const PPCSubtarget &Subtarget) {
  EVT VecShuffOperandType = VecShuffOperand.getValueType();
  // Set up the values for the shuffle vector fixup.
  NumValidElts = ScalarSize / VecShuffOperandType.getScalarSizeInBits();
  // The last element depends on if the input comes from the LHS or RHS.
  //
  // For example:
  // (shuff (s_to_v i32), (bitcast (s_to_v i64), v4i32), ...)
  //
  // For the LHS: The last element that comes from the LHS is actually 0, not 3
  // because elements 1 and higher of a scalar_to_vector are undefined.
  // For the RHS: The last element that comes from the RHS is actually 5, not 7
  // because elements 1 and higher of a scalar_to_vector are undefined.
  // It is also not 4 because the original scalar_to_vector is wider and
  // actually contains two i32 elements.
  LastElt = (uint64_t)ScalarSize > ShuffleEltWidth
                ? ScalarSize / ShuffleEltWidth - 1 + FirstElt
                : FirstElt;
  SDValue SToVPermuted = getSToVPermuted(OrigSToV: SToVNode, DAG, Subtarget);
  if (SToVPermuted.getValueType() != VecShuffOperandType)
    SToVPermuted = DAG.getBitcast(VT: VecShuffOperandType, V: SToVPermuted);
  return SToVPermuted;
}
16923
// On little endian subtargets, combine shuffles such as:
// vector_shuffle<16,1,17,3,18,5,19,7,20,9,21,11,22,13,23,15>, <zero>, %b
// into:
// vector_shuffle<16,0,17,1,18,2,19,3,20,4,21,5,22,6,23,7>, <zero>, %b
// because the latter can be matched to a single instruction merge.
// Furthermore, SCALAR_TO_VECTOR on little endian always involves a permute
// to put the value into element zero. Adjust the shuffle mask so that the
// vector can remain in permuted form (to prevent a swap prior to a shuffle).
// On big endian targets, this is still useful for SCALAR_TO_VECTOR
// nodes with elements smaller than doubleword because all the ways
// of getting scalar data into a vector register put the value in the
// rightmost element of the left half of the vector.
SDValue PPCTargetLowering::combineVectorShuffle(ShuffleVectorSDNode *SVN,
                                                SelectionDAG &DAG) const {
  SDValue LHS = SVN->getOperand(Num: 0);
  SDValue RHS = SVN->getOperand(Num: 1);
  auto Mask = SVN->getMask();
  int NumElts = LHS.getValueType().getVectorNumElements();
  // Default result: the unmodified shuffle (returned whenever a bail-out
  // condition is hit after partial rewriting).
  SDValue Res(SVN, 0);
  SDLoc dl(SVN);
  bool IsLittleEndian = Subtarget.isLittleEndian();

  // On big endian targets this is only useful for subtargets with direct moves.
  // On little endian targets it would be useful for all subtargets with VSX.
  // However adding special handling for LE subtargets without direct moves
  // would be wasted effort since the minimum arch for LE is ISA 2.07 (Power8)
  // which includes direct moves.
  if (!Subtarget.hasDirectMove())
    return Res;

  // If this is not a shuffle of a shuffle and the first element comes from
  // the second vector, canonicalize to the commuted form. This will make it
  // more likely to match one of the single instruction patterns.
  if (Mask[0] >= NumElts && LHS.getOpcode() != ISD::VECTOR_SHUFFLE &&
      RHS.getOpcode() != ISD::VECTOR_SHUFFLE) {
    std::swap(a&: LHS, b&: RHS);
    Res = DAG.getCommutedVectorShuffle(SV: *SVN);

    // Commuting may have folded the shuffle away entirely; if so, we're done.
    if (!isa<ShuffleVectorSDNode>(Val: Res))
      return Res;

    Mask = cast<ShuffleVectorSDNode>(Val&: Res)->getMask();
  }

  // Adjust the shuffle mask if either input vector comes from a
  // SCALAR_TO_VECTOR and keep the respective input vector in permuted
  // form (to prevent the need for a swap).
  SmallVector<int, 16> ShuffV(Mask);
  SDValue SToVLHS = isScalarToVec(Op: LHS);
  SDValue SToVRHS = isScalarToVec(Op: RHS);
  if (SToVLHS || SToVRHS) {
    EVT VT = SVN->getValueType(ResNo: 0);
    uint64_t ShuffleEltWidth = VT.getVectorElementType().getSizeInBits();
    int ShuffleNumElts = ShuffV.size();
    int HalfVec = ShuffleNumElts / 2;
    // The width of the "valid lane" (i.e. the lane that contains the value that
    // is vectorized) needs to be expressed in terms of the number of elements
    // of the shuffle. It is thereby the ratio of the values before and after
    // any bitcast, which will be set later on if the LHS or RHS are
    // SCALAR_TO_VECTOR nodes.
    unsigned LHSNumValidElts = HalfVec;
    unsigned RHSNumValidElts = HalfVec;

    // Initially assume that neither input is permuted. These will be adjusted
    // accordingly if either input is. Note, that -1 means that all elements
    // are undefined.
    int LHSFirstElt = 0;
    int RHSFirstElt = ShuffleNumElts;
    int LHSLastElt = -1;
    int RHSLastElt = -1;

    // Get the permuted scalar to vector nodes for the source(s) that come from
    // ISD::SCALAR_TO_VECTOR.
    // On big endian systems, this only makes sense for element sizes smaller
    // than 64 bits since for 64-bit elements, all instructions already put
    // the value into element zero. Since scalar size of LHS and RHS may differ
    // after isScalarToVec, this should be checked using their own sizes.
    int LHSScalarSize = 0;
    int RHSScalarSize = 0;
    if (SToVLHS) {
      LHSScalarSize = SToVLHS.getValueType().getScalarSizeInBits();
      if (!IsLittleEndian && LHSScalarSize >= 64)
        return Res;
    }
    if (SToVRHS) {
      RHSScalarSize = SToVRHS.getValueType().getScalarSizeInBits();
      if (!IsLittleEndian && RHSScalarSize >= 64)
        return Res;
    }
    // Replace each scalar_to_vector input with its permuted form and compute
    // the [FirstElt, LastElt] range of mask entries it defines.
    if (LHSScalarSize != 0)
      LHS = generateSToVPermutedForVecShuffle(
          ScalarSize: LHSScalarSize, ShuffleEltWidth, NumValidElts&: LHSNumValidElts, FirstElt: LHSFirstElt,
          LastElt&: LHSLastElt, VecShuffOperand: LHS, SToVNode: SToVLHS, DAG, Subtarget);
    if (RHSScalarSize != 0)
      RHS = generateSToVPermutedForVecShuffle(
          ScalarSize: RHSScalarSize, ShuffleEltWidth, NumValidElts&: RHSNumValidElts, FirstElt: RHSFirstElt,
          LastElt&: RHSLastElt, VecShuffOperand: RHS, SToVNode: SToVRHS, DAG, Subtarget);

    // Bail if any mask entry addresses an element beyond the defined range of
    // its (now permuted) input.
    if (!isShuffleMaskInRange(ShuffV, HalfVec, LHSLastElementDefined: LHSLastElt, RHSLastElementDefined: RHSLastElt))
      return Res;

    // Fix up the shuffle mask to reflect where the desired element actually is.
    // The minimum and maximum indices that correspond to element zero for both
    // the LHS and RHS are computed and will control which shuffle mask entries
    // are to be changed. For example, if the RHS is permuted, any shuffle mask
    // entries in the range [RHSFirstElt,RHSLastElt] will be adjusted.
    fixupShuffleMaskForPermutedSToV(
        ShuffV, LHSFirstElt, LHSLastElt, RHSFirstElt, RHSLastElt, HalfVec,
        LHSNumValidElts, RHSNumValidElts, Subtarget);
    Res = DAG.getVectorShuffle(VT: SVN->getValueType(ResNo: 0), dl, N1: LHS, N2: RHS, Mask: ShuffV);

    // We may have simplified away the shuffle. We won't be able to do anything
    // further with it here.
    if (!isa<ShuffleVectorSDNode>(Val: Res))
      return Res;
    Mask = cast<ShuffleVectorSDNode>(Val&: Res)->getMask();
  }

  // After canonical commuting above, the splat (if any) sits in the RHS on
  // little endian and in the LHS on big endian.
  SDValue TheSplat = IsLittleEndian ? RHS : LHS;
  // The common case after we commuted the shuffle is that the RHS is a splat
  // and we have elements coming in from the splat at indices that are not
  // conducive to using a merge.
  // Example:
  // vector_shuffle<0,17,1,19,2,21,3,23,4,25,5,27,6,29,7,31> t1, <zero>
  if (!isSplatBV(Op: TheSplat))
    return Res;

  // We are looking for a mask such that all even elements are from
  // one vector and all odd elements from the other.
  if (!isAlternatingShuffMask(Mask, NumElts))
    return Res;

  // Adjust the mask so we are pulling in the same index from the splat
  // as the index from the interesting vector in consecutive elements.
  if (IsLittleEndian) {
    // Example (even elements from first vector):
    // vector_shuffle<0,16,1,17,2,18,3,19,4,20,5,21,6,22,7,23> t1, <zero>
    if (Mask[0] < NumElts)
      for (int i = 1, e = Mask.size(); i < e; i += 2) {
        if (ShuffV[i] < 0)
          continue;
        // If element from non-splat is undef, pick first element from splat.
        ShuffV[i] = (ShuffV[i - 1] >= 0 ? ShuffV[i - 1] : 0) + NumElts;
      }
    // Example (odd elements from first vector):
    // vector_shuffle<16,0,17,1,18,2,19,3,20,4,21,5,22,6,23,7> t1, <zero>
    else
      for (int i = 0, e = Mask.size(); i < e; i += 2) {
        if (ShuffV[i] < 0)
          continue;
        // If element from non-splat is undef, pick first element from splat.
        ShuffV[i] = (ShuffV[i + 1] >= 0 ? ShuffV[i + 1] : 0) + NumElts;
      }
  } else {
    // Example (even elements from first vector):
    // vector_shuffle<0,16,1,17,2,18,3,19,4,20,5,21,6,22,7,23> <zero>, t1
    if (Mask[0] < NumElts)
      for (int i = 0, e = Mask.size(); i < e; i += 2) {
        if (ShuffV[i] < 0)
          continue;
        // If element from non-splat is undef, pick first element from splat.
        ShuffV[i] = ShuffV[i + 1] >= 0 ? ShuffV[i + 1] - NumElts : 0;
      }
    // Example (odd elements from first vector):
    // vector_shuffle<16,0,17,1,18,2,19,3,20,4,21,5,22,6,23,7> <zero>, t1
    else
      for (int i = 1, e = Mask.size(); i < e; i += 2) {
        if (ShuffV[i] < 0)
          continue;
        // If element from non-splat is undef, pick first element from splat.
        ShuffV[i] = ShuffV[i - 1] >= 0 ? ShuffV[i - 1] - NumElts : 0;
      }
  }

  // If the RHS has undefs, we need to remove them since we may have created
  // a shuffle that adds those instead of the splat value.
  SDValue SplatVal =
      cast<BuildVectorSDNode>(Val: TheSplat.getNode())->getSplatValue();
  TheSplat = DAG.getSplatBuildVector(VT: TheSplat.getValueType(), DL: dl, Op: SplatVal);

  if (IsLittleEndian)
    RHS = TheSplat;
  else
    LHS = TheSplat;
  return DAG.getVectorShuffle(VT: SVN->getValueType(ResNo: 0), dl, N1: LHS, N2: RHS, Mask: ShuffV);
}
17110
// Combine an element-reversing vector_shuffle that feeds from a normal load
// (or feeds into a normal store) into a single byte-order-aware memory
// operation (PPCISD::LOAD_VEC_BE / STORE_VEC_BE), eliminating the explicit
// reverse. Only done on little-endian P9+ subtargets with VSX, and only when
// the combine is profitable (see the use checks below).
SDValue PPCTargetLowering::combineVReverseMemOP(ShuffleVectorSDNode *SVN,
                                                LSBaseSDNode *LSBase,
                                                DAGCombinerInfo &DCI) const {
  assert((ISD::isNormalLoad(LSBase) || ISD::isNormalStore(LSBase)) &&
         "Not a reverse memop pattern!");

  // Return true if the shuffle mask is exactly <N-1, N-2, ..., 1, 0>,
  // i.e. a full element reversal of a single vector.
  auto IsElementReverse = [](const ShuffleVectorSDNode *SVN) -> bool {
    auto Mask = SVN->getMask();
    int i = 0;
    auto I = Mask.rbegin();
    auto E = Mask.rend();

    // Walk the mask backwards; it must count up 0, 1, 2, ...
    for (; I != E; ++I) {
      if (*I != i)
        return false;
      i++;
    }
    return true;
  };

  SelectionDAG &DAG = DCI.DAG;
  EVT VT = SVN->getValueType(ResNo: 0);

  if (!isTypeLegal(VT) || !Subtarget.isLittleEndian() || !Subtarget.hasVSX())
    return SDValue();

  // Before P9, we have PPCVSXSwapRemoval pass to hack the element order.
  // See comment in PPCVSXSwapRemoval.cpp. This combine would conflict with
  // that optimization, so we don't do it there.
  if (!Subtarget.hasP9Vector())
    return SDValue();

  if(!IsElementReverse(SVN))
    return SDValue();

  if (LSBase->getOpcode() == ISD::LOAD) {
    // If the load return value 0 has more than one user except the
    // shufflevector instruction, it is not profitable to replace the
    // shufflevector with a reverse load.
    for (SDUse &Use : LSBase->uses())
      if (Use.getResNo() == 0 &&
          Use.getUser()->getOpcode() != ISD::VECTOR_SHUFFLE)
        return SDValue();

    SDLoc dl(LSBase);
    SDValue LoadOps[] = {LSBase->getChain(), LSBase->getBasePtr()};
    return DAG.getMemIntrinsicNode(
        Opcode: PPCISD::LOAD_VEC_BE, dl, VTList: DAG.getVTList(VT1: VT, VT2: MVT::Other), Ops: LoadOps,
        MemVT: LSBase->getMemoryVT(), MMO: LSBase->getMemOperand());
  }

  if (LSBase->getOpcode() == ISD::STORE) {
    // If there are other uses of the shuffle, the swap cannot be avoided.
    // Forcing the use of an X-Form (since swapped stores only have
    // X-Forms) without removing the swap is unprofitable.
    if (!SVN->hasOneUse())
      return SDValue();

    SDLoc dl(LSBase);
    // Store the shuffle's *input* directly; the BE store does the reversal.
    SDValue StoreOps[] = {LSBase->getChain(), SVN->getOperand(Num: 0),
                          LSBase->getBasePtr()};
    return DAG.getMemIntrinsicNode(
        Opcode: PPCISD::STORE_VEC_BE, dl, VTList: DAG.getVTList(VT: MVT::Other), Ops: StoreOps,
        MemVT: LSBase->getMemoryVT(), MMO: LSBase->getMemOperand());
  }

  llvm_unreachable("Expected a load or store node here");
}
17179
17180static bool isStoreConditional(SDValue Intrin, unsigned &StoreWidth) {
17181 unsigned IntrinsicID = Intrin.getConstantOperandVal(i: 1);
17182 if (IntrinsicID == Intrinsic::ppc_stdcx)
17183 StoreWidth = 8;
17184 else if (IntrinsicID == Intrinsic::ppc_stwcx)
17185 StoreWidth = 4;
17186 else if (IntrinsicID == Intrinsic::ppc_sthcx)
17187 StoreWidth = 2;
17188 else if (IntrinsicID == Intrinsic::ppc_stbcx)
17189 StoreWidth = 1;
17190 else
17191 return false;
17192 return true;
17193}
17194
17195static SDValue DAGCombineAddc(SDNode *N,
17196 llvm::PPCTargetLowering::DAGCombinerInfo &DCI) {
17197 if (N->getOpcode() == PPCISD::ADDC && N->hasAnyUseOfValue(Value: 1)) {
17198 // (ADDC (ADDE 0, 0, C), -1) -> C
17199 SDValue LHS = N->getOperand(Num: 0);
17200 SDValue RHS = N->getOperand(Num: 1);
17201 if (LHS->getOpcode() == PPCISD::ADDE &&
17202 isNullConstant(V: LHS->getOperand(Num: 0)) &&
17203 isNullConstant(V: LHS->getOperand(Num: 1)) && isAllOnesConstant(V: RHS)) {
17204 return DCI.CombineTo(N, Res0: SDValue(N, 0), Res1: LHS->getOperand(Num: 2));
17205 }
17206 }
17207 return SDValue();
17208}
17209
17210// Optimize zero-extension of setcc when the compared value is known to be 0
17211// or 1.
17212//
17213// Pattern: zext(setcc(Value, 0, seteq/setne)) where Value is 0 or 1
17214// -> zext(xor(Value, 1)) for seteq
17215// -> zext(Value) for setne
17216//
17217// This optimization avoids the i32 -> i1 -> i32/i64 conversion sequence
17218// by keeping the value in its original i32 type throughout.
17219//
17220// Example:
17221// Before: zext(setcc(test_data_class(...), 0, seteq))
17222// // test_data_class returns 0 or 1 in i32
17223// // setcc converts i32 -> i1
17224// // zext converts i1 -> i64
17225// After: zext(xor(test_data_class(...), 1))
17226// // Stays in i32, then extends to i64
17227//
17228// This is beneficial because:
17229// 1. Eliminates the setcc instruction
17230// 2. Avoids i32 -> i1 truncation
17231// 3. Keeps computation in native integer width
17232
17233static SDValue combineZextSetccWithZero(SDNode *N, SelectionDAG &DAG) {
17234 // Check if this is a zero_extend
17235 if (N->getOpcode() != ISD::ZERO_EXTEND)
17236 return SDValue();
17237
17238 SDValue Src = N->getOperand(Num: 0);
17239
17240 // Check if the source is a setcc
17241 if (Src.getOpcode() != ISD::SETCC)
17242 return SDValue();
17243
17244 SDValue LHS = Src.getOperand(i: 0);
17245 SDValue RHS = Src.getOperand(i: 1);
17246 ISD::CondCode CC = cast<CondCodeSDNode>(Val: Src.getOperand(i: 2))->get();
17247
17248 if (!isNullConstant(V: RHS) && !isNullConstant(V: LHS))
17249 return SDValue();
17250
17251 SDValue NonNullConstant = isNullConstant(V: RHS) ? LHS : RHS;
17252
17253 auto isZeroOrOne = [=](SDValue &V) {
17254 if (V.getOpcode() == ISD::INTRINSIC_WO_CHAIN &&
17255 V.getConstantOperandVal(i: 0) == Intrinsic::ppc_test_data_class)
17256 return true;
17257 return false;
17258 };
17259
17260 if (!isZeroOrOne(NonNullConstant))
17261 return SDValue();
17262
17263 // Check for pattern: zext(setcc (Value), 0, seteq)) or
17264 // zext(setcc (Value), 0, setne))
17265 if (CC == ISD::SETEQ || CC == ISD::SETNE) {
17266 // Replace with: zext(xor(Value, 1)) for seteq
17267 // or: zext(Value) for setne
17268 // This keeps the value in i32 instead of converting to i1
17269 SDLoc DL(N);
17270 EVT VType = N->getValueType(ResNo: 0);
17271 SDValue NewNonNullConstant = DAG.getZExtOrTrunc(Op: NonNullConstant, DL, VT: VType);
17272
17273 if (CC == ISD::SETNE)
17274 return NewNonNullConstant;
17275
17276 SDValue One = DAG.getConstant(Val: 1, DL, VT: VType);
17277 return DAG.getNode(Opcode: ISD::XOR, DL, VT: VType, N1: NewNonNullConstant, N2: One);
17278 }
17279
17280 return SDValue();
17281}
17282
// Combine XOR patterns with SELECT_CC_I4/I8, for Example:
// 1. XOR(SELECT_CC_I4(cond, 1, 0, cc), 1) -> SELECT_CC_I4(cond, 0, 1, cc)
// 2. XOR(ZEXT(SELECT_CC_I4(cond, 1, 0, cc)), 1) -> SELECT_CC_I4/I8(cond, 0,
//    1, cc))
// 3. XOR(ANYEXT(SELECT_CC_I4(cond, 1, 0, cc)), 1) -> SELECT_CC_I4/I8(cond,
//    0, 1, cc))
// 4. etc
// Since the select produces exactly 0 or 1, XOR-ing with 1 simply selects
// the opposite constant; swapping the two constant operands of the select
// eliminates the XOR (and any intervening zero/any-extend) entirely.
static SDValue combineXorSelectCC(SDNode *N, SelectionDAG &DAG) {
  assert(N->getOpcode() == ISD::XOR && "Expected XOR node");

  EVT XorVT = N->getValueType(ResNo: 0);
  if ((XorVT != MVT::i32 && XorVT != MVT::i64))
    return SDValue();

  SDValue LHS = N->getOperand(Num: 0);
  SDValue RHS = N->getOperand(Num: 1);

  // Check for XOR with constant 1
  ConstantSDNode *XorConst = dyn_cast<ConstantSDNode>(Val&: RHS);
  if (!XorConst || !XorConst->isOne()) {
    XorConst = dyn_cast<ConstantSDNode>(Val&: LHS);
    if (!XorConst || !XorConst->isOne())
      return SDValue();
    // Swap so LHS is the SELECT_CC_I4 (or extension) and RHS is the constant
    std::swap(a&: LHS, b&: RHS);
  }

  // Check if LHS has only one use
  if (!LHS.hasOneUse())
    return SDValue();

  // Handle extensions: ZEXT, ANYEXT
  SDValue SelectNode = LHS;

  // SEXT is deliberately not handled: it would produce 0/-1, not 0/1.
  if (LHS.getOpcode() == ISD::ZERO_EXTEND ||
      LHS.getOpcode() == ISD::ANY_EXTEND) {
    SelectNode = LHS.getOperand(i: 0);

    // Check if the extension input has only one use
    if (!SelectNode.hasOneUse())
      return SDValue();
  }

  // Check if SelectNode is a MachineSDNode with SELECT_CC_I4/I8 opcode
  // (these pseudos exist once instruction selection has run on the node).
  if (!SelectNode.isMachineOpcode())
    return SDValue();

  unsigned MachineOpc = SelectNode.getMachineOpcode();

  // Handle both SELECT_CC_I4 and SELECT_CC_I8
  if (MachineOpc != PPC::SELECT_CC_I4 && MachineOpc != PPC::SELECT_CC_I8)
    return SDValue();

  // SELECT_CC_I4 operands: (cond, true_val, false_val, bropc)
  if (SelectNode.getNumOperands() != 4)
    return SDValue();

  ConstantSDNode *ConstOp1 = dyn_cast<ConstantSDNode>(Val: SelectNode.getOperand(i: 1));
  ConstantSDNode *ConstOp2 = dyn_cast<ConstantSDNode>(Val: SelectNode.getOperand(i: 2));

  if (!ConstOp1 || !ConstOp2)
    return SDValue();

  // Only optimize if operands are {0, 1} or {1, 0}
  if (!((ConstOp1->isOne() && ConstOp2->isZero()) ||
        (ConstOp1->isZero() && ConstOp2->isOne())))
    return SDValue();

  // Pattern matched! Create new SELECT_CC with swapped 0/1 operands to
  // eliminate XOR. If original was SELECT_CC(cond, 1, 0, pred), create
  // SELECT_CC(cond, 0, 1, pred). If original was SELECT_CC(cond, 0, 1, pred),
  // create SELECT_CC(cond, 1, 0, pred).
  SDLoc DL(N);
  // The new select is built at the XOR's width, absorbing any extension.
  MachineOpc = (XorVT == MVT::i32) ? PPC::SELECT_CC_I4 : PPC::SELECT_CC_I8;

  bool ConstOp1IsOne = ConstOp1->isOne();
  return SDValue(
      DAG.getMachineNode(Opcode: MachineOpc, dl: DL, VT: XorVT,
                         Ops: {SelectNode.getOperand(i: 0),
                          DAG.getConstant(Val: ConstOp1IsOne ? 0 : 1, DL, VT: XorVT),
                          DAG.getConstant(Val: ConstOp1IsOne ? 1 : 0, DL, VT: XorVT),
                          SelectNode.getOperand(i: 3)}),
      0);
}
17367
17368SDValue PPCTargetLowering::PerformDAGCombine(SDNode *N,
17369 DAGCombinerInfo &DCI) const {
17370 SelectionDAG &DAG = DCI.DAG;
17371 SDLoc dl(N);
17372 switch (N->getOpcode()) {
17373 default: break;
17374 case ISD::ADD:
17375 return combineADD(N, DCI);
17376 case ISD::AND: {
17377 // We don't want (and (zext (shift...)), C) if C fits in the width of the
17378 // original input as that will prevent us from selecting optimal rotates.
17379 // This only matters if the input to the extend is i32 widened to i64.
17380 SDValue Op1 = N->getOperand(Num: 0);
17381 SDValue Op2 = N->getOperand(Num: 1);
17382 if ((Op1.getOpcode() != ISD::ZERO_EXTEND &&
17383 Op1.getOpcode() != ISD::ANY_EXTEND) ||
17384 !isa<ConstantSDNode>(Val: Op2) || N->getValueType(ResNo: 0) != MVT::i64 ||
17385 Op1.getOperand(i: 0).getValueType() != MVT::i32)
17386 break;
17387 SDValue NarrowOp = Op1.getOperand(i: 0);
17388 if (NarrowOp.getOpcode() != ISD::SHL && NarrowOp.getOpcode() != ISD::SRL &&
17389 NarrowOp.getOpcode() != ISD::ROTL && NarrowOp.getOpcode() != ISD::ROTR)
17390 break;
17391
17392 uint64_t Imm = Op2->getAsZExtVal();
17393 // Make sure that the constant is narrow enough to fit in the narrow type.
17394 if (!isUInt<32>(x: Imm))
17395 break;
17396 SDValue ConstOp = DAG.getConstant(Val: Imm, DL: dl, VT: MVT::i32);
17397 SDValue NarrowAnd = DAG.getNode(Opcode: ISD::AND, DL: dl, VT: MVT::i32, N1: NarrowOp, N2: ConstOp);
17398 return DAG.getZExtOrTrunc(Op: NarrowAnd, DL: dl, VT: N->getValueType(ResNo: 0));
17399 }
17400 case ISD::XOR: {
17401 // Optimize XOR(ISEL(1,0,CR), 1) -> ISEL(0,1,CR)
17402 if (SDValue V = combineXorSelectCC(N, DAG))
17403 return V;
17404 break;
17405 }
17406 case ISD::SHL:
17407 return combineSHL(N, DCI);
17408 case ISD::SRA:
17409 return combineSRA(N, DCI);
17410 case ISD::SRL:
17411 return combineSRL(N, DCI);
17412 case ISD::MUL:
17413 return combineMUL(N, DCI);
17414 case ISD::FMA:
17415 case PPCISD::FNMSUB:
17416 return combineFMALike(N, DCI);
17417 case PPCISD::SHL:
17418 if (isNullConstant(V: N->getOperand(Num: 0))) // 0 << V -> 0.
17419 return N->getOperand(Num: 0);
17420 break;
17421 case PPCISD::SRL:
17422 if (isNullConstant(V: N->getOperand(Num: 0))) // 0 >>u V -> 0.
17423 return N->getOperand(Num: 0);
17424 break;
17425 case PPCISD::SRA:
17426 if (ConstantSDNode *C = dyn_cast<ConstantSDNode>(Val: N->getOperand(Num: 0))) {
17427 if (C->isZero() || // 0 >>s V -> 0.
17428 C->isAllOnes()) // -1 >>s V -> -1.
17429 return N->getOperand(Num: 0);
17430 }
17431 break;
17432 case ISD::ZERO_EXTEND:
17433 if (SDValue RetV = combineZextSetccWithZero(N, DAG&: DCI.DAG))
17434 return RetV;
17435 [[fallthrough]];
17436 case ISD::SIGN_EXTEND:
17437 case ISD::ANY_EXTEND:
17438 return DAGCombineExtBoolTrunc(N, DCI);
17439 case ISD::TRUNCATE:
17440 return combineTRUNCATE(N, DCI);
17441 case ISD::SETCC:
17442 if (SDValue CSCC = combineSetCC(N, DCI))
17443 return CSCC;
17444 [[fallthrough]];
17445 case ISD::SELECT_CC:
17446 return DAGCombineTruncBoolExt(N, DCI);
17447 case ISD::SINT_TO_FP:
17448 case ISD::UINT_TO_FP:
17449 return combineFPToIntToFP(N, DCI);
17450 case ISD::VECTOR_SHUFFLE:
17451 if (ISD::isNormalLoad(N: N->getOperand(Num: 0).getNode())) {
17452 LSBaseSDNode* LSBase = cast<LSBaseSDNode>(Val: N->getOperand(Num: 0));
17453 return combineVReverseMemOP(SVN: cast<ShuffleVectorSDNode>(Val: N), LSBase, DCI);
17454 }
17455 return combineVectorShuffle(SVN: cast<ShuffleVectorSDNode>(Val: N), DAG&: DCI.DAG);
17456 case ISD::STORE: {
17457
17458 EVT Op1VT = N->getOperand(Num: 1).getValueType();
17459 unsigned Opcode = N->getOperand(Num: 1).getOpcode();
17460
17461 if (Opcode == ISD::FP_TO_SINT || Opcode == ISD::FP_TO_UINT ||
17462 Opcode == ISD::STRICT_FP_TO_SINT || Opcode == ISD::STRICT_FP_TO_UINT) {
17463 SDValue Val = combineStoreFPToInt(N, DCI);
17464 if (Val)
17465 return Val;
17466 }
17467
17468 if (Opcode == ISD::VECTOR_SHUFFLE && ISD::isNormalStore(N)) {
17469 ShuffleVectorSDNode *SVN = cast<ShuffleVectorSDNode>(Val: N->getOperand(Num: 1));
17470 SDValue Val= combineVReverseMemOP(SVN, LSBase: cast<LSBaseSDNode>(Val: N), DCI);
17471 if (Val)
17472 return Val;
17473 }
17474
17475 // Turn STORE (BSWAP) -> sthbrx/stwbrx.
17476 if (cast<StoreSDNode>(Val: N)->isUnindexed() && Opcode == ISD::BSWAP &&
17477 N->getOperand(Num: 1).getNode()->hasOneUse() &&
17478 (Op1VT == MVT::i32 || Op1VT == MVT::i16 ||
17479 (Subtarget.hasLDBRX() && Subtarget.isPPC64() && Op1VT == MVT::i64))) {
17480
17481 // STBRX can only handle simple types and it makes no sense to store less
17482 // two bytes in byte-reversed order.
17483 EVT mVT = cast<StoreSDNode>(Val: N)->getMemoryVT();
17484 if (mVT.isExtended() || mVT.getSizeInBits() < 16)
17485 break;
17486
17487 SDValue BSwapOp = N->getOperand(Num: 1).getOperand(i: 0);
17488 // Do an any-extend to 32-bits if this is a half-word input.
17489 if (BSwapOp.getValueType() == MVT::i16)
17490 BSwapOp = DAG.getNode(Opcode: ISD::ANY_EXTEND, DL: dl, VT: MVT::i32, Operand: BSwapOp);
17491
17492 // If the type of BSWAP operand is wider than stored memory width
17493 // it need to be shifted to the right side before STBRX.
17494 if (Op1VT.bitsGT(VT: mVT)) {
17495 int Shift = Op1VT.getSizeInBits() - mVT.getSizeInBits();
17496 BSwapOp = DAG.getNode(Opcode: ISD::SRL, DL: dl, VT: Op1VT, N1: BSwapOp,
17497 N2: DAG.getConstant(Val: Shift, DL: dl, VT: MVT::i32));
17498 // Need to truncate if this is a bswap of i64 stored as i32/i16.
17499 if (Op1VT == MVT::i64)
17500 BSwapOp = DAG.getNode(Opcode: ISD::TRUNCATE, DL: dl, VT: MVT::i32, Operand: BSwapOp);
17501 }
17502
17503 SDValue Ops[] = {
17504 N->getOperand(Num: 0), BSwapOp, N->getOperand(Num: 2), DAG.getValueType(mVT)
17505 };
17506 return
17507 DAG.getMemIntrinsicNode(Opcode: PPCISD::STBRX, dl, VTList: DAG.getVTList(VT: MVT::Other),
17508 Ops, MemVT: cast<StoreSDNode>(Val: N)->getMemoryVT(),
17509 MMO: cast<StoreSDNode>(Val: N)->getMemOperand());
17510 }
17511
17512 // STORE Constant:i32<0> -> STORE<trunc to i32> Constant:i64<0>
17513 // So it can increase the chance of CSE constant construction.
17514 if (Subtarget.isPPC64() && !DCI.isBeforeLegalize() &&
17515 isa<ConstantSDNode>(Val: N->getOperand(Num: 1)) && Op1VT == MVT::i32) {
17516 // Need to sign-extended to 64-bits to handle negative values.
17517 EVT MemVT = cast<StoreSDNode>(Val: N)->getMemoryVT();
17518 uint64_t Val64 = SignExtend64(X: N->getConstantOperandVal(Num: 1),
17519 B: MemVT.getSizeInBits());
17520 SDValue Const64 = DAG.getConstant(Val: Val64, DL: dl, VT: MVT::i64);
17521
17522 auto *ST = cast<StoreSDNode>(Val: N);
17523 SDValue NewST = DAG.getStore(Chain: ST->getChain(), dl, Val: Const64,
17524 Ptr: ST->getBasePtr(), Offset: ST->getOffset(), SVT: MemVT,
17525 MMO: ST->getMemOperand(), AM: ST->getAddressingMode(),
17526 /*IsTruncating=*/true);
17527 // Note we use CombineTo here to prevent DAGCombiner from visiting the
17528 // new store which will change the constant by removing non-demanded bits.
17529 return ST->isUnindexed()
17530 ? DCI.CombineTo(N, Res: NewST, /*AddTo=*/false)
17531 : DCI.CombineTo(N, Res0: NewST, Res1: NewST.getValue(R: 1), /*AddTo=*/false);
17532 }
17533
17534 // For little endian, VSX stores require generating xxswapd/lxvd2x.
17535 // Not needed on ISA 3.0 based CPUs since we have a non-permuting store.
17536 if (Op1VT.isSimple()) {
17537 MVT StoreVT = Op1VT.getSimpleVT();
17538 if (Subtarget.needsSwapsForVSXMemOps() &&
17539 (StoreVT == MVT::v2f64 || StoreVT == MVT::v2i64 ||
17540 StoreVT == MVT::v4f32 || StoreVT == MVT::v4i32))
17541 return expandVSXStoreForLE(N, DCI);
17542 }
17543 break;
17544 }
17545 case ISD::LOAD: {
17546 LoadSDNode *LD = cast<LoadSDNode>(Val: N);
17547 EVT VT = LD->getValueType(ResNo: 0);
17548
17549 // For little endian, VSX loads require generating lxvd2x/xxswapd.
17550 // Not needed on ISA 3.0 based CPUs since we have a non-permuting load.
17551 if (VT.isSimple()) {
17552 MVT LoadVT = VT.getSimpleVT();
17553 if (Subtarget.needsSwapsForVSXMemOps() &&
17554 (LoadVT == MVT::v2f64 || LoadVT == MVT::v2i64 ||
17555 LoadVT == MVT::v4f32 || LoadVT == MVT::v4i32))
17556 return expandVSXLoadForLE(N, DCI);
17557 }
17558
17559 // We sometimes end up with a 64-bit integer load, from which we extract
17560 // two single-precision floating-point numbers. This happens with
17561 // std::complex<float>, and other similar structures, because of the way we
17562 // canonicalize structure copies. However, if we lack direct moves,
17563 // then the final bitcasts from the extracted integer values to the
17564 // floating-point numbers turn into store/load pairs. Even with direct moves,
17565 // just loading the two floating-point numbers is likely better.
17566 auto ReplaceTwoFloatLoad = [&]() {
17567 if (VT != MVT::i64)
17568 return false;
17569
17570 if (LD->getExtensionType() != ISD::NON_EXTLOAD ||
17571 LD->isVolatile())
17572 return false;
17573
17574 // We're looking for a sequence like this:
17575 // t13: i64,ch = load<LD8[%ref.tmp]> t0, t6, undef:i64
17576 // t16: i64 = srl t13, Constant:i32<32>
17577 // t17: i32 = truncate t16
17578 // t18: f32 = bitcast t17
17579 // t19: i32 = truncate t13
17580 // t20: f32 = bitcast t19
17581
17582 if (!LD->hasNUsesOfValue(NUses: 2, Value: 0))
17583 return false;
17584
17585 auto UI = LD->user_begin();
17586 while (UI.getUse().getResNo() != 0) ++UI;
17587 SDNode *Trunc = *UI++;
17588 while (UI.getUse().getResNo() != 0) ++UI;
17589 SDNode *RightShift = *UI;
17590 if (Trunc->getOpcode() != ISD::TRUNCATE)
17591 std::swap(a&: Trunc, b&: RightShift);
17592
17593 if (Trunc->getOpcode() != ISD::TRUNCATE ||
17594 Trunc->getValueType(ResNo: 0) != MVT::i32 ||
17595 !Trunc->hasOneUse())
17596 return false;
17597 if (RightShift->getOpcode() != ISD::SRL ||
17598 !isa<ConstantSDNode>(Val: RightShift->getOperand(Num: 1)) ||
17599 RightShift->getConstantOperandVal(Num: 1) != 32 ||
17600 !RightShift->hasOneUse())
17601 return false;
17602
17603 SDNode *Trunc2 = *RightShift->user_begin();
17604 if (Trunc2->getOpcode() != ISD::TRUNCATE ||
17605 Trunc2->getValueType(ResNo: 0) != MVT::i32 ||
17606 !Trunc2->hasOneUse())
17607 return false;
17608
17609 SDNode *Bitcast = *Trunc->user_begin();
17610 SDNode *Bitcast2 = *Trunc2->user_begin();
17611
17612 if (Bitcast->getOpcode() != ISD::BITCAST ||
17613 Bitcast->getValueType(ResNo: 0) != MVT::f32)
17614 return false;
17615 if (Bitcast2->getOpcode() != ISD::BITCAST ||
17616 Bitcast2->getValueType(ResNo: 0) != MVT::f32)
17617 return false;
17618
17619 if (Subtarget.isLittleEndian())
17620 std::swap(a&: Bitcast, b&: Bitcast2);
17621
17622 // Bitcast has the second float (in memory-layout order) and Bitcast2
17623 // has the first one.
17624
17625 SDValue BasePtr = LD->getBasePtr();
17626 if (LD->isIndexed()) {
17627 assert(LD->getAddressingMode() == ISD::PRE_INC &&
17628 "Non-pre-inc AM on PPC?");
17629 BasePtr =
17630 DAG.getNode(Opcode: ISD::ADD, DL: dl, VT: BasePtr.getValueType(), N1: BasePtr,
17631 N2: LD->getOffset());
17632 }
17633
17634 auto MMOFlags =
17635 LD->getMemOperand()->getFlags() & ~MachineMemOperand::MOVolatile;
17636 SDValue FloatLoad = DAG.getLoad(VT: MVT::f32, dl, Chain: LD->getChain(), Ptr: BasePtr,
17637 PtrInfo: LD->getPointerInfo(), Alignment: LD->getAlign(),
17638 MMOFlags, AAInfo: LD->getAAInfo());
17639 SDValue AddPtr =
17640 DAG.getNode(Opcode: ISD::ADD, DL: dl, VT: BasePtr.getValueType(),
17641 N1: BasePtr, N2: DAG.getIntPtrConstant(Val: 4, DL: dl));
17642 SDValue FloatLoad2 = DAG.getLoad(
17643 VT: MVT::f32, dl, Chain: SDValue(FloatLoad.getNode(), 1), Ptr: AddPtr,
17644 PtrInfo: LD->getPointerInfo().getWithOffset(O: 4),
17645 Alignment: commonAlignment(A: LD->getAlign(), Offset: 4), MMOFlags, AAInfo: LD->getAAInfo());
17646
17647 if (LD->isIndexed()) {
17648 // Note that DAGCombine should re-form any pre-increment load(s) from
17649 // what is produced here if that makes sense.
17650 DAG.ReplaceAllUsesOfValueWith(From: SDValue(LD, 1), To: BasePtr);
17651 }
17652
17653 DCI.CombineTo(N: Bitcast2, Res: FloatLoad);
17654 DCI.CombineTo(N: Bitcast, Res: FloatLoad2);
17655
17656 DAG.ReplaceAllUsesOfValueWith(From: SDValue(LD, LD->isIndexed() ? 2 : 1),
17657 To: SDValue(FloatLoad2.getNode(), 1));
17658 return true;
17659 };
17660
17661 if (ReplaceTwoFloatLoad())
17662 return SDValue(N, 0);
17663
17664 EVT MemVT = LD->getMemoryVT();
17665 Type *Ty = MemVT.getTypeForEVT(Context&: *DAG.getContext());
17666 Align ABIAlignment = DAG.getDataLayout().getABITypeAlign(Ty);
17667 if (LD->isUnindexed() && VT.isVector() &&
17668 ((Subtarget.hasAltivec() && ISD::isNON_EXTLoad(N) &&
17669 // P8 and later hardware should just use LOAD.
17670 !Subtarget.hasP8Vector() &&
17671 (VT == MVT::v16i8 || VT == MVT::v8i16 || VT == MVT::v4i32 ||
17672 VT == MVT::v4f32))) &&
17673 LD->getAlign() < ABIAlignment) {
17674 // This is a type-legal unaligned Altivec load.
17675 SDValue Chain = LD->getChain();
17676 SDValue Ptr = LD->getBasePtr();
17677 bool isLittleEndian = Subtarget.isLittleEndian();
17678
17679 // This implements the loading of unaligned vectors as described in
17680 // the venerable Apple Velocity Engine overview. Specifically:
17681 // https://developer.apple.com/hardwaredrivers/ve/alignment.html
17682 // https://developer.apple.com/hardwaredrivers/ve/code_optimization.html
17683 //
17684 // The general idea is to expand a sequence of one or more unaligned
17685 // loads into an alignment-based permutation-control instruction (lvsl
17686 // or lvsr), a series of regular vector loads (which always truncate
17687 // their input address to an aligned address), and a series of
17688 // permutations. The results of these permutations are the requested
17689 // loaded values. The trick is that the last "extra" load is not taken
17690 // from the address you might suspect (sizeof(vector) bytes after the
17691 // last requested load), but rather sizeof(vector) - 1 bytes after the
17692 // last requested vector. The point of this is to avoid a page fault if
17693 // the base address happened to be aligned. This works because if the
17694 // base address is aligned, then adding less than a full vector length
17695 // will cause the last vector in the sequence to be (re)loaded.
17696 // Otherwise, the next vector will be fetched as you might suspect was
17697 // necessary.
17698
17699 // We might be able to reuse the permutation generation from
17700 // a different base address offset from this one by an aligned amount.
17701 // The INTRINSIC_WO_CHAIN DAG combine will attempt to perform this
17702 // optimization later.
17703 Intrinsic::ID Intr, IntrLD, IntrPerm;
17704 MVT PermCntlTy, PermTy, LDTy;
17705 Intr = isLittleEndian ? Intrinsic::ppc_altivec_lvsr
17706 : Intrinsic::ppc_altivec_lvsl;
17707 IntrLD = Intrinsic::ppc_altivec_lvx;
17708 IntrPerm = Intrinsic::ppc_altivec_vperm;
17709 PermCntlTy = MVT::v16i8;
17710 PermTy = MVT::v4i32;
17711 LDTy = MVT::v4i32;
17712
17713 SDValue PermCntl = BuildIntrinsicOp(IID: Intr, Op: Ptr, DAG, dl, DestVT: PermCntlTy);
17714
17715 // Create the new MMO for the new base load. It is like the original MMO,
17716 // but represents an area in memory almost twice the vector size centered
17717 // on the original address. If the address is unaligned, we might start
17718 // reading up to (sizeof(vector)-1) bytes below the address of the
17719 // original unaligned load.
17720 MachineFunction &MF = DAG.getMachineFunction();
17721 MachineMemOperand *BaseMMO =
17722 MF.getMachineMemOperand(MMO: LD->getMemOperand(),
17723 Offset: -(int64_t)MemVT.getStoreSize()+1,
17724 Size: 2*MemVT.getStoreSize()-1);
17725
17726 // Create the new base load.
17727 SDValue LDXIntID =
17728 DAG.getTargetConstant(Val: IntrLD, DL: dl, VT: getPointerTy(DL: MF.getDataLayout()));
17729 SDValue BaseLoadOps[] = { Chain, LDXIntID, Ptr };
17730 SDValue BaseLoad =
17731 DAG.getMemIntrinsicNode(Opcode: ISD::INTRINSIC_W_CHAIN, dl,
17732 VTList: DAG.getVTList(VT1: PermTy, VT2: MVT::Other),
17733 Ops: BaseLoadOps, MemVT: LDTy, MMO: BaseMMO);
17734
17735 // Note that the value of IncOffset (which is provided to the next
17736 // load's pointer info offset value, and thus used to calculate the
17737 // alignment), and the value of IncValue (which is actually used to
17738 // increment the pointer value) are different! This is because we
17739 // require the next load to appear to be aligned, even though it
17740 // is actually offset from the base pointer by a lesser amount.
17741 int IncOffset = VT.getSizeInBits() / 8;
17742 int IncValue = IncOffset;
17743
17744 // Walk (both up and down) the chain looking for another load at the real
17745 // (aligned) offset (the alignment of the other load does not matter in
17746 // this case). If found, then do not use the offset reduction trick, as
17747 // that will prevent the loads from being later combined (as they would
17748 // otherwise be duplicates).
17749 if (!findConsecutiveLoad(LD, DAG))
17750 --IncValue;
17751
17752 SDValue Increment =
17753 DAG.getConstant(Val: IncValue, DL: dl, VT: getPointerTy(DL: MF.getDataLayout()));
17754 Ptr = DAG.getNode(Opcode: ISD::ADD, DL: dl, VT: Ptr.getValueType(), N1: Ptr, N2: Increment);
17755
17756 MachineMemOperand *ExtraMMO =
17757 MF.getMachineMemOperand(MMO: LD->getMemOperand(),
17758 Offset: 1, Size: 2*MemVT.getStoreSize()-1);
17759 SDValue ExtraLoadOps[] = { Chain, LDXIntID, Ptr };
17760 SDValue ExtraLoad =
17761 DAG.getMemIntrinsicNode(Opcode: ISD::INTRINSIC_W_CHAIN, dl,
17762 VTList: DAG.getVTList(VT1: PermTy, VT2: MVT::Other),
17763 Ops: ExtraLoadOps, MemVT: LDTy, MMO: ExtraMMO);
17764
17765 SDValue TF = DAG.getNode(Opcode: ISD::TokenFactor, DL: dl, VT: MVT::Other,
17766 N1: BaseLoad.getValue(R: 1), N2: ExtraLoad.getValue(R: 1));
17767
17768 // Because vperm has a big-endian bias, we must reverse the order
17769 // of the input vectors and complement the permute control vector
17770 // when generating little endian code. We have already handled the
17771 // latter by using lvsr instead of lvsl, so just reverse BaseLoad
17772 // and ExtraLoad here.
17773 SDValue Perm;
17774 if (isLittleEndian)
17775 Perm = BuildIntrinsicOp(IID: IntrPerm,
17776 Op0: ExtraLoad, Op1: BaseLoad, Op2: PermCntl, DAG, dl);
17777 else
17778 Perm = BuildIntrinsicOp(IID: IntrPerm,
17779 Op0: BaseLoad, Op1: ExtraLoad, Op2: PermCntl, DAG, dl);
17780
17781 if (VT != PermTy)
17782 Perm = Subtarget.hasAltivec()
17783 ? DAG.getNode(Opcode: ISD::BITCAST, DL: dl, VT, Operand: Perm)
17784 : DAG.getNode(Opcode: ISD::FP_ROUND, DL: dl, VT, N1: Perm,
17785 N2: DAG.getTargetConstant(Val: 1, DL: dl, VT: MVT::i64));
17786 // second argument is 1 because this rounding
17787 // is always exact.
17788
17789 // The output of the permutation is our loaded result, the TokenFactor is
17790 // our new chain.
17791 DCI.CombineTo(N, Res0: Perm, Res1: TF);
17792 return SDValue(N, 0);
17793 }
17794 }
17795 break;
17796 case ISD::INTRINSIC_WO_CHAIN: {
17797 bool isLittleEndian = Subtarget.isLittleEndian();
17798 unsigned IID = N->getConstantOperandVal(Num: 0);
17799 Intrinsic::ID Intr = (isLittleEndian ? Intrinsic::ppc_altivec_lvsr
17800 : Intrinsic::ppc_altivec_lvsl);
17801 if (IID == Intr && N->getOperand(Num: 1)->getOpcode() == ISD::ADD) {
17802 SDValue Add = N->getOperand(Num: 1);
17803
17804 int Bits = 4 /* 16 byte alignment */;
17805
17806 if (DAG.MaskedValueIsZero(Op: Add->getOperand(Num: 1),
17807 Mask: APInt::getAllOnes(numBits: Bits /* alignment */)
17808 .zext(width: Add.getScalarValueSizeInBits()))) {
17809 SDNode *BasePtr = Add->getOperand(Num: 0).getNode();
17810 for (SDNode *U : BasePtr->users()) {
17811 if (U->getOpcode() == ISD::INTRINSIC_WO_CHAIN &&
17812 U->getConstantOperandVal(Num: 0) == IID) {
17813 // We've found another LVSL/LVSR, and this address is an aligned
17814 // multiple of that one. The results will be the same, so use the
17815 // one we've just found instead.
17816
17817 return SDValue(U, 0);
17818 }
17819 }
17820 }
17821
17822 if (isa<ConstantSDNode>(Val: Add->getOperand(Num: 1))) {
17823 SDNode *BasePtr = Add->getOperand(Num: 0).getNode();
17824 for (SDNode *U : BasePtr->users()) {
17825 if (U->getOpcode() == ISD::ADD &&
17826 isa<ConstantSDNode>(Val: U->getOperand(Num: 1)) &&
17827 (Add->getConstantOperandVal(Num: 1) - U->getConstantOperandVal(Num: 1)) %
17828 (1ULL << Bits) ==
17829 0) {
17830 SDNode *OtherAdd = U;
17831 for (SDNode *V : OtherAdd->users()) {
17832 if (V->getOpcode() == ISD::INTRINSIC_WO_CHAIN &&
17833 V->getConstantOperandVal(Num: 0) == IID) {
17834 return SDValue(V, 0);
17835 }
17836 }
17837 }
17838 }
17839 }
17840 }
17841
17842 // Combine vmaxsw/h/b(a, a's negation) to abs(a)
17843 // Expose the vabsduw/h/b opportunity for down stream
17844 if (!DCI.isAfterLegalizeDAG() && Subtarget.hasP9Altivec() &&
17845 (IID == Intrinsic::ppc_altivec_vmaxsw ||
17846 IID == Intrinsic::ppc_altivec_vmaxsh ||
17847 IID == Intrinsic::ppc_altivec_vmaxsb)) {
17848 SDValue V1 = N->getOperand(Num: 1);
17849 SDValue V2 = N->getOperand(Num: 2);
17850 if ((V1.getSimpleValueType() == MVT::v4i32 ||
17851 V1.getSimpleValueType() == MVT::v8i16 ||
17852 V1.getSimpleValueType() == MVT::v16i8) &&
17853 V1.getSimpleValueType() == V2.getSimpleValueType()) {
17854 // (0-a, a)
17855 if (V1.getOpcode() == ISD::SUB &&
17856 ISD::isBuildVectorAllZeros(N: V1.getOperand(i: 0).getNode()) &&
17857 V1.getOperand(i: 1) == V2) {
17858 return DAG.getNode(Opcode: ISD::ABS, DL: dl, VT: V2.getValueType(), Operand: V2);
17859 }
17860 // (a, 0-a)
17861 if (V2.getOpcode() == ISD::SUB &&
17862 ISD::isBuildVectorAllZeros(N: V2.getOperand(i: 0).getNode()) &&
17863 V2.getOperand(i: 1) == V1) {
17864 return DAG.getNode(Opcode: ISD::ABS, DL: dl, VT: V1.getValueType(), Operand: V1);
17865 }
17866 // (x-y, y-x)
17867 if (V1.getOpcode() == ISD::SUB && V2.getOpcode() == ISD::SUB &&
17868 V1.getOperand(i: 0) == V2.getOperand(i: 1) &&
17869 V1.getOperand(i: 1) == V2.getOperand(i: 0)) {
17870 return DAG.getNode(Opcode: ISD::ABS, DL: dl, VT: V1.getValueType(), Operand: V1);
17871 }
17872 }
17873 }
17874 }
17875
17876 break;
17877 case ISD::INTRINSIC_W_CHAIN:
17878 switch (N->getConstantOperandVal(Num: 1)) {
17879 default:
17880 break;
17881 case Intrinsic::ppc_altivec_vsum4sbs:
17882 case Intrinsic::ppc_altivec_vsum4shs:
17883 case Intrinsic::ppc_altivec_vsum4ubs: {
17884 // These sum-across intrinsics only have a chain due to the side effect
17885 // that they may set the SAT bit. If we know the SAT bit will not be set
17886 // for some inputs, we can replace any uses of their chain with the
17887 // input chain.
17888 if (BuildVectorSDNode *BVN =
17889 dyn_cast<BuildVectorSDNode>(Val: N->getOperand(Num: 3))) {
17890 APInt APSplatBits, APSplatUndef;
17891 unsigned SplatBitSize;
17892 bool HasAnyUndefs;
17893 bool BVNIsConstantSplat = BVN->isConstantSplat(
17894 SplatValue&: APSplatBits, SplatUndef&: APSplatUndef, SplatBitSize, HasAnyUndefs, MinSplatBits: 0,
17895 isBigEndian: !Subtarget.isLittleEndian());
17896 // If the constant splat vector is 0, the SAT bit will not be set.
17897 if (BVNIsConstantSplat && APSplatBits == 0)
17898 DAG.ReplaceAllUsesOfValueWith(From: SDValue(N, 1), To: N->getOperand(Num: 0));
17899 }
17900 return SDValue();
17901 }
17902 case Intrinsic::ppc_vsx_lxvw4x:
17903 case Intrinsic::ppc_vsx_lxvd2x:
17904 // For little endian, VSX loads require generating lxvd2x/xxswapd.
17905 // Not needed on ISA 3.0 based CPUs since we have a non-permuting load.
17906 if (Subtarget.needsSwapsForVSXMemOps())
17907 return expandVSXLoadForLE(N, DCI);
17908 break;
17909 }
17910 break;
17911 case ISD::INTRINSIC_VOID:
17912 // For little endian, VSX stores require generating xxswapd/stxvd2x.
17913 // Not needed on ISA 3.0 based CPUs since we have a non-permuting store.
17914 if (Subtarget.needsSwapsForVSXMemOps()) {
17915 switch (N->getConstantOperandVal(Num: 1)) {
17916 default:
17917 break;
17918 case Intrinsic::ppc_vsx_stxvw4x:
17919 case Intrinsic::ppc_vsx_stxvd2x:
17920 return expandVSXStoreForLE(N, DCI);
17921 }
17922 }
17923 break;
17924 case ISD::BSWAP: {
17925 // Turn BSWAP (LOAD) -> lhbrx/lwbrx.
17926 // For subtargets without LDBRX, we can still do better than the default
17927 // expansion even for 64-bit BSWAP (LOAD).
17928 bool Is64BitBswapOn64BitTgt =
17929 Subtarget.isPPC64() && N->getValueType(ResNo: 0) == MVT::i64;
17930 bool IsSingleUseNormalLd = ISD::isNormalLoad(N: N->getOperand(Num: 0).getNode()) &&
17931 N->getOperand(Num: 0).hasOneUse();
17932 if (IsSingleUseNormalLd &&
17933 (N->getValueType(ResNo: 0) == MVT::i32 || N->getValueType(ResNo: 0) == MVT::i16 ||
17934 (Subtarget.hasLDBRX() && Is64BitBswapOn64BitTgt))) {
17935 SDValue Load = N->getOperand(Num: 0);
17936 LoadSDNode *LD = cast<LoadSDNode>(Val&: Load);
17937 // Create the byte-swapping load.
17938 SDValue Ops[] = {
17939 LD->getChain(), // Chain
17940 LD->getBasePtr(), // Ptr
17941 DAG.getValueType(N->getValueType(ResNo: 0)) // VT
17942 };
17943 SDValue BSLoad =
17944 DAG.getMemIntrinsicNode(Opcode: PPCISD::LBRX, dl,
17945 VTList: DAG.getVTList(VT1: N->getValueType(ResNo: 0) == MVT::i64 ?
17946 MVT::i64 : MVT::i32, VT2: MVT::Other),
17947 Ops, MemVT: LD->getMemoryVT(), MMO: LD->getMemOperand());
17948
17949 // If this is an i16 load, insert the truncate.
17950 SDValue ResVal = BSLoad;
17951 if (N->getValueType(ResNo: 0) == MVT::i16)
17952 ResVal = DAG.getNode(Opcode: ISD::TRUNCATE, DL: dl, VT: MVT::i16, Operand: BSLoad);
17953
17954 // First, combine the bswap away. This makes the value produced by the
17955 // load dead.
17956 DCI.CombineTo(N, Res: ResVal);
17957
17958 // Next, combine the load away, we give it a bogus result value but a real
17959 // chain result. The result value is dead because the bswap is dead.
17960 DCI.CombineTo(N: Load.getNode(), Res0: ResVal, Res1: BSLoad.getValue(R: 1));
17961
17962 // Return N so it doesn't get rechecked!
17963 return SDValue(N, 0);
17964 }
17965 // Convert this to two 32-bit bswap loads and a BUILD_PAIR. Do this only
17966 // before legalization so that the BUILD_PAIR is handled correctly.
17967 if (!DCI.isBeforeLegalize() || !Is64BitBswapOn64BitTgt ||
17968 !IsSingleUseNormalLd)
17969 return SDValue();
17970 LoadSDNode *LD = cast<LoadSDNode>(Val: N->getOperand(Num: 0));
17971
17972 // Can't split volatile or atomic loads.
17973 if (!LD->isSimple())
17974 return SDValue();
17975 SDValue BasePtr = LD->getBasePtr();
17976 SDValue Lo = DAG.getLoad(VT: MVT::i32, dl, Chain: LD->getChain(), Ptr: BasePtr,
17977 PtrInfo: LD->getPointerInfo(), Alignment: LD->getAlign());
17978 Lo = DAG.getNode(Opcode: ISD::BSWAP, DL: dl, VT: MVT::i32, Operand: Lo);
17979 BasePtr = DAG.getNode(Opcode: ISD::ADD, DL: dl, VT: BasePtr.getValueType(), N1: BasePtr,
17980 N2: DAG.getIntPtrConstant(Val: 4, DL: dl));
17981 MachineMemOperand *NewMMO = DAG.getMachineFunction().getMachineMemOperand(
17982 MMO: LD->getMemOperand(), Offset: 4, Size: 4);
17983 SDValue Hi = DAG.getLoad(VT: MVT::i32, dl, Chain: LD->getChain(), Ptr: BasePtr, MMO: NewMMO);
17984 Hi = DAG.getNode(Opcode: ISD::BSWAP, DL: dl, VT: MVT::i32, Operand: Hi);
17985 SDValue Res;
17986 if (Subtarget.isLittleEndian())
17987 Res = DAG.getNode(Opcode: ISD::BUILD_PAIR, DL: dl, VT: MVT::i64, N1: Hi, N2: Lo);
17988 else
17989 Res = DAG.getNode(Opcode: ISD::BUILD_PAIR, DL: dl, VT: MVT::i64, N1: Lo, N2: Hi);
17990 SDValue TF =
17991 DAG.getNode(Opcode: ISD::TokenFactor, DL: dl, VT: MVT::Other,
17992 N1: Hi.getOperand(i: 0).getValue(R: 1), N2: Lo.getOperand(i: 0).getValue(R: 1));
17993 DAG.ReplaceAllUsesOfValueWith(From: SDValue(LD, 1), To: TF);
17994 return Res;
17995 }
17996 case PPCISD::VCMP:
17997 // If a VCMP_rec node already exists with exactly the same operands as this
17998 // node, use its result instead of this node (VCMP_rec computes both a CR6
17999 // and a normal output).
18000 //
18001 if (!N->getOperand(Num: 0).hasOneUse() &&
18002 !N->getOperand(Num: 1).hasOneUse() &&
18003 !N->getOperand(Num: 2).hasOneUse()) {
18004
18005 // Scan all of the users of the LHS, looking for VCMP_rec's that match.
18006 SDNode *VCMPrecNode = nullptr;
18007
18008 SDNode *LHSN = N->getOperand(Num: 0).getNode();
18009 for (SDNode *User : LHSN->users())
18010 if (User->getOpcode() == PPCISD::VCMP_rec &&
18011 User->getOperand(Num: 1) == N->getOperand(Num: 1) &&
18012 User->getOperand(Num: 2) == N->getOperand(Num: 2) &&
18013 User->getOperand(Num: 0) == N->getOperand(Num: 0)) {
18014 VCMPrecNode = User;
18015 break;
18016 }
18017
18018 // If there is no VCMP_rec node, or if the flag value has a single use,
18019 // don't transform this.
18020 if (!VCMPrecNode || VCMPrecNode->hasNUsesOfValue(NUses: 0, Value: 1))
18021 break;
18022
18023 // Look at the (necessarily single) use of the flag value. If it has a
18024 // chain, this transformation is more complex. Note that multiple things
18025 // could use the value result, which we should ignore.
18026 SDNode *FlagUser = nullptr;
18027 for (SDNode::use_iterator UI = VCMPrecNode->use_begin();
18028 FlagUser == nullptr; ++UI) {
18029 assert(UI != VCMPrecNode->use_end() && "Didn't find user!");
18030 SDNode *User = UI->getUser();
18031 for (unsigned i = 0, e = User->getNumOperands(); i != e; ++i) {
18032 if (User->getOperand(Num: i) == SDValue(VCMPrecNode, 1)) {
18033 FlagUser = User;
18034 break;
18035 }
18036 }
18037 }
18038
18039 // If the user is a MFOCRF instruction, we know this is safe.
18040 // Otherwise we give up for right now.
18041 if (FlagUser->getOpcode() == PPCISD::MFOCRF)
18042 return SDValue(VCMPrecNode, 0);
18043 }
18044 break;
18045 case ISD::BR_CC: {
18046 // If this is a branch on an altivec predicate comparison, lower this so
18047 // that we don't have to do a MFOCRF: instead, branch directly on CR6. This
18048 // lowering is done pre-legalize, because the legalizer lowers the predicate
18049 // compare down to code that is difficult to reassemble.
18050 // This code also handles branches that depend on the result of a store
18051 // conditional.
18052 ISD::CondCode CC = cast<CondCodeSDNode>(Val: N->getOperand(Num: 1))->get();
18053 SDValue LHS = N->getOperand(Num: 2), RHS = N->getOperand(Num: 3);
18054
18055 int CompareOpc;
18056 bool isDot;
18057
18058 if (!isa<ConstantSDNode>(Val: RHS) || (CC != ISD::SETEQ && CC != ISD::SETNE))
18059 break;
18060
18061 // Since we are doing this pre-legalize, the RHS can be a constant of
18062 // arbitrary bitwidth which may cause issues when trying to get the value
18063 // from the underlying APInt.
18064 auto RHSAPInt = RHS->getAsAPIntVal();
18065 if (!RHSAPInt.isIntN(N: 64))
18066 break;
18067
18068 unsigned Val = RHSAPInt.getZExtValue();
18069 auto isImpossibleCompare = [&]() {
18070 // If this is a comparison against something other than 0/1, then we know
18071 // that the condition is never/always true.
18072 if (Val != 0 && Val != 1) {
18073 if (CC == ISD::SETEQ) // Cond never true, remove branch.
18074 return N->getOperand(Num: 0);
18075 // Always !=, turn it into an unconditional branch.
18076 return DAG.getNode(Opcode: ISD::BR, DL: dl, VT: MVT::Other,
18077 N1: N->getOperand(Num: 0), N2: N->getOperand(Num: 4));
18078 }
18079 return SDValue();
18080 };
18081 // Combine branches fed by store conditional instructions (st[bhwd]cx).
18082 unsigned StoreWidth = 0;
18083 if (LHS.getOpcode() == ISD::INTRINSIC_W_CHAIN &&
18084 isStoreConditional(Intrin: LHS, StoreWidth)) {
18085 if (SDValue Impossible = isImpossibleCompare())
18086 return Impossible;
18087 PPC::Predicate CompOpc;
18088 // eq 0 => ne
18089 // ne 0 => eq
18090 // eq 1 => eq
18091 // ne 1 => ne
18092 if (Val == 0)
18093 CompOpc = CC == ISD::SETEQ ? PPC::PRED_NE : PPC::PRED_EQ;
18094 else
18095 CompOpc = CC == ISD::SETEQ ? PPC::PRED_EQ : PPC::PRED_NE;
18096
18097 SDValue Ops[] = {LHS.getOperand(i: 0), LHS.getOperand(i: 2), LHS.getOperand(i: 3),
18098 DAG.getConstant(Val: StoreWidth, DL: dl, VT: MVT::i32)};
18099 auto *MemNode = cast<MemSDNode>(Val&: LHS);
18100 SDValue ConstSt = DAG.getMemIntrinsicNode(
18101 Opcode: PPCISD::STORE_COND, dl,
18102 VTList: DAG.getVTList(VT1: MVT::i32, VT2: MVT::Other, VT3: MVT::Glue), Ops,
18103 MemVT: MemNode->getMemoryVT(), MMO: MemNode->getMemOperand());
18104
18105 SDValue InChain;
18106 // Unchain the branch from the original store conditional.
18107 if (N->getOperand(Num: 0) == LHS.getValue(R: 1))
18108 InChain = LHS.getOperand(i: 0);
18109 else if (N->getOperand(Num: 0).getOpcode() == ISD::TokenFactor) {
18110 SmallVector<SDValue, 4> InChains;
18111 SDValue InTF = N->getOperand(Num: 0);
18112 for (int i = 0, e = InTF.getNumOperands(); i < e; i++)
18113 if (InTF.getOperand(i) != LHS.getValue(R: 1))
18114 InChains.push_back(Elt: InTF.getOperand(i));
18115 InChain = DAG.getNode(Opcode: ISD::TokenFactor, DL: dl, VT: MVT::Other, Ops: InChains);
18116 }
18117
18118 return DAG.getNode(Opcode: PPCISD::COND_BRANCH, DL: dl, VT: MVT::Other, N1: InChain,
18119 N2: DAG.getConstant(Val: CompOpc, DL: dl, VT: MVT::i32),
18120 N3: DAG.getRegister(Reg: PPC::CR0, VT: MVT::i32), N4: N->getOperand(Num: 4),
18121 N5: ConstSt.getValue(R: 2));
18122 }
18123
18124 if (LHS.getOpcode() == ISD::INTRINSIC_WO_CHAIN &&
18125 getVectorCompareInfo(Intrin: LHS, CompareOpc, isDot, Subtarget)) {
18126 assert(isDot && "Can't compare against a vector result!");
18127
18128 if (SDValue Impossible = isImpossibleCompare())
18129 return Impossible;
18130
18131 bool BranchOnWhenPredTrue = (CC == ISD::SETEQ) ^ (Val == 0);
18132 // Create the PPCISD altivec 'dot' comparison node.
18133 SDValue Ops[] = {
18134 LHS.getOperand(i: 2), // LHS of compare
18135 LHS.getOperand(i: 3), // RHS of compare
18136 DAG.getConstant(Val: CompareOpc, DL: dl, VT: MVT::i32)
18137 };
18138 EVT VTs[] = { LHS.getOperand(i: 2).getValueType(), MVT::Glue };
18139 SDValue CompNode = DAG.getNode(Opcode: PPCISD::VCMP_rec, DL: dl, ResultTys: VTs, Ops);
18140
18141 // Unpack the result based on how the target uses it.
18142 PPC::Predicate CompOpc;
18143 switch (LHS.getConstantOperandVal(i: 1)) {
18144 default: // Can't happen, don't crash on invalid number though.
18145 case 0: // Branch on the value of the EQ bit of CR6.
18146 CompOpc = BranchOnWhenPredTrue ? PPC::PRED_EQ : PPC::PRED_NE;
18147 break;
18148 case 1: // Branch on the inverted value of the EQ bit of CR6.
18149 CompOpc = BranchOnWhenPredTrue ? PPC::PRED_NE : PPC::PRED_EQ;
18150 break;
18151 case 2: // Branch on the value of the LT bit of CR6.
18152 CompOpc = BranchOnWhenPredTrue ? PPC::PRED_LT : PPC::PRED_GE;
18153 break;
18154 case 3: // Branch on the inverted value of the LT bit of CR6.
18155 CompOpc = BranchOnWhenPredTrue ? PPC::PRED_GE : PPC::PRED_LT;
18156 break;
18157 }
18158
18159 return DAG.getNode(Opcode: PPCISD::COND_BRANCH, DL: dl, VT: MVT::Other, N1: N->getOperand(Num: 0),
18160 N2: DAG.getConstant(Val: CompOpc, DL: dl, VT: MVT::i32),
18161 N3: DAG.getRegister(Reg: PPC::CR6, VT: MVT::i32),
18162 N4: N->getOperand(Num: 4), N5: CompNode.getValue(R: 1));
18163 }
18164 break;
18165 }
18166 case ISD::BUILD_VECTOR:
18167 return DAGCombineBuildVector(N, DCI);
18168 case PPCISD::ADDC:
18169 return DAGCombineAddc(N, DCI);
18170 }
18171
18172 return SDValue();
18173}
18174
18175SDValue
18176PPCTargetLowering::BuildSDIVPow2(SDNode *N, const APInt &Divisor,
18177 SelectionDAG &DAG,
18178 SmallVectorImpl<SDNode *> &Created) const {
18179 // fold (sdiv X, pow2)
18180 EVT VT = N->getValueType(ResNo: 0);
18181 if (VT == MVT::i64 && !Subtarget.isPPC64())
18182 return SDValue();
18183 if ((VT != MVT::i32 && VT != MVT::i64) ||
18184 !(Divisor.isPowerOf2() || Divisor.isNegatedPowerOf2()))
18185 return SDValue();
18186
18187 SDLoc DL(N);
18188 SDValue N0 = N->getOperand(Num: 0);
18189
18190 bool IsNegPow2 = Divisor.isNegatedPowerOf2();
18191 unsigned Lg2 = (IsNegPow2 ? -Divisor : Divisor).countr_zero();
18192 SDValue ShiftAmt = DAG.getConstant(Val: Lg2, DL, VT);
18193
18194 SDValue Op = DAG.getNode(Opcode: PPCISD::SRA_ADDZE, DL, VT, N1: N0, N2: ShiftAmt);
18195 Created.push_back(Elt: Op.getNode());
18196
18197 if (IsNegPow2) {
18198 Op = DAG.getNode(Opcode: ISD::SUB, DL, VT, N1: DAG.getConstant(Val: 0, DL, VT), N2: Op);
18199 Created.push_back(Elt: Op.getNode());
18200 }
18201
18202 return Op;
18203}
18204
18205//===----------------------------------------------------------------------===//
18206// Inline Assembly Support
18207//===----------------------------------------------------------------------===//
18208
18209void PPCTargetLowering::computeKnownBitsForTargetNode(const SDValue Op,
18210 KnownBits &Known,
18211 const APInt &DemandedElts,
18212 const SelectionDAG &DAG,
18213 unsigned Depth) const {
18214 Known.resetAll();
18215 switch (Op.getOpcode()) {
18216 default: break;
18217 case PPCISD::LBRX: {
18218 // lhbrx is known to have the top bits cleared out.
18219 if (cast<VTSDNode>(Val: Op.getOperand(i: 2))->getVT() == MVT::i16)
18220 Known.Zero = 0xFFFF0000;
18221 break;
18222 }
18223 case PPCISD::ADDE: {
18224 if (Op.getResNo() == 0) {
18225 // (0|1), _ = ADDE 0, 0, CARRY
18226 SDValue LHS = Op.getOperand(i: 0);
18227 SDValue RHS = Op.getOperand(i: 1);
18228 if (isNullConstant(V: LHS) && isNullConstant(V: RHS))
18229 Known.Zero = ~1ULL;
18230 }
18231 break;
18232 }
18233 case ISD::INTRINSIC_WO_CHAIN: {
18234 switch (Op.getConstantOperandVal(i: 0)) {
18235 default: break;
18236 case Intrinsic::ppc_altivec_vcmpbfp_p:
18237 case Intrinsic::ppc_altivec_vcmpeqfp_p:
18238 case Intrinsic::ppc_altivec_vcmpequb_p:
18239 case Intrinsic::ppc_altivec_vcmpequh_p:
18240 case Intrinsic::ppc_altivec_vcmpequw_p:
18241 case Intrinsic::ppc_altivec_vcmpequd_p:
18242 case Intrinsic::ppc_altivec_vcmpequq_p:
18243 case Intrinsic::ppc_altivec_vcmpgefp_p:
18244 case Intrinsic::ppc_altivec_vcmpgtfp_p:
18245 case Intrinsic::ppc_altivec_vcmpgtsb_p:
18246 case Intrinsic::ppc_altivec_vcmpgtsh_p:
18247 case Intrinsic::ppc_altivec_vcmpgtsw_p:
18248 case Intrinsic::ppc_altivec_vcmpgtsd_p:
18249 case Intrinsic::ppc_altivec_vcmpgtsq_p:
18250 case Intrinsic::ppc_altivec_vcmpgtub_p:
18251 case Intrinsic::ppc_altivec_vcmpgtuh_p:
18252 case Intrinsic::ppc_altivec_vcmpgtuw_p:
18253 case Intrinsic::ppc_altivec_vcmpgtud_p:
18254 case Intrinsic::ppc_altivec_vcmpgtuq_p:
18255 Known.Zero = ~1U; // All bits but the low one are known to be zero.
18256 break;
18257 }
18258 break;
18259 }
18260 case ISD::INTRINSIC_W_CHAIN: {
18261 switch (Op.getConstantOperandVal(i: 1)) {
18262 default:
18263 break;
18264 case Intrinsic::ppc_load2r:
18265 // Top bits are cleared for load2r (which is the same as lhbrx).
18266 Known.Zero = 0xFFFF0000;
18267 break;
18268 }
18269 break;
18270 }
18271 }
18272}
18273
18274Align PPCTargetLowering::getPrefLoopAlignment(MachineLoop *ML) const {
18275 switch (Subtarget.getCPUDirective()) {
18276 default: break;
18277 case PPC::DIR_970:
18278 case PPC::DIR_PWR4:
18279 case PPC::DIR_PWR5:
18280 case PPC::DIR_PWR5X:
18281 case PPC::DIR_PWR6:
18282 case PPC::DIR_PWR6X:
18283 case PPC::DIR_PWR7:
18284 case PPC::DIR_PWR8:
18285 case PPC::DIR_PWR9:
18286 case PPC::DIR_PWR10:
18287 case PPC::DIR_PWR11:
18288 case PPC::DIR_PWR_FUTURE: {
18289 if (!ML)
18290 break;
18291
18292 if (!DisableInnermostLoopAlign32) {
18293 // If the nested loop is an innermost loop, prefer to a 32-byte alignment,
18294 // so that we can decrease cache misses and branch-prediction misses.
18295 // Actual alignment of the loop will depend on the hotness check and other
18296 // logic in alignBlocks.
18297 if (ML->getLoopDepth() > 1 && ML->getSubLoops().empty())
18298 return Align(32);
18299 }
18300
18301 const PPCInstrInfo *TII = Subtarget.getInstrInfo();
18302
18303 // For small loops (between 5 and 8 instructions), align to a 32-byte
18304 // boundary so that the entire loop fits in one instruction-cache line.
18305 uint64_t LoopSize = 0;
18306 for (auto I = ML->block_begin(), IE = ML->block_end(); I != IE; ++I)
18307 for (const MachineInstr &J : **I) {
18308 LoopSize += TII->getInstSizeInBytes(MI: J);
18309 if (LoopSize > 32)
18310 break;
18311 }
18312
18313 if (LoopSize > 16 && LoopSize <= 32)
18314 return Align(32);
18315
18316 break;
18317 }
18318 }
18319
18320 return TargetLowering::getPrefLoopAlignment(ML);
18321}
18322
18323/// getConstraintType - Given a constraint, return the type of
18324/// constraint it is for this target.
18325PPCTargetLowering::ConstraintType
18326PPCTargetLowering::getConstraintType(StringRef Constraint) const {
18327 if (Constraint.size() == 1) {
18328 switch (Constraint[0]) {
18329 default: break;
18330 case 'b':
18331 case 'r':
18332 case 'f':
18333 case 'd':
18334 case 'v':
18335 case 'y':
18336 return C_RegisterClass;
18337 case 'Z':
18338 // FIXME: While Z does indicate a memory constraint, it specifically
18339 // indicates an r+r address (used in conjunction with the 'y' modifier
18340 // in the replacement string). Currently, we're forcing the base
18341 // register to be r0 in the asm printer (which is interpreted as zero)
18342 // and forming the complete address in the second register. This is
18343 // suboptimal.
18344 return C_Memory;
18345 }
18346 } else if (Constraint == "wc") { // individual CR bits.
18347 return C_RegisterClass;
18348 } else if (Constraint == "wa" || Constraint == "wd" ||
18349 Constraint == "wf" || Constraint == "ws" ||
18350 Constraint == "wi" || Constraint == "ww") {
18351 return C_RegisterClass; // VSX registers.
18352 }
18353 return TargetLowering::getConstraintType(Constraint);
18354}
18355
18356/// Examine constraint type and operand type and determine a weight value.
18357/// This object must already have been set up with the operand type
18358/// and the current alternative constraint selected.
18359TargetLowering::ConstraintWeight
18360PPCTargetLowering::getSingleConstraintMatchWeight(
18361 AsmOperandInfo &info, const char *constraint) const {
18362 ConstraintWeight weight = CW_Invalid;
18363 Value *CallOperandVal = info.CallOperandVal;
18364 // If we don't have a value, we can't do a match,
18365 // but allow it at the lowest weight.
18366 if (!CallOperandVal)
18367 return CW_Default;
18368 Type *type = CallOperandVal->getType();
18369
18370 // Look at the constraint type.
18371 if (StringRef(constraint) == "wc" && type->isIntegerTy(Bitwidth: 1))
18372 return CW_Register; // an individual CR bit.
18373 else if ((StringRef(constraint) == "wa" ||
18374 StringRef(constraint) == "wd" ||
18375 StringRef(constraint) == "wf") &&
18376 type->isVectorTy())
18377 return CW_Register;
18378 else if (StringRef(constraint) == "wi" && type->isIntegerTy(Bitwidth: 64))
18379 return CW_Register; // just hold 64-bit integers data.
18380 else if (StringRef(constraint) == "ws" && type->isDoubleTy())
18381 return CW_Register;
18382 else if (StringRef(constraint) == "ww" && type->isFloatTy())
18383 return CW_Register;
18384
18385 switch (*constraint) {
18386 default:
18387 weight = TargetLowering::getSingleConstraintMatchWeight(info, constraint);
18388 break;
18389 case 'b':
18390 if (type->isIntegerTy())
18391 weight = CW_Register;
18392 break;
18393 case 'f':
18394 if (type->isFloatTy())
18395 weight = CW_Register;
18396 break;
18397 case 'd':
18398 if (type->isDoubleTy())
18399 weight = CW_Register;
18400 break;
18401 case 'v':
18402 if (type->isVectorTy())
18403 weight = CW_Register;
18404 break;
18405 case 'y':
18406 weight = CW_Register;
18407 break;
18408 case 'Z':
18409 weight = CW_Memory;
18410 break;
18411 }
18412 return weight;
18413}
18414
/// Map an inline-asm register constraint ("b", "r", "f", "v", "wa", "{vs5}",
/// "{f3}", ...) plus a value type to a (register, register class) pair.
/// Falls back to the TargetLowering implementation for anything not handled
/// here, then post-processes its result (GPR->G8 upgrade, "cc" alias, AIX
/// reserved-vector-register warning).
std::pair<unsigned, const TargetRegisterClass *>
PPCTargetLowering::getRegForInlineAsmConstraint(const TargetRegisterInfo *TRI,
                                                StringRef Constraint,
                                                MVT VT) const {
  if (Constraint.size() == 1) {
    // GCC RS6000 Constraint Letters
    switch (Constraint[0]) {
    case 'b': // R1-R31 (excludes r0, which reads as zero in address bases)
      if (VT == MVT::i64 && Subtarget.isPPC64())
        return std::make_pair(x: 0U, y: &PPC::G8RC_NOX0RegClass);
      return std::make_pair(x: 0U, y: &PPC::GPRC_NOR0RegClass);
    case 'r': // R0-R31
      if (VT == MVT::i64 && Subtarget.isPPC64())
        return std::make_pair(x: 0U, y: &PPC::G8RCRegClass);
      return std::make_pair(x: 0U, y: &PPC::GPRCRegClass);
    // 'd' and 'f' constraints are both defined to be "the floating point
    // registers", where one is for 32-bit and the other for 64-bit. We don't
    // really care overly much here so just give them all the same reg classes.
    case 'd':
    case 'f':
      if (Subtarget.hasSPE()) {
        // SPE keeps FP values in GPRs (f32) or SPE register pairs (f64).
        if (VT == MVT::f32 || VT == MVT::i32)
          return std::make_pair(x: 0U, y: &PPC::GPRCRegClass);
        if (VT == MVT::f64 || VT == MVT::i64)
          return std::make_pair(x: 0U, y: &PPC::SPERCRegClass);
      } else {
        if (VT == MVT::f32 || VT == MVT::i32)
          return std::make_pair(x: 0U, y: &PPC::F4RCRegClass);
        if (VT == MVT::f64 || VT == MVT::i64)
          return std::make_pair(x: 0U, y: &PPC::F8RCRegClass);
      }
      break;
    case 'v':
      if (Subtarget.hasAltivec() && VT.isVector())
        return std::make_pair(x: 0U, y: &PPC::VRRCRegClass);
      else if (Subtarget.hasVSX())
        // Scalars in Altivec registers only make sense with VSX.
        return std::make_pair(x: 0U, y: &PPC::VFRCRegClass);
      break;
    case 'y': // crrc (a condition register field)
      return std::make_pair(x: 0U, y: &PPC::CRRCRegClass);
    }
  } else if (Constraint == "wc" && Subtarget.useCRBits()) {
    // An individual CR bit.
    return std::make_pair(x: 0U, y: &PPC::CRBITRCRegClass);
  } else if ((Constraint == "wa" || Constraint == "wd" ||
             Constraint == "wf" || Constraint == "wi") &&
             Subtarget.hasVSX()) {
    // A VSX register for either a scalar (FP) or vector. There is no
    // support for single precision scalars on subtargets prior to Power8.
    if (VT.isVector())
      return std::make_pair(x: 0U, y: &PPC::VSRCRegClass);
    if (VT == MVT::f32 && Subtarget.hasP8Vector())
      return std::make_pair(x: 0U, y: &PPC::VSSRCRegClass);
    return std::make_pair(x: 0U, y: &PPC::VSFRCRegClass);
  } else if ((Constraint == "ws" || Constraint == "ww") && Subtarget.hasVSX()) {
    if (VT == MVT::f32 && Subtarget.hasP8Vector())
      return std::make_pair(x: 0U, y: &PPC::VSSRCRegClass);
    else
      return std::make_pair(x: 0U, y: &PPC::VSFRCRegClass);
  } else if (Constraint == "lr") {
    if (VT == MVT::i64)
      return std::make_pair(x: 0U, y: &PPC::LR8RCRegClass);
    else
      return std::make_pair(x: 0U, y: &PPC::LRRCRegClass);
  }

  // Handle special cases of physical registers that are not properly handled
  // by the base class.
  // NOTE(review): this indexes Constraint[0] unconditionally — assumes the
  // constraint string is never empty here; confirm against callers.
  if (Constraint[0] == '{' && Constraint[Constraint.size() - 1] == '}') {
    // If we name a VSX register, we can't defer to the base class because it
    // will not recognize the correct register (their names will be VSL{0-31}
    // and V{0-31} so they won't match). So we match them here.
    if (Constraint.size() > 3 && Constraint[1] == 'v' && Constraint[2] == 's') {
      int VSNum = atoi(nptr: Constraint.data() + 3);
      assert(VSNum >= 0 && VSNum <= 63 &&
             "Attempted to access a vsr out of range");
      if (VSNum < 32)
        return std::make_pair(x: PPC::VSL0 + VSNum, y: &PPC::VSRCRegClass);
      // vs32-vs63 overlap the V0-V31 registers.
      return std::make_pair(x: PPC::V0 + VSNum - 32, y: &PPC::VSRCRegClass);
    }

    // For float registers, we can't defer to the base class as it will match
    // the SPILLTOVSRRC class.
    if (Constraint.size() > 3 && Constraint[1] == 'f') {
      int RegNum = atoi(nptr: Constraint.data() + 2);
      if (RegNum > 31 || RegNum < 0)
        report_fatal_error(reason: "Invalid floating point register number");
      if (VT == MVT::f32 || VT == MVT::i32)
        return Subtarget.hasSPE()
                   ? std::make_pair(x: PPC::R0 + RegNum, y: &PPC::GPRCRegClass)
                   : std::make_pair(x: PPC::F0 + RegNum, y: &PPC::F4RCRegClass);
      if (VT == MVT::f64 || VT == MVT::i64)
        return Subtarget.hasSPE()
                   ? std::make_pair(x: PPC::S0 + RegNum, y: &PPC::SPERCRegClass)
                   : std::make_pair(x: PPC::F0 + RegNum, y: &PPC::F8RCRegClass);
    }
  }

  std::pair<unsigned, const TargetRegisterClass *> R =
      TargetLowering::getRegForInlineAsmConstraint(TRI, Constraint, VT);

  // r[0-9]+ are used, on PPC64, to refer to the corresponding 64-bit registers
  // (which we call X[0-9]+). If a 64-bit value has been requested, and a
  // 32-bit GPR has been selected, then 'upgrade' it to the 64-bit parent
  // register.
  // FIXME: If TargetLowering::getRegForInlineAsmConstraint could somehow use
  // the AsmName field from *RegisterInfo.td, then this would not be necessary.
  if (R.first && VT == MVT::i64 && Subtarget.isPPC64() &&
      PPC::GPRCRegClass.contains(Reg: R.first))
    return std::make_pair(x: TRI->getMatchingSuperReg(Reg: R.first,
                            SubIdx: PPC::sub_32, RC: &PPC::G8RCRegClass),
                          y: &PPC::G8RCRegClass);

  // GCC accepts 'cc' as an alias for 'cr0', and we need to do the same.
  if (!R.second && StringRef("{cc}").equals_insensitive(RHS: Constraint)) {
    R.first = PPC::CR0;
    R.second = &PPC::CRRCRegClass;
  }
  // FIXME: This warning should ideally be emitted in the front end.
  // NOTE(review): the check below covers V20-V31/VF20-VF31, but the message
  // says "20 to 32" — confirm whether the wording is intentional.
  const auto &TM = getTargetMachine();
  if (Subtarget.isAIXABI() && !TM.getAIXExtendedAltivecABI()) {
    if (((R.first >= PPC::V20 && R.first <= PPC::V31) ||
         (R.first >= PPC::VF20 && R.first <= PPC::VF31)) &&
        (R.second == &PPC::VSRCRegClass || R.second == &PPC::VSFRCRegClass))
      errs() << "warning: vector registers 20 to 32 are reserved in the "
                "default AIX AltiVec ABI and cannot be used\n";
  }

  return R;
}
18546
18547/// LowerAsmOperandForConstraint - Lower the specified operand into the Ops
18548/// vector. If it is invalid, don't add anything to Ops.
18549void PPCTargetLowering::LowerAsmOperandForConstraint(SDValue Op,
18550 StringRef Constraint,
18551 std::vector<SDValue> &Ops,
18552 SelectionDAG &DAG) const {
18553 SDValue Result;
18554
18555 // Only support length 1 constraints.
18556 if (Constraint.size() > 1)
18557 return;
18558
18559 char Letter = Constraint[0];
18560 switch (Letter) {
18561 default: break;
18562 case 'I':
18563 case 'J':
18564 case 'K':
18565 case 'L':
18566 case 'M':
18567 case 'N':
18568 case 'O':
18569 case 'P': {
18570 ConstantSDNode *CST = dyn_cast<ConstantSDNode>(Val&: Op);
18571 if (!CST) return; // Must be an immediate to match.
18572 SDLoc dl(Op);
18573 int64_t Value = CST->getSExtValue();
18574 EVT TCVT = MVT::i64; // All constants taken to be 64 bits so that negative
18575 // numbers are printed as such.
18576 switch (Letter) {
18577 default: llvm_unreachable("Unknown constraint letter!");
18578 case 'I': // "I" is a signed 16-bit constant.
18579 if (isInt<16>(x: Value))
18580 Result = DAG.getTargetConstant(Val: Value, DL: dl, VT: TCVT);
18581 break;
18582 case 'J': // "J" is a constant with only the high-order 16 bits nonzero.
18583 if (isShiftedUInt<16, 16>(x: Value))
18584 Result = DAG.getTargetConstant(Val: Value, DL: dl, VT: TCVT);
18585 break;
18586 case 'L': // "L" is a signed 16-bit constant shifted left 16 bits.
18587 if (isShiftedInt<16, 16>(x: Value))
18588 Result = DAG.getTargetConstant(Val: Value, DL: dl, VT: TCVT);
18589 break;
18590 case 'K': // "K" is a constant with only the low-order 16 bits nonzero.
18591 if (isUInt<16>(x: Value))
18592 Result = DAG.getTargetConstant(Val: Value, DL: dl, VT: TCVT);
18593 break;
18594 case 'M': // "M" is a constant that is greater than 31.
18595 if (Value > 31)
18596 Result = DAG.getTargetConstant(Val: Value, DL: dl, VT: TCVT);
18597 break;
18598 case 'N': // "N" is a positive constant that is an exact power of two.
18599 if (Value > 0 && isPowerOf2_64(Value))
18600 Result = DAG.getTargetConstant(Val: Value, DL: dl, VT: TCVT);
18601 break;
18602 case 'O': // "O" is the constant zero.
18603 if (Value == 0)
18604 Result = DAG.getTargetConstant(Val: Value, DL: dl, VT: TCVT);
18605 break;
18606 case 'P': // "P" is a constant whose negation is a signed 16-bit constant.
18607 if (isInt<16>(x: -Value))
18608 Result = DAG.getTargetConstant(Val: Value, DL: dl, VT: TCVT);
18609 break;
18610 }
18611 break;
18612 }
18613 }
18614
18615 if (Result.getNode()) {
18616 Ops.push_back(x: Result);
18617 return;
18618 }
18619
18620 // Handle standard constraint letters.
18621 TargetLowering::LowerAsmOperandForConstraint(Op, Constraint, Ops, DAG);
18622}
18623
18624void PPCTargetLowering::CollectTargetIntrinsicOperands(const CallInst &I,
18625 SmallVectorImpl<SDValue> &Ops,
18626 SelectionDAG &DAG) const {
18627 if (I.getNumOperands() <= 1)
18628 return;
18629 if (!isa<ConstantSDNode>(Val: Ops[1].getNode()))
18630 return;
18631 auto IntrinsicID = Ops[1].getNode()->getAsZExtVal();
18632 if (IntrinsicID != Intrinsic::ppc_tdw && IntrinsicID != Intrinsic::ppc_tw &&
18633 IntrinsicID != Intrinsic::ppc_trapd && IntrinsicID != Intrinsic::ppc_trap)
18634 return;
18635
18636 if (MDNode *MDN = I.getMetadata(KindID: LLVMContext::MD_annotation))
18637 Ops.push_back(Elt: DAG.getMDNode(MD: MDN));
18638}
18639
18640// isLegalAddressingMode - Return true if the addressing mode represented
18641// by AM is legal for this target, for a load/store of the specified type.
18642bool PPCTargetLowering::isLegalAddressingMode(const DataLayout &DL,
18643 const AddrMode &AM, Type *Ty,
18644 unsigned AS,
18645 Instruction *I) const {
18646 // Vector type r+i form is supported since power9 as DQ form. We don't check
18647 // the offset matching DQ form requirement(off % 16 == 0), because on PowerPC,
18648 // imm form is preferred and the offset can be adjusted to use imm form later
18649 // in pass PPCLoopInstrFormPrep. Also in LSR, for one LSRUse, it uses min and
18650 // max offset to check legal addressing mode, we should be a little aggressive
18651 // to contain other offsets for that LSRUse.
18652 if (Ty->isVectorTy() && AM.BaseOffs != 0 && !Subtarget.hasP9Vector())
18653 return false;
18654
18655 // PPC allows a sign-extended 16-bit immediate field.
18656 if (AM.BaseOffs <= -(1LL << 16) || AM.BaseOffs >= (1LL << 16)-1)
18657 return false;
18658
18659 // No global is ever allowed as a base.
18660 if (AM.BaseGV)
18661 return false;
18662
18663 // PPC only support r+r,
18664 switch (AM.Scale) {
18665 case 0: // "r+i" or just "i", depending on HasBaseReg.
18666 break;
18667 case 1:
18668 if (AM.HasBaseReg && AM.BaseOffs) // "r+r+i" is not allowed.
18669 return false;
18670 // Otherwise we have r+r or r+i.
18671 break;
18672 case 2:
18673 if (AM.HasBaseReg || AM.BaseOffs) // 2*r+r or 2*r+i is not allowed.
18674 return false;
18675 // Allow 2*r as r+r.
18676 break;
18677 default:
18678 // No other scales are supported.
18679 return false;
18680 }
18681
18682 return true;
18683}
18684
18685SDValue PPCTargetLowering::LowerRETURNADDR(SDValue Op,
18686 SelectionDAG &DAG) const {
18687 MachineFunction &MF = DAG.getMachineFunction();
18688 MachineFrameInfo &MFI = MF.getFrameInfo();
18689 MFI.setReturnAddressIsTaken(true);
18690
18691 SDLoc dl(Op);
18692 unsigned Depth = Op.getConstantOperandVal(i: 0);
18693
18694 // Make sure the function does not optimize away the store of the RA to
18695 // the stack.
18696 PPCFunctionInfo *FuncInfo = MF.getInfo<PPCFunctionInfo>();
18697 FuncInfo->setLRStoreRequired();
18698 auto PtrVT = getPointerTy(DL: MF.getDataLayout());
18699
18700 if (Depth > 0) {
18701 // The link register (return address) is saved in the caller's frame
18702 // not the callee's stack frame. So we must get the caller's frame
18703 // address and load the return address at the LR offset from there.
18704 SDValue FrameAddr =
18705 DAG.getLoad(VT: Op.getValueType(), dl, Chain: DAG.getEntryNode(),
18706 Ptr: LowerFRAMEADDR(Op, DAG), PtrInfo: MachinePointerInfo());
18707 SDValue Offset =
18708 DAG.getConstant(Val: Subtarget.getFrameLowering()->getReturnSaveOffset(), DL: dl,
18709 VT: Subtarget.getScalarIntVT());
18710 return DAG.getLoad(VT: PtrVT, dl, Chain: DAG.getEntryNode(),
18711 Ptr: DAG.getNode(Opcode: ISD::ADD, DL: dl, VT: PtrVT, N1: FrameAddr, N2: Offset),
18712 PtrInfo: MachinePointerInfo());
18713 }
18714
18715 // Just load the return address off the stack.
18716 SDValue RetAddrFI = getReturnAddrFrameIndex(DAG);
18717 return DAG.getLoad(VT: PtrVT, dl, Chain: DAG.getEntryNode(), Ptr: RetAddrFI,
18718 PtrInfo: MachinePointerInfo());
18719}
18720
18721SDValue PPCTargetLowering::LowerFRAMEADDR(SDValue Op,
18722 SelectionDAG &DAG) const {
18723 SDLoc dl(Op);
18724 unsigned Depth = Op.getConstantOperandVal(i: 0);
18725
18726 MachineFunction &MF = DAG.getMachineFunction();
18727 MachineFrameInfo &MFI = MF.getFrameInfo();
18728 MFI.setFrameAddressIsTaken(true);
18729
18730 EVT PtrVT = getPointerTy(DL: MF.getDataLayout());
18731 bool isPPC64 = PtrVT == MVT::i64;
18732
18733 // Naked functions never have a frame pointer, and so we use r1. For all
18734 // other functions, this decision must be delayed until during PEI.
18735 unsigned FrameReg;
18736 if (MF.getFunction().hasFnAttribute(Kind: Attribute::Naked))
18737 FrameReg = isPPC64 ? PPC::X1 : PPC::R1;
18738 else
18739 FrameReg = isPPC64 ? PPC::FP8 : PPC::FP;
18740
18741 SDValue FrameAddr = DAG.getCopyFromReg(Chain: DAG.getEntryNode(), dl, Reg: FrameReg,
18742 VT: PtrVT);
18743 while (Depth--)
18744 FrameAddr = DAG.getLoad(VT: Op.getValueType(), dl, Chain: DAG.getEntryNode(),
18745 Ptr: FrameAddr, PtrInfo: MachinePointerInfo());
18746 return FrameAddr;
18747}
18748
18749#define GET_REGISTER_MATCHER
18750#include "PPCGenAsmMatcher.inc"
18751
18752Register PPCTargetLowering::getRegisterByName(const char *RegName, LLT VT,
18753 const MachineFunction &MF) const {
18754 bool IsPPC64 = Subtarget.isPPC64();
18755
18756 bool Is64Bit = IsPPC64 && VT == LLT::scalar(SizeInBits: 64);
18757 if (!Is64Bit && VT != LLT::scalar(SizeInBits: 32))
18758 report_fatal_error(reason: "Invalid register global variable type");
18759
18760 Register Reg = MatchRegisterName(Name: RegName);
18761 if (!Reg)
18762 return Reg;
18763
18764 // FIXME: Unable to generate code for `-O2` but okay for `-O0`.
18765 // Need followup investigation as to why.
18766 if ((IsPPC64 && Reg == PPC::R2) || Reg == PPC::R0)
18767 report_fatal_error(reason: Twine("Trying to reserve an invalid register \"" +
18768 StringRef(RegName) + "\"."));
18769
18770 // Convert GPR to GP8R register for 64bit.
18771 if (Is64Bit && StringRef(RegName).starts_with_insensitive(Prefix: "r"))
18772 Reg = Reg.id() - PPC::R0 + PPC::X0;
18773
18774 return Reg;
18775}
18776
18777bool PPCTargetLowering::isAccessedAsGotIndirect(SDValue GA) const {
18778 // 32-bit SVR4 ABI access everything as got-indirect.
18779 if (Subtarget.is32BitELFABI())
18780 return true;
18781
18782 // AIX accesses everything indirectly through the TOC, which is similar to
18783 // the GOT.
18784 if (Subtarget.isAIXABI())
18785 return true;
18786
18787 CodeModel::Model CModel = getTargetMachine().getCodeModel();
18788 // If it is small or large code model, module locals are accessed
18789 // indirectly by loading their address from .toc/.got.
18790 if (CModel == CodeModel::Small || CModel == CodeModel::Large)
18791 return true;
18792
18793 // JumpTable and BlockAddress are accessed as got-indirect.
18794 if (isa<JumpTableSDNode>(Val: GA) || isa<BlockAddressSDNode>(Val: GA))
18795 return true;
18796
18797 if (GlobalAddressSDNode *G = dyn_cast<GlobalAddressSDNode>(Val&: GA))
18798 return Subtarget.isGVIndirectSymbol(GV: G->getGlobal());
18799
18800 return false;
18801}
18802
/// Return false: folding a constant offset into a global address is never
/// legal on PowerPC.
bool
PPCTargetLowering::isOffsetFoldingLegal(const GlobalAddressSDNode *GA) const {
  // The PowerPC target isn't yet aware of offsets.
  return false;
}
18808
/// Describe the memory behavior of memory-touching PPC intrinsics so the
/// SelectionDAG builder can attach correct MachineMemOperands (opcode,
/// memory VT, pointer operand, offset/size, alignment, and access flags).
/// Intrinsics not listed here get no entry.
void PPCTargetLowering::getTgtMemIntrinsic(
    SmallVectorImpl<IntrinsicInfo> &Infos, const CallBase &I,
    MachineFunction &MF, unsigned Intrinsic) const {
  IntrinsicInfo Info;
  switch (Intrinsic) {
  // Quadword atomic RMW / cmpxchg: both load and store, volatile,
  // 16-byte aligned.
  case Intrinsic::ppc_atomicrmw_xchg_i128:
  case Intrinsic::ppc_atomicrmw_add_i128:
  case Intrinsic::ppc_atomicrmw_sub_i128:
  case Intrinsic::ppc_atomicrmw_nand_i128:
  case Intrinsic::ppc_atomicrmw_and_i128:
  case Intrinsic::ppc_atomicrmw_or_i128:
  case Intrinsic::ppc_atomicrmw_xor_i128:
  case Intrinsic::ppc_cmpxchg_i128:
    Info.opc = ISD::INTRINSIC_W_CHAIN;
    Info.memVT = MVT::i128;
    Info.ptrVal = I.getArgOperand(i: 0);
    Info.offset = 0;
    Info.align = Align(16);
    Info.flags = MachineMemOperand::MOLoad | MachineMemOperand::MOStore |
                 MachineMemOperand::MOVolatile;
    Infos.push_back(Elt: Info);
    return;
  // Quadword atomic load: volatile load only.
  case Intrinsic::ppc_atomic_load_i128:
    Info.opc = ISD::INTRINSIC_W_CHAIN;
    Info.memVT = MVT::i128;
    Info.ptrVal = I.getArgOperand(i: 0);
    Info.offset = 0;
    Info.align = Align(16);
    Info.flags = MachineMemOperand::MOLoad | MachineMemOperand::MOVolatile;
    Infos.push_back(Elt: Info);
    return;
  // Quadword atomic store: volatile store only; the pointer is argument 2.
  case Intrinsic::ppc_atomic_store_i128:
    Info.opc = ISD::INTRINSIC_VOID;
    Info.memVT = MVT::i128;
    Info.ptrVal = I.getArgOperand(i: 2);
    Info.offset = 0;
    Info.align = Align(16);
    Info.flags = MachineMemOperand::MOStore | MachineMemOperand::MOVolatile;
    Infos.push_back(Elt: Info);
    return;
  // Altivec/VSX vector and element loads.
  case Intrinsic::ppc_altivec_lvx:
  case Intrinsic::ppc_altivec_lvxl:
  case Intrinsic::ppc_altivec_lvebx:
  case Intrinsic::ppc_altivec_lvehx:
  case Intrinsic::ppc_altivec_lvewx:
  case Intrinsic::ppc_vsx_lxvd2x:
  case Intrinsic::ppc_vsx_lxvw4x:
  case Intrinsic::ppc_vsx_lxvd2x_be:
  case Intrinsic::ppc_vsx_lxvw4x_be:
  case Intrinsic::ppc_vsx_lxvl:
  case Intrinsic::ppc_vsx_lxvll: {
    // Pick the element/vector type actually read by each variant.
    EVT VT;
    switch (Intrinsic) {
    case Intrinsic::ppc_altivec_lvebx:
      VT = MVT::i8;
      break;
    case Intrinsic::ppc_altivec_lvehx:
      VT = MVT::i16;
      break;
    case Intrinsic::ppc_altivec_lvewx:
      VT = MVT::i32;
      break;
    case Intrinsic::ppc_vsx_lxvd2x:
    case Intrinsic::ppc_vsx_lxvd2x_be:
      VT = MVT::v2f64;
      break;
    default:
      VT = MVT::v4i32;
      break;
    }

    Info.opc = ISD::INTRINSIC_W_CHAIN;
    Info.memVT = VT;
    Info.ptrVal = I.getArgOperand(i: 0);
    // The access may begin before the given pointer (the hardware aligns
    // the address), so describe a conservative byte window of
    // [-(size-1), +(size-1)] around it.
    Info.offset = -VT.getStoreSize()+1;
    Info.size = 2*VT.getStoreSize()-1;
    Info.align = Align(1);
    Info.flags = MachineMemOperand::MOLoad;
    Infos.push_back(Elt: Info);
    return;
  }
  // Altivec/VSX vector and element stores; mirror of the load cases above,
  // but the pointer is argument 1 (argument 0 is the value to store).
  case Intrinsic::ppc_altivec_stvx:
  case Intrinsic::ppc_altivec_stvxl:
  case Intrinsic::ppc_altivec_stvebx:
  case Intrinsic::ppc_altivec_stvehx:
  case Intrinsic::ppc_altivec_stvewx:
  case Intrinsic::ppc_vsx_stxvd2x:
  case Intrinsic::ppc_vsx_stxvw4x:
  case Intrinsic::ppc_vsx_stxvd2x_be:
  case Intrinsic::ppc_vsx_stxvw4x_be:
  case Intrinsic::ppc_vsx_stxvl:
  case Intrinsic::ppc_vsx_stxvll: {
    EVT VT;
    switch (Intrinsic) {
    case Intrinsic::ppc_altivec_stvebx:
      VT = MVT::i8;
      break;
    case Intrinsic::ppc_altivec_stvehx:
      VT = MVT::i16;
      break;
    case Intrinsic::ppc_altivec_stvewx:
      VT = MVT::i32;
      break;
    case Intrinsic::ppc_vsx_stxvd2x:
    case Intrinsic::ppc_vsx_stxvd2x_be:
      VT = MVT::v2f64;
      break;
    default:
      VT = MVT::v4i32;
      break;
    }

    Info.opc = ISD::INTRINSIC_VOID;
    Info.memVT = VT;
    Info.ptrVal = I.getArgOperand(i: 1);
    // Conservative window around the pointer, as for the loads above.
    Info.offset = -VT.getStoreSize()+1;
    Info.size = 2*VT.getStoreSize()-1;
    Info.align = Align(1);
    Info.flags = MachineMemOperand::MOStore;
    Infos.push_back(Elt: Info);
    return;
  }
  // Store-conditional intrinsics: volatile stores, naturally aligned.
  case Intrinsic::ppc_stdcx:
  case Intrinsic::ppc_stwcx:
  case Intrinsic::ppc_sthcx:
  case Intrinsic::ppc_stbcx: {
    EVT VT;
    auto Alignment = Align(8);
    switch (Intrinsic) {
    case Intrinsic::ppc_stdcx:
      VT = MVT::i64;
      break;
    case Intrinsic::ppc_stwcx:
      VT = MVT::i32;
      Alignment = Align(4);
      break;
    case Intrinsic::ppc_sthcx:
      VT = MVT::i16;
      Alignment = Align(2);
      break;
    case Intrinsic::ppc_stbcx:
      VT = MVT::i8;
      Alignment = Align(1);
      break;
    }
    Info.opc = ISD::INTRINSIC_W_CHAIN;
    Info.memVT = VT;
    Info.ptrVal = I.getArgOperand(i: 0);
    Info.offset = 0;
    Info.align = Alignment;
    Info.flags = MachineMemOperand::MOStore | MachineMemOperand::MOVolatile;
    Infos.push_back(Elt: Info);
    return;
  }
  default:
    break;
  }
}
18967
18968/// It returns EVT::Other if the type should be determined using generic
18969/// target-independent logic.
18970EVT PPCTargetLowering::getOptimalMemOpType(
18971 LLVMContext &Context, const MemOp &Op,
18972 const AttributeList &FuncAttributes) const {
18973 if (getTargetMachine().getOptLevel() != CodeGenOptLevel::None) {
18974 // We should use Altivec/VSX loads and stores when available. For unaligned
18975 // addresses, unaligned VSX loads are only fast starting with the P8.
18976 if (Subtarget.hasAltivec() && Op.size() >= 16) {
18977 if (Op.isMemset() && Subtarget.hasVSX()) {
18978 uint64_t TailSize = Op.size() % 16;
18979 // For memset lowering, EXTRACT_VECTOR_ELT tries to return constant
18980 // element if vector element type matches tail store. For tail size
18981 // 3/4, the tail store is i32, v4i32 cannot be used, need a legal one.
18982 if (TailSize > 2 && TailSize <= 4) {
18983 return MVT::v8i16;
18984 }
18985 return MVT::v4i32;
18986 }
18987 if (Op.isAligned(AlignCheck: Align(16)) || Subtarget.hasP8Vector())
18988 return MVT::v4i32;
18989 }
18990 }
18991
18992 if (Subtarget.isPPC64()) {
18993 return MVT::i64;
18994 }
18995
18996 return MVT::i32;
18997}
18998
18999/// Returns true if it is beneficial to convert a load of a constant
19000/// to just the constant itself.
19001bool PPCTargetLowering::shouldConvertConstantLoadToIntImm(const APInt &Imm,
19002 Type *Ty) const {
19003 assert(Ty->isIntegerTy());
19004
19005 unsigned BitSize = Ty->getPrimitiveSizeInBits();
19006 return !(BitSize == 0 || BitSize > 64);
19007}
19008
19009bool PPCTargetLowering::isTruncateFree(Type *Ty1, Type *Ty2) const {
19010 if (!Ty1->isIntegerTy() || !Ty2->isIntegerTy())
19011 return false;
19012 unsigned NumBits1 = Ty1->getPrimitiveSizeInBits();
19013 unsigned NumBits2 = Ty2->getPrimitiveSizeInBits();
19014 return NumBits1 == 64 && NumBits2 == 32;
19015}
19016
19017bool PPCTargetLowering::isTruncateFree(EVT VT1, EVT VT2) const {
19018 if (!VT1.isInteger() || !VT2.isInteger())
19019 return false;
19020 unsigned NumBits1 = VT1.getSizeInBits();
19021 unsigned NumBits2 = VT2.getSizeInBits();
19022 return NumBits1 == 64 && NumBits2 == 32;
19023}
19024
19025bool PPCTargetLowering::isZExtFree(SDValue Val, EVT VT2) const {
19026 // Generally speaking, zexts are not free, but they are free when they can be
19027 // folded with other operations.
19028 if (LoadSDNode *LD = dyn_cast<LoadSDNode>(Val)) {
19029 EVT MemVT = LD->getMemoryVT();
19030 if ((MemVT == MVT::i1 || MemVT == MVT::i8 || MemVT == MVT::i16 ||
19031 (Subtarget.isPPC64() && MemVT == MVT::i32)) &&
19032 (LD->getExtensionType() == ISD::NON_EXTLOAD ||
19033 LD->getExtensionType() == ISD::ZEXTLOAD))
19034 return true;
19035 }
19036
19037 // FIXME: Add other cases...
19038 // - 32-bit shifts with a zext to i64
19039 // - zext after ctlz, bswap, etc.
19040 // - zext after and by a constant mask
19041
19042 return TargetLowering::isZExtFree(Val, VT2);
19043}
19044
19045bool PPCTargetLowering::isFPExtFree(EVT DestVT, EVT SrcVT) const {
19046 assert(DestVT.isFloatingPoint() && SrcVT.isFloatingPoint() &&
19047 "invalid fpext types");
19048 // Extending to float128 is not free.
19049 if (DestVT == MVT::f128)
19050 return false;
19051 return true;
19052}
19053
/// A compare immediate is legal if it fits a 16-bit field, either signed
/// (cmpi-style) or unsigned (cmpli-style).
bool PPCTargetLowering::isLegalICmpImmediate(int64_t Imm) const {
  return isInt<16>(x: Imm) || isUInt<16>(x: Imm);
}
19057
/// An add immediate is legal if it is representable in 16 bits, signed or
/// unsigned.
bool PPCTargetLowering::isLegalAddImmediate(int64_t Imm) const {
  return isInt<16>(x: Imm) || isUInt<16>(x: Imm);
}
19061
19062bool PPCTargetLowering::allowsMisalignedMemoryAccesses(EVT VT, unsigned, Align,
19063 MachineMemOperand::Flags,
19064 unsigned *Fast) const {
19065 if (DisablePPCUnaligned)
19066 return false;
19067
19068 // PowerPC supports unaligned memory access for simple non-vector types.
19069 // Although accessing unaligned addresses is not as efficient as accessing
19070 // aligned addresses, it is generally more efficient than manual expansion,
19071 // and generally only traps for software emulation when crossing page
19072 // boundaries.
19073
19074 if (!VT.isSimple())
19075 return false;
19076
19077 if (VT.isFloatingPoint() && !VT.isVector() &&
19078 !Subtarget.allowsUnalignedFPAccess())
19079 return false;
19080
19081 if (VT.getSimpleVT().isVector()) {
19082 if (Subtarget.hasVSX()) {
19083 if (VT != MVT::v2f64 && VT != MVT::v2i64 &&
19084 VT != MVT::v4f32 && VT != MVT::v4i32)
19085 return false;
19086 } else {
19087 return false;
19088 }
19089 }
19090
19091 if (VT == MVT::ppcf128)
19092 return false;
19093
19094 if (Fast)
19095 *Fast = 1;
19096
19097 return true;
19098}
19099
/// Return true when a multiply by constant C should be decomposed into
/// shift/add/sub sequences rather than emitted as a multiply.
bool PPCTargetLowering::decomposeMulByConstant(LLVMContext &Context, EVT VT,
                                               SDValue C) const {
  // Check integral scalar types.
  if (!VT.isScalarInteger())
    return false;
  if (auto *ConstNode = dyn_cast<ConstantSDNode>(Val: C.getNode())) {
    if (!ConstNode->getAPIntValue().isSignedIntN(N: 64))
      return false;
    // This transformation will generate >= 2 operations. But the following
    // cases will generate <= 2 instructions during ISEL. So exclude them.
    // 1. If the constant multiplier fits 16 bits, it can be handled by one
    // HW instruction, ie. MULLI
    // 2. If the multiplier after shifted fits 16 bits, an extra shift
    // instruction is needed than case 1, ie. MULLI and RLDICR
    int64_t Imm = ConstNode->getSExtValue();
    // NOTE(review): if Imm == 0, countr_zero yields 64 and the shift below
    // would be out of range — presumably multiplies by zero are folded
    // before this is queried; confirm.
    unsigned Shift = llvm::countr_zero<uint64_t>(Val: Imm);
    Imm >>= Shift;
    if (isInt<16>(x: Imm))
      return false;
    // Decompose only when the shifted multiplier is within one of a power
    // of two (positive or negative), i.e. the mul becomes shift +/- add.
    uint64_t UImm = static_cast<uint64_t>(Imm);
    if (isPowerOf2_64(Value: UImm + 1) || isPowerOf2_64(Value: UImm - 1) ||
        isPowerOf2_64(Value: 1 - UImm) || isPowerOf2_64(Value: -1 - UImm))
      return true;
  }
  return false;
}
19126
/// EVT overload: delegate to the Type-based overload, converting VT through
/// this function's LLVMContext.
bool PPCTargetLowering::isFMAFasterThanFMulAndFAdd(const MachineFunction &MF,
                                                   EVT VT) const {
  return isFMAFasterThanFMulAndFAdd(
      F: MF.getFunction(), Ty: VT.getTypeForEVT(Context&: MF.getFunction().getContext()));
}
19132
19133bool PPCTargetLowering::isFMAFasterThanFMulAndFAdd(const Function &F,
19134 Type *Ty) const {
19135 if (Subtarget.hasSPE() || Subtarget.useSoftFloat())
19136 return false;
19137 switch (Ty->getScalarType()->getTypeID()) {
19138 case Type::FloatTyID:
19139 case Type::DoubleTyID:
19140 return true;
19141 case Type::FP128TyID:
19142 return Subtarget.hasP9Vector();
19143 default:
19144 return false;
19145 }
19146}
19147
// FIXME: add more patterns which are not profitable to hoist.
/// Returns false only when hoisting I would break a pattern we prefer to keep
/// together in one block: an fmul that can fuse into an FMA, or a float load
/// feeding a store. Everything else is considered profitable to hoist.
bool PPCTargetLowering::isProfitableToHoist(Instruction *I) const {
  // A multi-use instruction cannot be folded into a single user, so hoisting
  // it cannot break a fusion opportunity.
  if (!I->hasOneUse())
    return true;

  Instruction *User = I->user_back();
  assert(User && "A single use instruction with no uses.");

  switch (I->getOpcode()) {
  case Instruction::FMul: {
    // Don't break FMA, PowerPC prefers FMA.
    if (User->getOpcode() != Instruction::FSub &&
        User->getOpcode() != Instruction::FAdd)
      return true;

    const TargetOptions &Options = getTargetMachine().Options;
    const Function *F = I->getFunction();
    const DataLayout &DL = F->getDataLayout();
    Type *Ty = User->getOperand(i: 0)->getType();
    // Contraction must be permitted on both the mul and the add/sub for an
    // FMA to legally form.
    bool AllowContract = I->getFastMathFlags().allowContract() &&
                         User->getFastMathFlags().allowContract();

    // Hoisting is unprofitable exactly when an FMA would otherwise form:
    // FMA is fast here, legal for this type, and contraction is allowed.
    return !(isFMAFasterThanFMulAndFAdd(F: *F, Ty) &&
             isOperationLegalOrCustom(Op: ISD::FMA, VT: getValueType(DL, Ty)) &&
             (AllowContract || Options.AllowFPOpFusion == FPOpFusion::Fast));
  }
  case Instruction::Load: {
    // Don't break "store (load float*)" pattern, this pattern will be combined
    // to "store (load int32)" in later InstCombine pass. See function
    // combineLoadToOperationType. On PowerPC, loading a float point takes more
    // cycles than loading a 32 bit integer.
    LoadInst *LI = cast<LoadInst>(Val: I);
    // For the loads that combineLoadToOperationType does nothing, like
    // ordered load, it should be profitable to hoist them.
    // For swifterror load, it can only be used for pointer to pointer type, so
    // later type check should get rid of this case.
    if (!LI->isUnordered())
      return true;

    if (User->getOpcode() != Instruction::Store)
      return true;

    if (I->getType()->getTypeID() != Type::FloatTyID)
      return true;

    return false;
  }
  default:
    return true;
  }
  // Unreachable: every switch path returns. Kept for compilers that do not
  // see the switch as exhaustive.
  return true;
}
19200
19201const MCPhysReg *
19202PPCTargetLowering::getScratchRegisters(CallingConv::ID) const {
19203 // LR is a callee-save register, but we must treat it as clobbered by any call
19204 // site. Hence we include LR in the scratch registers, which are in turn added
19205 // as implicit-defs for stackmaps and patchpoints. The same reasoning applies
19206 // to CTR, which is used by any indirect call.
19207 static const MCPhysReg ScratchRegs[] = {
19208 PPC::X12, PPC::LR8, PPC::CTR8, 0
19209 };
19210
19211 return ScratchRegs;
19212}
19213
19214Register PPCTargetLowering::getExceptionPointerRegister(
19215 const Constant *PersonalityFn) const {
19216 return Subtarget.isPPC64() ? PPC::X3 : PPC::R3;
19217}
19218
19219Register PPCTargetLowering::getExceptionSelectorRegister(
19220 const Constant *PersonalityFn) const {
19221 return Subtarget.isPPC64() ? PPC::X4 : PPC::R4;
19222}
19223
19224bool
19225PPCTargetLowering::shouldExpandBuildVectorWithShuffles(
19226 EVT VT , unsigned DefinedValues) const {
19227 if (VT == MVT::v2i64)
19228 return Subtarget.hasDirectMove(); // Don't need stack ops with direct moves
19229
19230 if (Subtarget.hasVSX())
19231 return true;
19232
19233 return TargetLowering::shouldExpandBuildVectorWithShuffles(VT, DefinedValues);
19234}
19235
19236Sched::Preference PPCTargetLowering::getSchedulingPreference(SDNode *N) const {
19237 if (DisableILPPref || Subtarget.enableMachineScheduler())
19238 return TargetLowering::getSchedulingPreference(N);
19239
19240 return Sched::ILP;
19241}
19242
19243// Create a fast isel object.
19244FastISel *PPCTargetLowering::createFastISel(
19245 FunctionLoweringInfo &FuncInfo, const TargetLibraryInfo *LibInfo,
19246 const LibcallLoweringInfo *LibcallLowering) const {
19247 return PPC::createFastISel(FuncInfo, LibInfo, LibcallLowering);
19248}
19249
19250// 'Inverted' means the FMA opcode after negating one multiplicand.
19251// For example, (fma -a b c) = (fnmsub a b c)
19252static unsigned invertFMAOpcode(unsigned Opc) {
19253 switch (Opc) {
19254 default:
19255 llvm_unreachable("Invalid FMA opcode for PowerPC!");
19256 case ISD::FMA:
19257 return PPCISD::FNMSUB;
19258 case PPCISD::FNMSUB:
19259 return ISD::FMA;
19260 }
19261}
19262
/// PPC-specific negation folding. For FNMSUB, (fneg (fnmsub a b c)) can be
/// re-expressed as another fnmsub or as an fma, so an explicit fneg can often
/// be avoided. Falls back to the generic TargetLowering implementation for
/// every other opcode.
SDValue PPCTargetLowering::getNegatedExpression(SDValue Op, SelectionDAG &DAG,
                                                bool LegalOps, bool OptForSize,
                                                NegatibleCost &Cost,
                                                unsigned Depth) const {
  if (Depth > SelectionDAG::MaxRecursionDepth)
    return SDValue();

  unsigned Opc = Op.getOpcode();
  EVT VT = Op.getValueType();
  SDNodeFlags Flags = Op.getNode()->getFlags();

  switch (Opc) {
  case PPCISD::FNMSUB:
    if (!Op.hasOneUse() || !isTypeLegal(VT))
      break;

    const TargetOptions &Options = getTargetMachine().Options;
    SDValue N0 = Op.getOperand(i: 0);
    SDValue N1 = Op.getOperand(i: 1);
    SDValue N2 = Op.getOperand(i: 2);
    SDLoc Loc(Op);

    // Every form below needs the addend negated; if that is not possible,
    // nothing here applies.
    NegatibleCost N2Cost = NegatibleCost::Expensive;
    SDValue NegN2 =
        getNegatedExpression(Op: N2, DAG, LegalOps, OptForSize, Cost&: N2Cost, Depth: Depth + 1);

    if (!NegN2)
      return SDValue();

    // (fneg (fnmsub a b c)) => (fnmsub (fneg a) b (fneg c))
    // (fneg (fnmsub a b c)) => (fnmsub a (fneg b) (fneg c))
    // These transformations may change sign of zeroes. For example,
    // -(-ab-(-c))=-0 while -(-(ab-c))=+0 when a=b=c=1.
    if (Flags.hasNoSignedZeros() || Options.NoSignedZerosFPMath) {
      // Try and choose the cheaper one to negate.
      NegatibleCost N0Cost = NegatibleCost::Expensive;
      SDValue NegN0 = getNegatedExpression(Op: N0, DAG, LegalOps, OptForSize,
                                           Cost&: N0Cost, Depth: Depth + 1);

      NegatibleCost N1Cost = NegatibleCost::Expensive;
      SDValue NegN1 = getNegatedExpression(Op: N1, DAG, LegalOps, OptForSize,
                                           Cost&: N1Cost, Depth: Depth + 1);

      // Report the cheaper of the multiplicand cost and the addend cost as
      // the overall negation cost.
      if (NegN0 && N0Cost <= N1Cost) {
        Cost = std::min(a: N0Cost, b: N2Cost);
        return DAG.getNode(Opcode: Opc, DL: Loc, VT, N1: NegN0, N2: N1, N3: NegN2, Flags);
      } else if (NegN1) {
        Cost = std::min(a: N1Cost, b: N2Cost);
        return DAG.getNode(Opcode: Opc, DL: Loc, VT, N1: N0, N2: NegN1, N3: NegN2, Flags);
      }
    }

    // (fneg (fnmsub a b c)) => (fma a b (fneg c))
    if (isOperationLegal(Op: ISD::FMA, VT)) {
      Cost = N2Cost;
      return DAG.getNode(Opcode: ISD::FMA, DL: Loc, VT, N1: N0, N2: N1, N3: NegN2, Flags);
    }

    break;
  }

  return TargetLowering::getNegatedExpression(Op, DAG, LegalOps, OptForSize,
                                              Cost, Depth);
}
19327
19328// Override to enable LOAD_STACK_GUARD lowering on Linux.
19329bool PPCTargetLowering::useLoadStackGuardNode(const Module &M) const {
19330 if (M.getStackProtectorGuard() == "tls" || Subtarget.isTargetLinux())
19331 return true;
19332 return TargetLowering::useLoadStackGuardNode(M);
19333}
19334
/// Return true if this FP immediate can be materialized cheaply (i.e. without
/// a constant-pool load). Requires VSX; which values qualify depends on the
/// type and on P10 prefixed-instruction support.
bool PPCTargetLowering::isFPImmLegal(const APFloat &Imm, EVT VT,
                                     bool ForCodeSize) const {
  if (!VT.isSimple() || !Subtarget.hasVSX())
    return false;

  switch(VT.getSimpleVT().SimpleTy) {
  default:
    // For FP types that are currently not supported by PPC backend, return
    // false. Examples: f16, f80.
    return false;
  case MVT::f32:
  case MVT::f64: {
    if (Subtarget.hasPrefixInstrs() && Subtarget.hasP10Vector()) {
      // We can materialize all immediates via XXSPLTI32DX and XXSPLTIDP.
      return true;
    }
    bool IsExact;
    // 16 bits is enough to hold any value in [-16, 15] exactly.
    APSInt IntResult(16, false);
    // The rounding mode doesn't really matter because we only care about floats
    // that can be converted to integers exactly.
    Imm.convertToInteger(Result&: IntResult, RM: APFloat::rmTowardZero, IsExact: &IsExact);
    // For exact values in the range [-16, 15] we can materialize the float.
    if (IsExact && IntResult <= 15 && IntResult >= -16)
      return true;
    // Zero can always be materialized cheaply.
    return Imm.isZero();
  }
  case MVT::ppcf128:
    return Imm.isPosZero();
  }
}
19365
19366// For vector shift operation op, fold
19367// (op x, (and y, ((1 << numbits(x)) - 1))) -> (target op x, y)
19368static SDValue stripModuloOnShift(const TargetLowering &TLI, SDNode *N,
19369 SelectionDAG &DAG) {
19370 SDValue N0 = N->getOperand(Num: 0);
19371 SDValue N1 = N->getOperand(Num: 1);
19372 EVT VT = N0.getValueType();
19373 unsigned OpSizeInBits = VT.getScalarSizeInBits();
19374 unsigned Opcode = N->getOpcode();
19375 unsigned TargetOpcode;
19376
19377 switch (Opcode) {
19378 default:
19379 llvm_unreachable("Unexpected shift operation");
19380 case ISD::SHL:
19381 TargetOpcode = PPCISD::SHL;
19382 break;
19383 case ISD::SRL:
19384 TargetOpcode = PPCISD::SRL;
19385 break;
19386 case ISD::SRA:
19387 TargetOpcode = PPCISD::SRA;
19388 break;
19389 }
19390
19391 if (VT.isVector() && TLI.isOperationLegal(Op: Opcode, VT) &&
19392 N1->getOpcode() == ISD::AND)
19393 if (ConstantSDNode *Mask = isConstOrConstSplat(N: N1->getOperand(Num: 1)))
19394 if (Mask->getZExtValue() == OpSizeInBits - 1)
19395 return DAG.getNode(Opcode: TargetOpcode, DL: SDLoc(N), VT, N1: N0, N2: N1->getOperand(Num: 0));
19396
19397 return SDValue();
19398}
19399
/// Combine vector shifts whose amount is a splatted constant:
/// - A splat of (EltBits - 1) is rewritten to the PPC target shift node with
///   an all-ones splat, which the hardware truncates to 31/63 itself.
/// - (shl x, splat 1) on v2i64 becomes (add x, x), caught before the
///   BUILD_VECTOR would be replaced by a constant-pool load.
SDValue PPCTargetLowering::combineVectorShift(SDNode *N,
                                              DAGCombinerInfo &DCI) const {
  EVT VT = N->getValueType(ResNo: 0);
  assert(VT.isVector() && "Vector type expected.");

  unsigned Opc = N->getOpcode();
  assert((Opc == ISD::SHL || Opc == ISD::SRL || Opc == ISD::SRA) &&
         "Unexpected opcode.");

  if (!isOperationLegal(Op: Opc, VT))
    return SDValue();

  // Only word and doubleword element types are handled.
  EVT EltTy = VT.getScalarType();
  unsigned EltBits = EltTy.getSizeInBits();
  if (EltTy != MVT::i64 && EltTy != MVT::i32)
    return SDValue();

  SDValue N1 = N->getOperand(Num: 1);
  uint64_t SplatBits = 0;
  bool AddSplatCase = false;
  unsigned OpcN1 = N1.getOpcode();
  // A VADD_SPLAT covering every element is also a constant splat.
  if (OpcN1 == PPCISD::VADD_SPLAT &&
      N1.getConstantOperandVal(i: 1) == VT.getVectorNumElements()) {
    AddSplatCase = true;
    SplatBits = N1.getConstantOperandVal(i: 0);
  }

  if (!AddSplatCase) {
    if (OpcN1 != ISD::BUILD_VECTOR)
      return SDValue();

    // Otherwise the amount must be a BUILD_VECTOR that splats a constant of
    // exactly the element width.
    unsigned SplatBitSize;
    bool HasAnyUndefs;
    APInt APSplatBits, APSplatUndef;
    BuildVectorSDNode *BVN = cast<BuildVectorSDNode>(Val&: N1);
    bool BVNIsConstantSplat =
        BVN->isConstantSplat(SplatValue&: APSplatBits, SplatUndef&: APSplatUndef, SplatBitSize,
                             HasAnyUndefs, MinSplatBits: 0, isBigEndian: !Subtarget.isLittleEndian());
    if (!BVNIsConstantSplat || SplatBitSize != EltBits)
      return SDValue();
    SplatBits = APSplatBits.getZExtValue();
  }

  SDLoc DL(N);
  SDValue N0 = N->getOperand(Num: 0);
  // PPC vector shifts by word/double look at only the low 5/6 bits of the
  // shift vector, which means the max value is 31/63. A shift vector of all
  // 1s will be truncated to 31/63, which is useful as vspltiw is limited to
  // -16 to 15 range.
  if (SplatBits == (EltBits - 1)) {
    unsigned NewOpc;
    // No default case: the assert above guarantees Opc is one of these three.
    switch (Opc) {
    case ISD::SHL:
      NewOpc = PPCISD::SHL;
      break;
    case ISD::SRL:
      NewOpc = PPCISD::SRL;
      break;
    case ISD::SRA:
      NewOpc = PPCISD::SRA;
      break;
    }
    SDValue SplatOnes = getCanonicalConstSplat(Val: 255, SplatSize: 1, VT, DAG&: DCI.DAG, dl: DL);
    return DCI.DAG.getNode(Opcode: NewOpc, DL, VT, N1: N0, N2: SplatOnes);
  }

  if (Opc != ISD::SHL || !isOperationLegal(Op: ISD::ADD, VT))
    return SDValue();

  // For 64-bit there is no splat immediate so we want to catch shift by 1 here
  // before the BUILD_VECTOR is replaced by a load.
  if (EltTy != MVT::i64 || SplatBits != 1)
    return SDValue();

  // (shl x, 1) == (add x, x)
  return DCI.DAG.getNode(Opcode: ISD::ADD, DL: SDLoc(N), VT, N1: N0, N2: N0);
}
19476
/// Combine SHL: strip redundant modulo masks off the shift amount, dispatch
/// vector shifts, and fold (shl (sign_extend i32 x), c) on ISA 3.0 PPC64 into
/// the EXTSWSLI node (extend-sign-word-and-shift-left-immediate).
SDValue PPCTargetLowering::combineSHL(SDNode *N, DAGCombinerInfo &DCI) const {
  if (auto Value = stripModuloOnShift(TLI: *this, N, DAG&: DCI.DAG))
    return Value;

  if (N->getValueType(ResNo: 0).isVector())
    return combineVectorShift(N, DCI);

  SDValue N0 = N->getOperand(Num: 0);
  ConstantSDNode *CN1 = dyn_cast<ConstantSDNode>(Val: N->getOperand(Num: 1));
  // Only (shl (sign_extend i32 -> i64 x), constant) on ISA 3.0 PPC64
  // qualifies for the EXTSWSLI fold.
  if (!Subtarget.isISA3_0() || !Subtarget.isPPC64() ||
      N0.getOpcode() != ISD::SIGN_EXTEND ||
      N0.getOperand(i: 0).getValueType() != MVT::i32 || CN1 == nullptr ||
      N->getValueType(ResNo: 0) != MVT::i64)
    return SDValue();

  // We can't save an operation here if the value is already extended, and
  // the existing shift is easier to combine.
  SDValue ExtsSrc = N0.getOperand(i: 0);
  if (ExtsSrc.getOpcode() == ISD::TRUNCATE &&
      ExtsSrc.getOperand(i: 0).getOpcode() == ISD::AssertSext)
    return SDValue();

  SDLoc DL(N0);
  SDValue ShiftBy = SDValue(CN1, 0);
  // We want the shift amount to be i32 on the extswli, but the shift could
  // have an i64.
  if (ShiftBy.getValueType() == MVT::i64)
    ShiftBy = DCI.DAG.getConstant(Val: CN1->getZExtValue(), DL, VT: MVT::i32);

  return DCI.DAG.getNode(Opcode: PPCISD::EXTSWSLI, DL, VT: MVT::i64, N1: N0->getOperand(Num: 0),
                         N2: ShiftBy);
}
19509
19510SDValue PPCTargetLowering::combineSRA(SDNode *N, DAGCombinerInfo &DCI) const {
19511 if (auto Value = stripModuloOnShift(TLI: *this, N, DAG&: DCI.DAG))
19512 return Value;
19513
19514 if (N->getValueType(ResNo: 0).isVector())
19515 return combineVectorShift(N, DCI);
19516
19517 return SDValue();
19518}
19519
19520SDValue PPCTargetLowering::combineSRL(SDNode *N, DAGCombinerInfo &DCI) const {
19521 if (auto Value = stripModuloOnShift(TLI: *this, N, DAG&: DCI.DAG))
19522 return Value;
19523
19524 if (N->getValueType(ResNo: 0).isVector())
19525 return combineVectorShift(N, DCI);
19526
19527 return SDValue();
19528}
19529
// Transform (add X, (zext(setne Z, C))) -> (addze X, (addic (addi Z, -C), -1))
// Transform (add X, (zext(sete Z, C))) -> (addze X, (subfic (addi Z, -C), 0))
// When C is zero, the equation (addi Z, -C) can be simplified to Z
// Requirement: -C in [-32768, 32767], X and Z are MVT::i64 types
static SDValue combineADDToADDZE(SDNode *N, SelectionDAG &DAG,
                                 const PPCSubtarget &Subtarget) {
  // The pattern is built from 64-bit carry ops; 32-bit targets don't apply.
  if (!Subtarget.isPPC64())
    return SDValue();

  SDValue LHS = N->getOperand(Num: 0);
  SDValue RHS = N->getOperand(Num: 1);

  // Matches (zext (setcc Z, C)) where Z is i64 and -C fits an addi immediate.
  // Both the zext and the setcc must be single-use so folding them away is
  // safe.
  auto isZextOfCompareWithConstant = [](SDValue Op) {
    if (Op.getOpcode() != ISD::ZERO_EXTEND || !Op.hasOneUse() ||
        Op.getValueType() != MVT::i64)
      return false;

    SDValue Cmp = Op.getOperand(i: 0);
    if (Cmp.getOpcode() != ISD::SETCC || !Cmp.hasOneUse() ||
        Cmp.getOperand(i: 0).getValueType() != MVT::i64)
      return false;

    if (auto *Constant = dyn_cast<ConstantSDNode>(Val: Cmp.getOperand(i: 1))) {
      int64_t NegConstant = 0 - Constant->getSExtValue();
      // Due to the limitations of the addi instruction,
      // -C is required to be [-32768, 32767].
      return isInt<16>(x: NegConstant);
    }

    return false;
  };

  bool LHSHasPattern = isZextOfCompareWithConstant(LHS);
  bool RHSHasPattern = isZextOfCompareWithConstant(RHS);

  // If there is a pattern, canonicalize a zext operand to the RHS.
  if (LHSHasPattern && !RHSHasPattern)
    std::swap(a&: LHS, b&: RHS);
  else if (!LHSHasPattern && !RHSHasPattern)
    return SDValue();

  SDLoc DL(N);
  // Carry is modeled as i1 when CR bits are in use, i32 otherwise.
  EVT CarryType = Subtarget.useCRBits() ? MVT::i1 : MVT::i32;
  SDVTList VTs = DAG.getVTList(VT1: MVT::i64, VT2: CarryType);
  SDValue Cmp = RHS.getOperand(i: 0);
  SDValue Z = Cmp.getOperand(i: 0);
  auto *Constant = cast<ConstantSDNode>(Val: Cmp.getOperand(i: 1));
  int64_t NegConstant = 0 - Constant->getSExtValue();

  switch(cast<CondCodeSDNode>(Val: Cmp.getOperand(i: 2))->get()) {
  default: break;
  case ISD::SETNE: {
    // when C == 0
    //                --> addze X, (addic Z, -1).carry
    //               /
    // add X, (zext(setne Z, C))--
    //               \    when -32768 <= -C <= 32767 && C != 0
    //                --> addze X, (addic (addi Z, -C), -1).carry
    SDValue Add = DAG.getNode(Opcode: ISD::ADD, DL, VT: MVT::i64, N1: Z,
                              N2: DAG.getConstant(Val: NegConstant, DL, VT: MVT::i64));
    SDValue AddOrZ = NegConstant != 0 ? Add : Z;
    // (AddOrZ + (-1)) carries iff AddOrZ != 0, i.e. iff Z != C.
    SDValue Addc =
        DAG.getNode(Opcode: ISD::UADDO_CARRY, DL, VTList: DAG.getVTList(VT1: MVT::i64, VT2: CarryType),
                    N1: AddOrZ, N2: DAG.getAllOnesConstant(DL, VT: MVT::i64),
                    N3: DAG.getConstant(Val: 0, DL, VT: CarryType));
    // addze: add only the carry bit to X.
    return DAG.getNode(Opcode: ISD::UADDO_CARRY, DL, VTList: VTs, N1: LHS,
                       N2: DAG.getConstant(Val: 0, DL, VT: MVT::i64),
                       N3: SDValue(Addc.getNode(), 1));
  }
  case ISD::SETEQ: {
    // when C == 0
    //                --> addze X, (subfic Z, 0).carry
    //               /
    // add X, (zext(sete  Z, C))--
    //               \    when -32768 <= -C <= 32767 && C != 0
    //                --> addze X, (subfic (addi Z, -C), 0).carry
    SDValue Add = DAG.getNode(Opcode: ISD::ADD, DL, VT: MVT::i64, N1: Z,
                              N2: DAG.getConstant(Val: NegConstant, DL, VT: MVT::i64));
    SDValue AddOrZ = NegConstant != 0 ? Add : Z;
    // (0 - AddOrZ) borrows iff AddOrZ != 0; invert to get "equal" carry.
    SDValue Subc =
        DAG.getNode(Opcode: ISD::USUBO_CARRY, DL, VTList: DAG.getVTList(VT1: MVT::i64, VT2: CarryType),
                    N1: DAG.getConstant(Val: 0, DL, VT: MVT::i64), N2: AddOrZ,
                    N3: DAG.getConstant(Val: 0, DL, VT: CarryType));
    SDValue Invert = DAG.getNode(Opcode: ISD::XOR, DL, VT: CarryType, N1: Subc.getValue(R: 1),
                                 N2: DAG.getConstant(Val: 1UL, DL, VT: CarryType));
    return DAG.getNode(Opcode: ISD::UADDO_CARRY, DL, VTList: VTs, N1: LHS,
                       N2: DAG.getConstant(Val: 0, DL, VT: MVT::i64), N3: Invert);
  }
  }

  return SDValue();
}
19622
19623// Transform
19624// (add C1, (MAT_PCREL_ADDR GlobalAddr+C2)) to
19625// (MAT_PCREL_ADDR GlobalAddr+(C1+C2))
19626// In this case both C1 and C2 must be known constants.
19627// C1+C2 must fit into a 34 bit signed integer.
19628static SDValue combineADDToMAT_PCREL_ADDR(SDNode *N, SelectionDAG &DAG,
19629 const PPCSubtarget &Subtarget) {
19630 if (!Subtarget.isUsingPCRelativeCalls())
19631 return SDValue();
19632
19633 // Check both Operand 0 and Operand 1 of the ADD node for the PCRel node.
19634 // If we find that node try to cast the Global Address and the Constant.
19635 SDValue LHS = N->getOperand(Num: 0);
19636 SDValue RHS = N->getOperand(Num: 1);
19637
19638 if (LHS.getOpcode() != PPCISD::MAT_PCREL_ADDR)
19639 std::swap(a&: LHS, b&: RHS);
19640
19641 if (LHS.getOpcode() != PPCISD::MAT_PCREL_ADDR)
19642 return SDValue();
19643
19644 // Operand zero of PPCISD::MAT_PCREL_ADDR is the GA node.
19645 GlobalAddressSDNode *GSDN = dyn_cast<GlobalAddressSDNode>(Val: LHS.getOperand(i: 0));
19646 ConstantSDNode* ConstNode = dyn_cast<ConstantSDNode>(Val&: RHS);
19647
19648 // Check that both casts succeeded.
19649 if (!GSDN || !ConstNode)
19650 return SDValue();
19651
19652 int64_t NewOffset = GSDN->getOffset() + ConstNode->getSExtValue();
19653 SDLoc DL(GSDN);
19654
19655 // The signed int offset needs to fit in 34 bits.
19656 if (!isInt<34>(x: NewOffset))
19657 return SDValue();
19658
19659 // The new global address is a copy of the old global address except
19660 // that it has the updated Offset.
19661 SDValue GA =
19662 DAG.getTargetGlobalAddress(GV: GSDN->getGlobal(), DL, VT: GSDN->getValueType(ResNo: 0),
19663 offset: NewOffset, TargetFlags: GSDN->getTargetFlags());
19664 SDValue MatPCRel =
19665 DAG.getNode(Opcode: PPCISD::MAT_PCREL_ADDR, DL, VT: GSDN->getValueType(ResNo: 0), Operand: GA);
19666 return MatPCRel;
19667}
19668
19669// Transform (add X, (build_vector (T 1), (T 1), ...)) -> (sub X, (XXLEQVOnes))
19670// XXLEQVOnes creates an all-1s vector (0xFFFFFFFF...) efficiently via xxleqv
19671// Mathematical identity: X + 1 = X - (-1)
19672// Applies to v4i32, v2i64, v8i16, v16i8 where all elements are constant 1
19673// Requirement: VSX feature for efficient xxleqv generation
19674static SDValue combineADDToSUB(SDNode *N, SelectionDAG &DAG,
19675 const PPCSubtarget &Subtarget) {
19676
19677 EVT VT = N->getValueType(ResNo: 0);
19678 if (!Subtarget.hasVSX())
19679 return SDValue();
19680
19681 // Handle v2i64, v4i32, v8i16 and v16i8 types
19682 if (!(VT == MVT::v8i16 || VT == MVT::v16i8 || VT == MVT::v4i32 ||
19683 VT == MVT::v2i64))
19684 return SDValue();
19685
19686 SDValue LHS = N->getOperand(Num: 0);
19687 SDValue RHS = N->getOperand(Num: 1);
19688
19689 // Check if RHS is BUILD_VECTOR
19690 if (RHS.getOpcode() != ISD::BUILD_VECTOR)
19691 return SDValue();
19692
19693 // Check if all the elements are 1
19694 unsigned NumOfEles = RHS.getNumOperands();
19695 for (unsigned i = 0; i < NumOfEles; ++i) {
19696 auto *CN = dyn_cast<ConstantSDNode>(Val: RHS.getOperand(i));
19697 if (!CN || CN->getSExtValue() != 1)
19698 return SDValue();
19699 }
19700 SDLoc DL(N);
19701
19702 SDValue MinusOne = DAG.getConstant(Val: APInt::getAllOnes(numBits: 32), DL, VT: MVT::i32);
19703 SmallVector<SDValue, 4> Ops(4, MinusOne);
19704 SDValue AllOnesVec = DAG.getBuildVector(VT: MVT::v4i32, DL, Ops);
19705
19706 // Bitcast to the target vector type
19707 SDValue Bitcast = DAG.getNode(Opcode: ISD::BITCAST, DL, VT, Operand: AllOnesVec);
19708
19709 return DAG.getNode(Opcode: ISD::SUB, DL, VT, N1: LHS, N2: Bitcast);
19710}
19711
19712SDValue PPCTargetLowering::combineADD(SDNode *N, DAGCombinerInfo &DCI) const {
19713 if (auto Value = combineADDToADDZE(N, DAG&: DCI.DAG, Subtarget))
19714 return Value;
19715
19716 if (auto Value = combineADDToMAT_PCREL_ADDR(N, DAG&: DCI.DAG, Subtarget))
19717 return Value;
19718
19719 if (auto Value = combineADDToSUB(N, DAG&: DCI.DAG, Subtarget))
19720 return Value;
19721 return SDValue();
19722}
19723
// Detect TRUNCATE operations on bitcasts of float128 values.
// What we are looking for here is the situation where we extract a subset
// of bits from a 128 bit float.
// This can be of two forms:
// 1) BITCAST of f128 feeding TRUNCATE
// 2) BITCAST of f128 feeding SRL (a shift) feeding TRUNCATE
// The reason this is required is because we do not have a legal i128 type
// and so we want to prevent having to store the f128 and then reload part
// of it.
SDValue PPCTargetLowering::combineTRUNCATE(SDNode *N,
                                           DAGCombinerInfo &DCI) const {
  // If we are using CRBits then try that first.
  if (Subtarget.useCRBits()) {
    // Check if CRBits did anything and return that if it did.
    if (SDValue CRTruncValue = DAGCombineTruncBoolExt(N, DCI))
      return CRTruncValue;
  }

  SDLoc dl(N);
  SDValue Op0 = N->getOperand(Num: 0);

  // Looking for a truncate of i128 to i64.
  if (Op0.getValueType() != MVT::i128 || N->getValueType(ResNo: 0) != MVT::i64)
    return SDValue();

  // A plain truncate keeps the low half, which is element 1 on big-endian
  // and element 0 on little-endian.
  int EltToExtract = DCI.DAG.getDataLayout().isBigEndian() ? 1 : 0;

  // SRL feeding TRUNCATE.
  if (Op0.getOpcode() == ISD::SRL) {
    ConstantSDNode *ConstNode = dyn_cast<ConstantSDNode>(Val: Op0.getOperand(i: 1));
    // The right shift has to be by 64 bits.
    if (!ConstNode || ConstNode->getZExtValue() != 64)
      return SDValue();

    // Switch the element number to extract.
    EltToExtract = EltToExtract ? 0 : 1;
    // Update Op0 past the SRL.
    Op0 = Op0.getOperand(i: 0);
  }

  // BITCAST feeding a TRUNCATE possibly via SRL.
  if (Op0.getOpcode() == ISD::BITCAST &&
      Op0.getValueType() == MVT::i128 &&
      Op0.getOperand(i: 0).getValueType() == MVT::f128) {
    // Reinterpret the f128 as v2i64 and extract the wanted half directly,
    // avoiding a store/reload of the value.
    SDValue Bitcast = DCI.DAG.getBitcast(VT: MVT::v2i64, V: Op0.getOperand(i: 0));
    return DCI.DAG.getNode(
        Opcode: ISD::EXTRACT_VECTOR_ELT, DL: dl, VT: MVT::i64, N1: Bitcast,
        N2: DCI.DAG.getTargetConstant(Val: EltToExtract, DL: dl, VT: MVT::i32));
  }
  return SDValue();
}
19775
/// Combine (mul x, C) where |C| is (2^N + 1) or (2^N - 1) into a shift plus
/// an add/sub when the subtarget's relative instruction costs make that
/// profitable.
SDValue PPCTargetLowering::combineMUL(SDNode *N, DAGCombinerInfo &DCI) const {
  SelectionDAG &DAG = DCI.DAG;

  ConstantSDNode *ConstOpOrElement = isConstOrConstSplat(N: N->getOperand(Num: 1));
  if (!ConstOpOrElement)
    return SDValue();

  // An imul is usually smaller than the alternative sequence for legal type.
  if (DAG.getMachineFunction().getFunction().hasMinSize() &&
      isOperationLegal(Op: ISD::MUL, VT: N->getValueType(ResNo: 0)))
    return SDValue();

  // Decide profitability from per-CPU cycle ratios of mul vs. add/shl.
  auto IsProfitable = [this](bool IsNeg, bool IsAddOne, EVT VT) -> bool {
    switch (this->Subtarget.getCPUDirective()) {
    default:
      // TODO: enhance the condition for subtarget before pwr8
      return false;
    case PPC::DIR_PWR8:
      // type        mul     add    shl
      // scalar        4      1      1
      // vector        7      2      2
      return true;
    case PPC::DIR_PWR9:
    case PPC::DIR_PWR10:
    case PPC::DIR_PWR11:
    case PPC::DIR_PWR_FUTURE:
      //  type        mul     add    shl
      // scalar        5      2      2
      // vector        7      2      2

      // The cycle RATIO of related operations are showed as a table above.
      // Because mul is 5(scalar)/7(vector), add/sub/shl are all 2 for both
      // scalar and vector type. For 2 instrs patterns, add/sub + shl
      // are 4, it is always profitable; but for 3 instrs patterns
      // (mul x, -(2^N + 1)) => -(add (shl x, N), x), sub + add + shl are 6.
      // So we should only do it for vector type.
      return IsAddOne && IsNeg ? VT.isVector() : true;
    }
  };

  EVT VT = N->getValueType(ResNo: 0);
  SDLoc DL(N);

  const APInt &MulAmt = ConstOpOrElement->getAPIntValue();
  bool IsNeg = MulAmt.isNegative();
  APInt MulAmtAbs = MulAmt.abs();

  if ((MulAmtAbs - 1).isPowerOf2()) {
    // (mul x, 2^N + 1) => (add (shl x, N), x)
    // (mul x, -(2^N + 1)) => -(add (shl x, N), x)

    if (!IsProfitable(IsNeg, true, VT))
      return SDValue();

    SDValue Op0 = N->getOperand(Num: 0);
    SDValue Op1 =
        DAG.getNode(Opcode: ISD::SHL, DL, VT, N1: N->getOperand(Num: 0),
                    N2: DAG.getConstant(Val: (MulAmtAbs - 1).logBase2(), DL, VT));
    SDValue Res = DAG.getNode(Opcode: ISD::ADD, DL, VT, N1: Op0, N2: Op1);

    if (!IsNeg)
      return Res;

    // Negate via (0 - Res).
    return DAG.getNode(Opcode: ISD::SUB, DL, VT, N1: DAG.getConstant(Val: 0, DL, VT), N2: Res);
  } else if ((MulAmtAbs + 1).isPowerOf2()) {
    // (mul x, 2^N - 1) => (sub (shl x, N), x)
    // (mul x, -(2^N - 1)) => (sub x, (shl x, N))

    if (!IsProfitable(IsNeg, false, VT))
      return SDValue();

    SDValue Op0 = N->getOperand(Num: 0);
    SDValue Op1 =
        DAG.getNode(Opcode: ISD::SHL, DL, VT, N1: N->getOperand(Num: 0),
                    N2: DAG.getConstant(Val: (MulAmtAbs + 1).logBase2(), DL, VT));

    if (!IsNeg)
      return DAG.getNode(Opcode: ISD::SUB, DL, VT, N1: Op1, N2: Op0);
    else
      return DAG.getNode(Opcode: ISD::SUB, DL, VT, N1: Op0, N2: Op1);

  } else {
    return SDValue();
  }
}
19861
// Combine fma-like op (like fnmsub) with fnegs to appropriate op. Do this
// in combiner since we need to check SD flags and other subtarget features.
SDValue PPCTargetLowering::combineFMALike(SDNode *N,
                                          DAGCombinerInfo &DCI) const {
  SDValue N0 = N->getOperand(Num: 0);
  SDValue N1 = N->getOperand(Num: 1);
  SDValue N2 = N->getOperand(Num: 2);
  SDNodeFlags Flags = N->getFlags();
  EVT VT = N->getValueType(ResNo: 0);
  SelectionDAG &DAG = DCI.DAG;
  const TargetOptions &Options = getTargetMachine().Options;
  unsigned Opc = N->getOpcode();
  bool CodeSize = DAG.getMachineFunction().getFunction().hasOptSize();
  bool LegalOps = !DCI.isBeforeLegalizeOps();
  SDLoc Loc(N);

  if (!isOperationLegal(Op: ISD::FMA, VT))
    return SDValue();

  // Allowing transformation to FNMSUB may change sign of zeroes when ab-c=0
  // since (fnmsub a b c)=-0 while c-ab=+0.
  if (!Flags.hasNoSignedZeros() && !Options.NoSignedZerosFPMath)
    return SDValue();

  // If either multiplicand's negation is free (or cheaper), absorb the fneg
  // by flipping between FMA and FNMSUB:
  // (fma (fneg a) b c) => (fnmsub a b c)
  // (fnmsub (fneg a) b c) => (fma a b c)
  if (SDValue NegN0 = getCheaperNegatedExpression(Op: N0, DAG, LegalOps, OptForSize: CodeSize))
    return DAG.getNode(Opcode: invertFMAOpcode(Opc), DL: Loc, VT, N1: NegN0, N2: N1, N3: N2, Flags);

  // (fma a (fneg b) c) => (fnmsub a b c)
  // (fnmsub a (fneg b) c) => (fma a b c)
  if (SDValue NegN1 = getCheaperNegatedExpression(Op: N1, DAG, LegalOps, OptForSize: CodeSize))
    return DAG.getNode(Opcode: invertFMAOpcode(Opc), DL: Loc, VT, N1: N0, N2: NegN1, N3: N2, Flags);

  return SDValue();
}
19898
19899bool PPCTargetLowering::mayBeEmittedAsTailCall(const CallInst *CI) const {
19900 // Only duplicate to increase tail-calls for the 64bit SysV ABIs.
19901 if (!Subtarget.is64BitELFABI())
19902 return false;
19903
19904 // If not a tail call then no need to proceed.
19905 if (!CI->isTailCall())
19906 return false;
19907
19908 // If sibling calls have been disabled and tail-calls aren't guaranteed
19909 // there is no reason to duplicate.
19910 auto &TM = getTargetMachine();
19911 if (!TM.Options.GuaranteedTailCallOpt && DisableSCO)
19912 return false;
19913
19914 // Can't tail call a function called indirectly, or if it has variadic args.
19915 const Function *Callee = CI->getCalledFunction();
19916 if (!Callee || Callee->isVarArg())
19917 return false;
19918
19919 // Make sure the callee and caller calling conventions are eligible for tco.
19920 const Function *Caller = CI->getParent()->getParent();
19921 if (!areCallingConvEligibleForTCO_64SVR4(CallerCC: Caller->getCallingConv(),
19922 CalleeCC: CI->getCallingConv()))
19923 return false;
19924
19925 // If the function is local then we have a good chance at tail-calling it
19926 return getTargetMachine().shouldAssumeDSOLocal(GV: Callee);
19927}
19928
19929bool PPCTargetLowering::
19930isMaskAndCmp0FoldingBeneficial(const Instruction &AndI) const {
19931 const Value *Mask = AndI.getOperand(i: 1);
19932 // If the mask is suitable for andi. or andis. we should sink the and.
19933 if (const ConstantInt *CI = dyn_cast<ConstantInt>(Val: Mask)) {
19934 // Can't handle constants wider than 64-bits.
19935 if (CI->getBitWidth() > 64)
19936 return false;
19937 int64_t ConstVal = CI->getZExtValue();
19938 return isUInt<16>(x: ConstVal) ||
19939 (isUInt<16>(x: ConstVal >> 16) && !(ConstVal & 0xFFFF));
19940 }
19941
19942 // For non-constant masks, we can always use the record-form and.
19943 return true;
19944}
19945
19946/// getAddrModeForFlags - Based on the set of address flags, select the most
19947/// optimal instruction format to match by.
19948PPC::AddrMode PPCTargetLowering::getAddrModeForFlags(unsigned Flags) const {
19949 // This is not a node we should be handling here.
19950 if (Flags == PPC::MOF_None)
19951 return PPC::AM_None;
19952 // Unaligned D-Forms are tried first, followed by the aligned D-Forms.
19953 for (auto FlagSet : AddrModesMap.at(k: PPC::AM_DForm))
19954 if ((Flags & FlagSet) == FlagSet)
19955 return PPC::AM_DForm;
19956 for (auto FlagSet : AddrModesMap.at(k: PPC::AM_DSForm))
19957 if ((Flags & FlagSet) == FlagSet)
19958 return PPC::AM_DSForm;
19959 for (auto FlagSet : AddrModesMap.at(k: PPC::AM_DQForm))
19960 if ((Flags & FlagSet) == FlagSet)
19961 return PPC::AM_DQForm;
19962 for (auto FlagSet : AddrModesMap.at(k: PPC::AM_PrefixDForm))
19963 if ((Flags & FlagSet) == FlagSet)
19964 return PPC::AM_PrefixDForm;
19965 // If no other forms are selected, return an X-Form as it is the most
19966 // general addressing mode.
19967 return PPC::AM_XForm;
19968}
19969
/// Set alignment flags based on whether or not the Frame Index is aligned.
/// Utilized when computing flags for address computation when selecting
/// load and store instructions.
static void setAlignFlagsForFI(SDValue N, unsigned &FlagSet,
                               SelectionDAG &DAG) {
  // N is either a plain FrameIndex or (add/or FI, something).
  bool IsAdd = ((N.getOpcode() == ISD::ADD) || (N.getOpcode() == ISD::OR));
  FrameIndexSDNode *FI = dyn_cast<FrameIndexSDNode>(Val: IsAdd ? N.getOperand(i: 0) : N);
  if (!FI)
    return;
  const MachineFrameInfo &MFI = DAG.getMachineFunction().getFrameInfo();
  unsigned FrameIndexAlign = MFI.getObjectAlign(ObjectIdx: FI->getIndex()).value();
  // If this is (add $FI, $S16Imm), the alignment flags are already set
  // based on the immediate. We just need to clear the alignment flags
  // if the FI alignment is weaker.
  if ((FrameIndexAlign % 4) != 0)
    FlagSet &= ~PPC::MOF_RPlusSImm16Mult4;
  if ((FrameIndexAlign % 16) != 0)
    FlagSet &= ~PPC::MOF_RPlusSImm16Mult16;
  // If the address is a plain FrameIndex, set alignment flags based on
  // FI alignment.
  if (!IsAdd) {
    if ((FrameIndexAlign % 4) == 0)
      FlagSet |= PPC::MOF_RPlusSImm16Mult4;
    if ((FrameIndexAlign % 16) == 0)
      FlagSet |= PPC::MOF_RPlusSImm16Mult16;
  }
}
19997
/// Given a node, compute flags that are used for address computation when
/// selecting load and store instructions. The flags computed are stored in
/// FlagSet. This function takes into account whether the node is a constant,
/// an ADD (or a provably-disjoint OR), or neither, and computes the address
/// flags accordingly.
static void computeFlagsForAddressComputation(SDValue N, unsigned &FlagSet,
                                              SelectionDAG &DAG) {
  // Set the alignment flags for the node depending on if the node is
  // 4-byte or 16-byte aligned.
  auto SetAlignFlagsForImm = [&](uint64_t Imm) {
    if ((Imm & 0x3) == 0)
      FlagSet |= PPC::MOF_RPlusSImm16Mult4;
    if ((Imm & 0xf) == 0)
      FlagSet |= PPC::MOF_RPlusSImm16Mult16;
  };

  if (ConstantSDNode *CN = dyn_cast<ConstantSDNode>(Val&: N)) {
    // All 32-bit constants can be computed as LIS + Disp.
    const APInt &ConstImm = CN->getAPIntValue();
    if (ConstImm.isSignedIntN(N: 32)) { // Flag to handle 32-bit constants.
      FlagSet |= PPC::MOF_AddrIsSImm32;
      SetAlignFlagsForImm(ConstImm.getZExtValue());
      // N is a constant here, so this call is effectively a no-op (the
      // FrameIndexSDNode dyn_cast inside setAlignFlagsForFI fails); kept
      // for symmetry with the other cases.
      setAlignFlagsForFI(N, FlagSet, DAG);
    }
    if (ConstImm.isSignedIntN(N: 34)) // Flag to handle 34-bit constants.
      FlagSet |= PPC::MOF_RPlusSImm34;
    else // Let constant materialization handle large constants.
      FlagSet |= PPC::MOF_NotAddNorCst;
  } else if (N.getOpcode() == ISD::ADD || provablyDisjointOr(DAG, N)) {
    // This address can be represented as an addition of:
    // - Register + Imm16 (possibly a multiple of 4/16)
    // - Register + Imm34
    // - Register + PPCISD::Lo
    // - Register + Register
    // In any case, we won't have to match this as Base + Zero.
    SDValue RHS = N.getOperand(i: 1);
    if (ConstantSDNode *CN = dyn_cast<ConstantSDNode>(Val&: RHS)) {
      const APInt &ConstImm = CN->getAPIntValue();
      if (ConstImm.isSignedIntN(N: 16)) {
        FlagSet |= PPC::MOF_RPlusSImm16; // Signed 16-bit immediates.
        SetAlignFlagsForImm(ConstImm.getZExtValue());
        // The FI alignment may be weaker than the immediate implies;
        // setAlignFlagsForFI clears the flags in that case.
        setAlignFlagsForFI(N, FlagSet, DAG);
      }
      if (ConstImm.isSignedIntN(N: 34))
        FlagSet |= PPC::MOF_RPlusSImm34; // Signed 34-bit immediates.
      else
        FlagSet |= PPC::MOF_RPlusR; // Register.
    } else if (RHS.getOpcode() == PPCISD::Lo && !RHS.getConstantOperandVal(i: 1))
      FlagSet |= PPC::MOF_RPlusLo; // PPCISD::Lo.
    else
      FlagSet |= PPC::MOF_RPlusR;
  } else { // The address computation is not a constant or an addition.
    setAlignFlagsForFI(N, FlagSet, DAG);
    FlagSet |= PPC::MOF_NotAddNorCst;
  }
}
20053
20054static bool isPCRelNode(SDValue N) {
20055 return (N.getOpcode() == PPCISD::MAT_PCREL_ADDR ||
20056 isValidPCRelNode<ConstantPoolSDNode>(N) ||
20057 isValidPCRelNode<GlobalAddressSDNode>(N) ||
20058 isValidPCRelNode<JumpTableSDNode>(N) ||
20059 isValidPCRelNode<BlockAddressSDNode>(N));
20060}
20061
/// computeMOFlags - Given a node N and its Parent (a MemSDNode), compute
/// the address flags of the load/store instruction that is to be matched.
unsigned PPCTargetLowering::computeMOFlags(const SDNode *Parent, SDValue N,
                                           SelectionDAG &DAG) const {
  unsigned FlagSet = PPC::MOF_None;

  // Compute subtarget flags.
  if (!Subtarget.hasP9Vector())
    FlagSet |= PPC::MOF_SubtargetBeforeP9;
  else
    FlagSet |= PPC::MOF_SubtargetP9;

  if (Subtarget.hasPrefixInstrs())
    FlagSet |= PPC::MOF_SubtargetP10;

  if (Subtarget.hasSPE())
    FlagSet |= PPC::MOF_SubtargetSPE;

  // Check if we have a PCRel node and return early.
  if ((FlagSet & PPC::MOF_SubtargetP10) && isPCRelNode(N))
    return FlagSet;

  // If the node is the paired load/store intrinsics, compute flags for
  // address computation and return early.
  unsigned ParentOp = Parent->getOpcode();
  if (Subtarget.isISA3_1() && ((ParentOp == ISD::INTRINSIC_W_CHAIN) ||
                               (ParentOp == ISD::INTRINSIC_VOID))) {
    unsigned ID = Parent->getConstantOperandVal(Num: 1);
    if ((ID == Intrinsic::ppc_vsx_lxvp) || (ID == Intrinsic::ppc_vsx_stxvp)) {
      // The address operand sits at a different position for the load
      // intrinsic (operand 2) than for the store intrinsic (operand 3).
      SDValue IntrinOp = (ID == Intrinsic::ppc_vsx_lxvp)
                             ? Parent->getOperand(Num: 2)
                             : Parent->getOperand(Num: 3);
      computeFlagsForAddressComputation(N: IntrinOp, FlagSet, DAG);
      FlagSet |= PPC::MOF_Vector;
      return FlagSet;
    }
  }

  // Mark this as something we don't want to handle here if it is atomic
  // or pre-increment instruction.
  if (const LSBaseSDNode *LSB = dyn_cast<LSBaseSDNode>(Val: Parent))
    if (LSB->isIndexed())
      return PPC::MOF_None;

  // Compute in-memory type flags. This is based on if there are scalars,
  // floats or vectors.
  const MemSDNode *MN = dyn_cast<MemSDNode>(Val: Parent);
  assert(MN && "Parent should be a MemSDNode!");
  EVT MemVT = MN->getMemoryVT();
  unsigned Size = MemVT.getSizeInBits();
  if (MemVT.isScalarInteger()) {
    assert(Size <= 128 &&
           "Not expecting scalar integers larger than 16 bytes!");
    if (Size < 32)
      FlagSet |= PPC::MOF_SubWordInt;
    else if (Size == 32)
      FlagSet |= PPC::MOF_WordInt;
    else
      FlagSet |= PPC::MOF_DoubleWordInt;
  } else if (MemVT.isVector() && !MemVT.isFloatingPoint()) { // Integer vectors.
    if (Size == 128)
      FlagSet |= PPC::MOF_Vector;
    else if (Size == 256) {
      assert(Subtarget.pairedVectorMemops() &&
             "256-bit vectors are only available when paired vector memops is "
             "enabled!");
      FlagSet |= PPC::MOF_Vector;
    } else
      llvm_unreachable("Not expecting illegal vectors!");
  } else { // Floating point type: can be scalar, f128 or vector types.
    if (Size == 32 || Size == 64)
      FlagSet |= PPC::MOF_ScalarFloat;
    else if (MemVT == MVT::f128 || MemVT.isVector())
      FlagSet |= PPC::MOF_Vector;
    else
      llvm_unreachable("Not expecting illegal scalar floats!");
  }

  // Compute flags for address computation.
  computeFlagsForAddressComputation(N, FlagSet, DAG);

  // Compute type extension flags.
  if (const LoadSDNode *LN = dyn_cast<LoadSDNode>(Val: Parent)) {
    switch (LN->getExtensionType()) {
    case ISD::SEXTLOAD:
      FlagSet |= PPC::MOF_SExt;
      break;
    case ISD::EXTLOAD:
    case ISD::ZEXTLOAD:
      FlagSet |= PPC::MOF_ZExt;
      break;
    case ISD::NON_EXTLOAD:
      FlagSet |= PPC::MOF_NoExt;
      break;
    }
  } else
    FlagSet |= PPC::MOF_NoExt;

  // For integers, no extension is the same as zero extension.
  // We set the extension mode to zero extension so we don't have
  // to add separate entries in AddrModesMap for loads and stores.
  if (MemVT.isScalarInteger() && (FlagSet & PPC::MOF_NoExt)) {
    FlagSet |= PPC::MOF_ZExt;
    FlagSet &= ~PPC::MOF_NoExt;
  }

  // If we don't have prefixed instructions, 34-bit constants should be
  // treated as PPC::MOF_NotAddNorCst so they can match D-Forms.
  bool IsNonP1034BitConst =
      ((PPC::MOF_RPlusSImm34 | PPC::MOF_AddrIsSImm32 | PPC::MOF_SubtargetP10) &
       FlagSet) == PPC::MOF_RPlusSImm34;
  if (N.getOpcode() != ISD::ADD && N.getOpcode() != ISD::OR &&
      IsNonP1034BitConst)
    FlagSet |= PPC::MOF_NotAddNorCst;

  return FlagSet;
}
20179
/// SelectForceXFormMode - Given the specified address, force it to be
/// represented as an indexed [r+r] operation (an XForm instruction).
/// Sets Disp (the index register) and Base, and always returns PPC::AM_XForm.
PPC::AddrMode PPCTargetLowering::SelectForceXFormMode(SDValue N, SDValue &Disp,
                                                      SDValue &Base,
                                                      SelectionDAG &DAG) const {

  PPC::AddrMode Mode = PPC::AM_XForm;
  int16_t ForceXFormImm = 0;
  // An OR with provably disjoint operands behaves like an ADD, so its two
  // operands can be used directly — unless the RHS is a 16-bit signed
  // immediate (handled by the fallback below).
  if (provablyDisjointOr(DAG, N) &&
      !isIntS16Immediate(Op: N.getOperand(i: 1), Imm&: ForceXFormImm)) {
    Disp = N.getOperand(i: 0);
    Base = N.getOperand(i: 1);
    return Mode;
  }

  // If the address is the result of an add, we will utilize the fact that the
  // address calculation includes an implicit add. However, we can reduce
  // register pressure if we do not materialize a constant just for use as the
  // index register. We only get rid of the add if it is not an add of a
  // value and a 16-bit signed constant and both have a single use.
  if (N.getOpcode() == ISD::ADD &&
      (!isIntS16Immediate(Op: N.getOperand(i: 1), Imm&: ForceXFormImm) ||
       !N.getOperand(i: 1).hasOneUse() || !N.getOperand(i: 0).hasOneUse())) {
    Disp = N.getOperand(i: 0);
    Base = N.getOperand(i: 1);
    return Mode;
  }

  // Otherwise, use R0 as the base register.
  Disp = DAG.getRegister(Reg: Subtarget.isPPC64() ? PPC::ZERO8 : PPC::ZERO,
                         VT: N.getValueType());
  Base = N;

  return Mode;
}
20215
20216bool PPCTargetLowering::splitValueIntoRegisterParts(
20217 SelectionDAG &DAG, const SDLoc &DL, SDValue Val, SDValue *Parts,
20218 unsigned NumParts, MVT PartVT, std::optional<CallingConv::ID> CC) const {
20219 EVT ValVT = Val.getValueType();
20220 // If we are splitting a scalar integer into f64 parts (i.e. so they
20221 // can be placed into VFRC registers), we need to zero extend and
20222 // bitcast the values. This will ensure the value is placed into a
20223 // VSR using direct moves or stack operations as needed.
20224 if (PartVT == MVT::f64 &&
20225 (ValVT == MVT::i32 || ValVT == MVT::i16 || ValVT == MVT::i8)) {
20226 Val = DAG.getNode(Opcode: ISD::ZERO_EXTEND, DL, VT: MVT::i64, Operand: Val);
20227 Val = DAG.getNode(Opcode: ISD::BITCAST, DL, VT: MVT::f64, Operand: Val);
20228 Parts[0] = Val;
20229 return true;
20230 }
20231 return false;
20232}
20233
/// Lower Op into a call to the named library routine, forwarding Op's
/// operands as call arguments and returning the call's result value.
SDValue PPCTargetLowering::lowerToLibCall(const char *LibCallName, SDValue Op,
                                          SelectionDAG &DAG) const {
  const TargetLowering &TLI = DAG.getTargetLoweringInfo();
  TargetLowering::CallLoweringInfo CLI(DAG);
  EVT RetVT = Op.getValueType();
  Type *RetTy = RetVT.getTypeForEVT(Context&: *DAG.getContext());
  SDValue Callee =
      DAG.getExternalSymbol(Sym: LibCallName, VT: TLI.getPointerTy(DL: DAG.getDataLayout()));
  // Whether the result should be sign extended; the same choice is applied
  // to the arguments below.
  bool SignExtend = TLI.shouldSignExtendTypeInLibCall(Ty: RetTy, IsSigned: false);
  TargetLowering::ArgListTy Args;
  for (const SDValue &N : Op->op_values()) {
    EVT ArgVT = N.getValueType();
    Type *ArgTy = ArgVT.getTypeForEVT(Context&: *DAG.getContext());
    TargetLowering::ArgListEntry Entry(N, ArgTy);
    Entry.IsSExt = TLI.shouldSignExtendTypeInLibCall(Ty: ArgTy, IsSigned: SignExtend);
    Entry.IsZExt = !Entry.IsSExt;
    Args.push_back(x: Entry);
  }

  SDValue InChain = DAG.getEntryNode();
  SDValue TCChain = InChain;
  const Function &F = DAG.getMachineFunction().getFunction();
  // Tail-call only when the node is in tail position and the library call's
  // return type is compatible with the caller's return type.
  bool isTailCall =
      TLI.isInTailCallPosition(DAG, Node: Op.getNode(), Chain&: TCChain) &&
      (RetTy == F.getReturnType() || F.getReturnType()->isVoidTy());
  if (isTailCall)
    InChain = TCChain;
  CLI.setDebugLoc(SDLoc(Op))
      .setChain(InChain)
      .setLibCallee(CC: CallingConv::C, ResultType: RetTy, Target: Callee, ArgsList: std::move(Args))
      .setTailCall(isTailCall)
      .setSExtResult(SignExtend)
      .setZExtResult(!SignExtend)
      .setIsPostTypeLegalization(true);
  return TLI.LowerCallTo(CLI).first;
}
20270
20271SDValue PPCTargetLowering::lowerLibCallBasedOnType(
20272 const char *LibCallFloatName, const char *LibCallDoubleName, SDValue Op,
20273 SelectionDAG &DAG) const {
20274 if (Op.getValueType() == MVT::f32)
20275 return lowerToLibCall(LibCallName: LibCallFloatName, Op, DAG);
20276
20277 if (Op.getValueType() == MVT::f64)
20278 return lowerToLibCall(LibCallName: LibCallDoubleName, Op, DAG);
20279
20280 return SDValue();
20281}
20282
20283bool PPCTargetLowering::isLowringToMASSFiniteSafe(SDValue Op) const {
20284 SDNodeFlags Flags = Op.getNode()->getFlags();
20285 return isLowringToMASSSafe(Op) && Flags.hasNoSignedZeros() &&
20286 Flags.hasNoNaNs() && Flags.hasNoInfs();
20287}
20288
20289bool PPCTargetLowering::isLowringToMASSSafe(SDValue Op) const {
20290 return Op.getNode()->getFlags().hasApproximateFuncs();
20291}
20292
20293bool PPCTargetLowering::isScalarMASSConversionEnabled() const {
20294 return getTargetMachine().Options.PPCGenScalarMASSEntries;
20295}
20296
20297SDValue PPCTargetLowering::lowerLibCallBase(const char *LibCallDoubleName,
20298 const char *LibCallFloatName,
20299 const char *LibCallDoubleNameFinite,
20300 const char *LibCallFloatNameFinite,
20301 SDValue Op,
20302 SelectionDAG &DAG) const {
20303 if (!isScalarMASSConversionEnabled() || !isLowringToMASSSafe(Op))
20304 return SDValue();
20305
20306 if (!isLowringToMASSFiniteSafe(Op))
20307 return lowerLibCallBasedOnType(LibCallFloatName, LibCallDoubleName, Op,
20308 DAG);
20309
20310 return lowerLibCallBasedOnType(LibCallFloatName: LibCallFloatNameFinite,
20311 LibCallDoubleName: LibCallDoubleNameFinite, Op, DAG);
20312}
20313
// Lower pow to the MASS library routines __xl_pow/__xl_powf, using the
// *_finite variants when the fast-math flags permit.
SDValue PPCTargetLowering::lowerPow(SDValue Op, SelectionDAG &DAG) const {
  return lowerLibCallBase(LibCallDoubleName: "__xl_pow", LibCallFloatName: "__xl_powf", LibCallDoubleNameFinite: "__xl_pow_finite",
                          LibCallFloatNameFinite: "__xl_powf_finite", Op, DAG);
}
20318
// Lower sin to the MASS library routines __xl_sin/__xl_sinf, using the
// *_finite variants when the fast-math flags permit.
SDValue PPCTargetLowering::lowerSin(SDValue Op, SelectionDAG &DAG) const {
  return lowerLibCallBase(LibCallDoubleName: "__xl_sin", LibCallFloatName: "__xl_sinf", LibCallDoubleNameFinite: "__xl_sin_finite",
                          LibCallFloatNameFinite: "__xl_sinf_finite", Op, DAG);
}
20323
// Lower cos to the MASS library routines __xl_cos/__xl_cosf, using the
// *_finite variants when the fast-math flags permit.
SDValue PPCTargetLowering::lowerCos(SDValue Op, SelectionDAG &DAG) const {
  return lowerLibCallBase(LibCallDoubleName: "__xl_cos", LibCallFloatName: "__xl_cosf", LibCallDoubleNameFinite: "__xl_cos_finite",
                          LibCallFloatNameFinite: "__xl_cosf_finite", Op, DAG);
}
20328
// Lower log to the MASS library routines __xl_log/__xl_logf, using the
// *_finite variants when the fast-math flags permit.
SDValue PPCTargetLowering::lowerLog(SDValue Op, SelectionDAG &DAG) const {
  return lowerLibCallBase(LibCallDoubleName: "__xl_log", LibCallFloatName: "__xl_logf", LibCallDoubleNameFinite: "__xl_log_finite",
                          LibCallFloatNameFinite: "__xl_logf_finite", Op, DAG);
}
20333
// Lower log10 to the MASS library routines __xl_log10/__xl_log10f, using the
// *_finite variants when the fast-math flags permit.
SDValue PPCTargetLowering::lowerLog10(SDValue Op, SelectionDAG &DAG) const {
  return lowerLibCallBase(LibCallDoubleName: "__xl_log10", LibCallFloatName: "__xl_log10f", LibCallDoubleNameFinite: "__xl_log10_finite",
                          LibCallFloatNameFinite: "__xl_log10f_finite", Op, DAG);
}
20338
// Lower exp to the MASS library routines __xl_exp/__xl_expf, using the
// *_finite variants when the fast-math flags permit.
SDValue PPCTargetLowering::lowerExp(SDValue Op, SelectionDAG &DAG) const {
  return lowerLibCallBase(LibCallDoubleName: "__xl_exp", LibCallFloatName: "__xl_expf", LibCallDoubleNameFinite: "__xl_exp_finite",
                          LibCallFloatNameFinite: "__xl_expf_finite", Op, DAG);
}
20343
20344// If we happen to match to an aligned D-Form, check if the Frame Index is
20345// adequately aligned. If it is not, reset the mode to match to X-Form.
20346static void setXFormForUnalignedFI(SDValue N, unsigned Flags,
20347 PPC::AddrMode &Mode) {
20348 if (!isa<FrameIndexSDNode>(Val: N))
20349 return;
20350 if ((Mode == PPC::AM_DSForm && !(Flags & PPC::MOF_RPlusSImm16Mult4)) ||
20351 (Mode == PPC::AM_DQForm && !(Flags & PPC::MOF_RPlusSImm16Mult16)))
20352 Mode = PPC::AM_XForm;
20353}
20354
/// SelectOptimalAddrMode - Based on a node N and its Parent (a MemSDNode),
/// compute the address flags of the node, get the optimal address mode based
/// on the flags, and set the Base and Disp based on the address mode.
PPC::AddrMode PPCTargetLowering::SelectOptimalAddrMode(const SDNode *Parent,
                                                       SDValue N, SDValue &Disp,
                                                       SDValue &Base,
                                                       SelectionDAG &DAG,
                                                       MaybeAlign Align) const {
  SDLoc DL(Parent);

  // Compute the address flags.
  unsigned Flags = computeMOFlags(Parent, N, DAG);

  // Get the optimal address mode based on the Flags.
  PPC::AddrMode Mode = getAddrModeForFlags(Flags);

  // If the address mode is DS-Form or DQ-Form, check if the FI is aligned.
  // Select an X-Form load if it is not.
  setXFormForUnalignedFI(N, Flags, Mode);

  // Set the mode to PC-Relative addressing mode if we have a valid PC-Rel node.
  if ((Mode == PPC::AM_XForm) && isPCRelNode(N)) {
    assert(Subtarget.isUsingPCRelativeCalls() &&
           "Must be using PC-Relative calls when a valid PC-Relative node is "
           "present!");
    Mode = PPC::AM_PCRel;
  }

  // Set Base and Disp accordingly depending on the address mode.
  switch (Mode) {
  case PPC::AM_DForm:
  case PPC::AM_DSForm:
  case PPC::AM_DQForm: {
    // This is a register plus a 16-bit immediate. The base will be the
    // register and the displacement will be the immediate unless it
    // isn't sufficiently aligned.
    if (Flags & PPC::MOF_RPlusSImm16) {
      SDValue Op0 = N.getOperand(i: 0);
      SDValue Op1 = N.getOperand(i: 1);
      int16_t Imm = Op1->getAsZExtVal();
      if (!Align || isAligned(Lhs: *Align, SizeInBytes: Imm)) {
        Disp = DAG.getSignedTargetConstant(Val: Imm, DL, VT: N.getValueType());
        Base = Op0;
        if (FrameIndexSDNode *FI = dyn_cast<FrameIndexSDNode>(Val&: Op0)) {
          Base = DAG.getTargetFrameIndex(FI: FI->getIndex(), VT: N.getValueType());
          fixupFuncForFI(DAG, FrameIdx: FI->getIndex(), VT: N.getValueType());
        }
        break;
      }
    }
    // This is a register plus the @lo relocation. The base is the register
    // and the displacement is the global address.
    else if (Flags & PPC::MOF_RPlusLo) {
      Disp = N.getOperand(i: 1).getOperand(i: 0); // The global address.
      assert(Disp.getOpcode() == ISD::TargetGlobalAddress ||
             Disp.getOpcode() == ISD::TargetGlobalTLSAddress ||
             Disp.getOpcode() == ISD::TargetConstantPool ||
             Disp.getOpcode() == ISD::TargetJumpTable);
      Base = N.getOperand(i: 0);
      break;
    }
    // This is a constant address at most 32 bits. The base will be
    // zero or load-immediate-shifted and the displacement will be
    // the low 16 bits of the address.
    else if (Flags & PPC::MOF_AddrIsSImm32) {
      auto *CN = cast<ConstantSDNode>(Val&: N);
      EVT CNType = CN->getValueType(ResNo: 0);
      uint64_t CNImm = CN->getZExtValue();
      // If this address fits entirely in a 16-bit sext immediate field, codegen
      // this as "d, 0".
      int16_t Imm;
      if (isIntS16Immediate(N: CN, Imm) && (!Align || isAligned(Lhs: *Align, SizeInBytes: Imm))) {
        Disp = DAG.getSignedTargetConstant(Val: Imm, DL, VT: CNType);
        Base = DAG.getRegister(Reg: Subtarget.isPPC64() ? PPC::ZERO8 : PPC::ZERO,
                               VT: CNType);
        break;
      }
      // Handle 32-bit sext immediate with LIS + Addr mode.
      if ((CNType == MVT::i32 || isInt<32>(x: CNImm)) &&
          (!Align || isAligned(Lhs: *Align, SizeInBytes: CNImm))) {
        int32_t Addr = (int32_t)CNImm;
        // Otherwise, break this down into LIS + Disp.
        Disp = DAG.getSignedTargetConstant(Val: (int16_t)Addr, DL, VT: MVT::i32);
        Base = DAG.getSignedTargetConstant(Val: (Addr - (int16_t)Addr) >> 16, DL,
                                           VT: MVT::i32);
        uint32_t LIS = CNType == MVT::i32 ? PPC::LIS : PPC::LIS8;
        Base = SDValue(DAG.getMachineNode(Opcode: LIS, dl: DL, VT: CNType, Op1: Base), 0);
        break;
      }
    }
    // Otherwise, the PPC::MOF_NotAddNorCst flag is set. Load/Store is
    // non-foldable.
    Disp = DAG.getTargetConstant(Val: 0, DL, VT: getPointerTy(DL: DAG.getDataLayout()));
    if (FrameIndexSDNode *FI = dyn_cast<FrameIndexSDNode>(Val&: N)) {
      Base = DAG.getTargetFrameIndex(FI: FI->getIndex(), VT: N.getValueType());
      fixupFuncForFI(DAG, FrameIdx: FI->getIndex(), VT: N.getValueType());
    } else
      Base = N;
    break;
  }
  case PPC::AM_PrefixDForm: {
    int64_t Imm34 = 0;
    unsigned Opcode = N.getOpcode();
    if (((Opcode == ISD::ADD) || (Opcode == ISD::OR)) &&
        (isIntS34Immediate(Op: N.getOperand(i: 1), Imm&: Imm34))) {
      // N is an Add/OR Node, and its operand is a 34-bit signed immediate.
      Disp = DAG.getSignedTargetConstant(Val: Imm34, DL, VT: N.getValueType());
      if (FrameIndexSDNode *FI = dyn_cast<FrameIndexSDNode>(Val: N.getOperand(i: 0)))
        Base = DAG.getTargetFrameIndex(FI: FI->getIndex(), VT: N.getValueType());
      else
        Base = N.getOperand(i: 0);
    } else if (isIntS34Immediate(Op: N, Imm&: Imm34)) {
      // The address is a 34-bit signed immediate.
      Disp = DAG.getSignedTargetConstant(Val: Imm34, DL, VT: N.getValueType());
      Base = DAG.getRegister(Reg: PPC::ZERO8, VT: N.getValueType());
    }
    break;
  }
  case PPC::AM_PCRel: {
    // When selecting PC-Relative instructions, "Base" is not utilized as
    // we select the address as [PC+imm].
    Disp = N;
    break;
  }
  case PPC::AM_None:
    break;
  default: { // By default, X-Form is always available to be selected.
    // When a frame index is not aligned, we also match by XForm.
    FrameIndexSDNode *FI = dyn_cast<FrameIndexSDNode>(Val&: N);
    Base = FI ? N : N.getOperand(i: 1);
    Disp = FI ? DAG.getRegister(Reg: Subtarget.isPPC64() ? PPC::ZERO8 : PPC::ZERO,
                          VT: N.getValueType())
              : N.getOperand(i: 0);
    break;
  }
  }
  return Mode;
}
20492
20493CCAssignFn *PPCTargetLowering::ccAssignFnForCall(CallingConv::ID CC,
20494 bool Return,
20495 bool IsVarArg) const {
20496 switch (CC) {
20497 case CallingConv::Cold:
20498 return (Return ? RetCC_PPC_Cold : CC_PPC64_ELF);
20499 default:
20500 return CC_PPC64_ELF;
20501 }
20502}
20503
20504bool PPCTargetLowering::shouldInlineQuadwordAtomics() const {
20505 return Subtarget.isPPC64() && Subtarget.hasQuadwordAtomics();
20506}
20507
20508TargetLowering::AtomicExpansionKind
20509PPCTargetLowering::shouldExpandAtomicRMWInIR(const AtomicRMWInst *AI) const {
20510 unsigned Size = AI->getType()->getPrimitiveSizeInBits();
20511 if (shouldInlineQuadwordAtomics() && Size == 128)
20512 return AtomicExpansionKind::MaskedIntrinsic;
20513
20514 switch (AI->getOperation()) {
20515 case AtomicRMWInst::UIncWrap:
20516 case AtomicRMWInst::UDecWrap:
20517 case AtomicRMWInst::USubCond:
20518 case AtomicRMWInst::USubSat:
20519 return AtomicExpansionKind::CmpXChg;
20520 default:
20521 return TargetLowering::shouldExpandAtomicRMWInIR(RMW: AI);
20522 }
20523
20524 llvm_unreachable("unreachable atomicrmw operation");
20525}
20526
20527TargetLowering::AtomicExpansionKind
20528PPCTargetLowering::shouldExpandAtomicCmpXchgInIR(
20529 const AtomicCmpXchgInst *AI) const {
20530 unsigned Size = AI->getNewValOperand()->getType()->getPrimitiveSizeInBits();
20531 if (shouldInlineQuadwordAtomics() && Size == 128)
20532 return AtomicExpansionKind::MaskedIntrinsic;
20533 return AtomicExpansionKind::LLSC;
20534}
20535
20536static Intrinsic::ID
20537getIntrinsicForAtomicRMWBinOp128(AtomicRMWInst::BinOp BinOp) {
20538 switch (BinOp) {
20539 default:
20540 llvm_unreachable("Unexpected AtomicRMW BinOp");
20541 case AtomicRMWInst::Xchg:
20542 return Intrinsic::ppc_atomicrmw_xchg_i128;
20543 case AtomicRMWInst::Add:
20544 return Intrinsic::ppc_atomicrmw_add_i128;
20545 case AtomicRMWInst::Sub:
20546 return Intrinsic::ppc_atomicrmw_sub_i128;
20547 case AtomicRMWInst::And:
20548 return Intrinsic::ppc_atomicrmw_and_i128;
20549 case AtomicRMWInst::Or:
20550 return Intrinsic::ppc_atomicrmw_or_i128;
20551 case AtomicRMWInst::Xor:
20552 return Intrinsic::ppc_atomicrmw_xor_i128;
20553 case AtomicRMWInst::Nand:
20554 return Intrinsic::ppc_atomicrmw_nand_i128;
20555 }
20556}
20557
/// Emit a call to the quadword (i128) atomicrmw intrinsic matching AI's
/// operation: the 128-bit increment is split into lo/hi i64 halves, and the
/// intrinsic's {lo, hi} result is reassembled into a single i128 value.
Value *PPCTargetLowering::emitMaskedAtomicRMWIntrinsic(
    IRBuilderBase &Builder, AtomicRMWInst *AI, Value *AlignedAddr, Value *Incr,
    Value *Mask, Value *ShiftAmt, AtomicOrdering Ord) const {
  assert(shouldInlineQuadwordAtomics() && "Only support quadword now");
  Module *M = Builder.GetInsertBlock()->getParent()->getParent();
  Type *ValTy = Incr->getType();
  assert(ValTy->getPrimitiveSizeInBits() == 128);
  Type *Int64Ty = Type::getInt64Ty(C&: M->getContext());
  // Split the 128-bit increment into low and high 64-bit halves.
  Value *IncrLo = Builder.CreateTrunc(V: Incr, DestTy: Int64Ty, Name: "incr_lo");
  Value *IncrHi =
      Builder.CreateTrunc(V: Builder.CreateLShr(LHS: Incr, RHS: 64), DestTy: Int64Ty, Name: "incr_hi");
  Value *LoHi = Builder.CreateIntrinsic(
      ID: getIntrinsicForAtomicRMWBinOp128(BinOp: AI->getOperation()), Types: {},
      Args: {AlignedAddr, IncrLo, IncrHi});
  // Recombine the {lo, hi} result into a single 128-bit value.
  Value *Lo = Builder.CreateExtractValue(Agg: LoHi, Idxs: 0, Name: "lo");
  Value *Hi = Builder.CreateExtractValue(Agg: LoHi, Idxs: 1, Name: "hi");
  Lo = Builder.CreateZExt(V: Lo, DestTy: ValTy, Name: "lo64");
  Hi = Builder.CreateZExt(V: Hi, DestTy: ValTy, Name: "hi64");
  return Builder.CreateOr(
      LHS: Lo, RHS: Builder.CreateShl(LHS: Hi, RHS: ConstantInt::get(Ty: ValTy, V: 64)), Name: "val64");
}
20579
/// Emit a call to the quadword (i128) cmpxchg intrinsic: the 128-bit compare
/// and new values are split into lo/hi i64 halves, the call is bracketed by
/// the leading/trailing fences required by the memory ordering, and the
/// intrinsic's {lo, hi} result is reassembled into a single i128 value.
Value *PPCTargetLowering::emitMaskedAtomicCmpXchgIntrinsic(
    IRBuilderBase &Builder, AtomicCmpXchgInst *CI, Value *AlignedAddr,
    Value *CmpVal, Value *NewVal, Value *Mask, AtomicOrdering Ord) const {
  assert(shouldInlineQuadwordAtomics() && "Only support quadword now");
  Module *M = Builder.GetInsertBlock()->getParent()->getParent();
  Type *ValTy = CmpVal->getType();
  assert(ValTy->getPrimitiveSizeInBits() == 128);
  Function *IntCmpXchg =
      Intrinsic::getOrInsertDeclaration(M, id: Intrinsic::ppc_cmpxchg_i128);
  Type *Int64Ty = Type::getInt64Ty(C&: M->getContext());
  // Split the 128-bit compare and new values into 64-bit halves.
  Value *CmpLo = Builder.CreateTrunc(V: CmpVal, DestTy: Int64Ty, Name: "cmp_lo");
  Value *CmpHi =
      Builder.CreateTrunc(V: Builder.CreateLShr(LHS: CmpVal, RHS: 64), DestTy: Int64Ty, Name: "cmp_hi");
  Value *NewLo = Builder.CreateTrunc(V: NewVal, DestTy: Int64Ty, Name: "new_lo");
  Value *NewHi =
      Builder.CreateTrunc(V: Builder.CreateLShr(LHS: NewVal, RHS: 64), DestTy: Int64Ty, Name: "new_hi");
  // The fences must bracket the intrinsic call itself.
  emitLeadingFence(Builder, Inst: CI, Ord);
  Value *LoHi =
      Builder.CreateCall(Callee: IntCmpXchg, Args: {AlignedAddr, CmpLo, CmpHi, NewLo, NewHi});
  emitTrailingFence(Builder, Inst: CI, Ord);
  // Recombine the {lo, hi} result into a single 128-bit value.
  Value *Lo = Builder.CreateExtractValue(Agg: LoHi, Idxs: 0, Name: "lo");
  Value *Hi = Builder.CreateExtractValue(Agg: LoHi, Idxs: 1, Name: "hi");
  Lo = Builder.CreateZExt(V: Lo, DestTy: ValTy, Name: "lo64");
  Hi = Builder.CreateZExt(V: Hi, DestTy: ValTy, Name: "hi64");
  return Builder.CreateOr(
      LHS: Lo, RHS: Builder.CreateShl(LHS: Hi, RHS: ConstantInt::get(Ty: ValTy, V: 64)), Name: "val64");
}
20607
// Report multiple condition registers whenever the subtarget tracks i1
// values in condition-register bits (independent of VT).
bool PPCTargetLowering::hasMultipleConditionRegisters(EVT VT) const {
  return Subtarget.useCRBits();
}
20611