//===-- PPCISelLowering.cpp - PPC DAG Lowering Implementation -------------===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
//
// This file implements the PPCISelLowering class.
//
//===----------------------------------------------------------------------===//

#include "PPCISelLowering.h"
#include "MCTargetDesc/PPCMCTargetDesc.h"
#include "MCTargetDesc/PPCPredicates.h"
#include "PPC.h"
#include "PPCCallingConv.h"
#include "PPCFrameLowering.h"
#include "PPCInstrInfo.h"
#include "PPCMachineFunctionInfo.h"
#include "PPCPerfectShuffle.h"
#include "PPCRegisterInfo.h"
#include "PPCSelectionDAGInfo.h"
#include "PPCSubtarget.h"
#include "PPCTargetMachine.h"
#include "llvm/ADT/APFloat.h"
#include "llvm/ADT/APInt.h"
#include "llvm/ADT/APSInt.h"
#include "llvm/ADT/ArrayRef.h"
#include "llvm/ADT/DenseMap.h"
#include "llvm/ADT/STLExtras.h"
#include "llvm/ADT/SmallPtrSet.h"
#include "llvm/ADT/SmallVector.h"
#include "llvm/ADT/Statistic.h"
#include "llvm/ADT/StringRef.h"
#include "llvm/CodeGen/CallingConvLower.h"
#include "llvm/CodeGen/ISDOpcodes.h"
#include "llvm/CodeGen/LivePhysRegs.h"
#include "llvm/CodeGen/MachineBasicBlock.h"
#include "llvm/CodeGen/MachineFrameInfo.h"
#include "llvm/CodeGen/MachineFunction.h"
#include "llvm/CodeGen/MachineInstr.h"
#include "llvm/CodeGen/MachineInstrBuilder.h"
#include "llvm/CodeGen/MachineJumpTableInfo.h"
#include "llvm/CodeGen/MachineLoopInfo.h"
#include "llvm/CodeGen/MachineMemOperand.h"
#include "llvm/CodeGen/MachineModuleInfo.h"
#include "llvm/CodeGen/MachineOperand.h"
#include "llvm/CodeGen/MachineRegisterInfo.h"
#include "llvm/CodeGen/SelectionDAG.h"
#include "llvm/CodeGen/SelectionDAGNodes.h"
#include "llvm/CodeGen/TargetInstrInfo.h"
#include "llvm/CodeGen/TargetLowering.h"
#include "llvm/CodeGen/TargetLoweringObjectFileImpl.h"
#include "llvm/CodeGen/TargetRegisterInfo.h"
#include "llvm/CodeGen/ValueTypes.h"
#include "llvm/CodeGenTypes/MachineValueType.h"
#include "llvm/IR/CallingConv.h"
#include "llvm/IR/Constant.h"
#include "llvm/IR/Constants.h"
#include "llvm/IR/DataLayout.h"
#include "llvm/IR/DebugLoc.h"
#include "llvm/IR/DerivedTypes.h"
#include "llvm/IR/Function.h"
#include "llvm/IR/GlobalValue.h"
#include "llvm/IR/IRBuilder.h"
#include "llvm/IR/Instructions.h"
#include "llvm/IR/Intrinsics.h"
#include "llvm/IR/IntrinsicsPowerPC.h"
#include "llvm/IR/Module.h"
#include "llvm/IR/Type.h"
#include "llvm/IR/Use.h"
#include "llvm/IR/Value.h"
#include "llvm/MC/MCContext.h"
#include "llvm/MC/MCExpr.h"
#include "llvm/MC/MCSectionXCOFF.h"
#include "llvm/MC/MCSymbolXCOFF.h"
#include "llvm/Support/AtomicOrdering.h"
#include "llvm/Support/BranchProbability.h"
#include "llvm/Support/Casting.h"
#include "llvm/Support/CodeGen.h"
#include "llvm/Support/CommandLine.h"
#include "llvm/Support/Compiler.h"
#include "llvm/Support/Debug.h"
#include "llvm/Support/ErrorHandling.h"
#include "llvm/Support/Format.h"
#include "llvm/Support/KnownBits.h"
#include "llvm/Support/MathExtras.h"
#include "llvm/Support/raw_ostream.h"
#include "llvm/Target/TargetMachine.h"
#include "llvm/Target/TargetOptions.h"
#include <algorithm>
#include <cassert>
#include <cstdint>
#include <iterator>
#include <list>
#include <optional>
#include <utility>
#include <vector>

using namespace llvm;

#define DEBUG_TYPE "ppc-lowering"

static cl::opt<bool> DisableP10StoreForward(
    "disable-p10-store-forward",
    cl::desc("disable P10 store forward-friendly conversion"), cl::Hidden,
    cl::init(false));

static cl::opt<bool> DisablePPCPreinc("disable-ppc-preinc",
cl::desc("disable preincrement load/store generation on PPC"), cl::Hidden);

static cl::opt<bool> DisableILPPref("disable-ppc-ilp-pref",
cl::desc("disable setting the node scheduling preference to ILP on PPC"), cl::Hidden);

static cl::opt<bool> DisablePPCUnaligned("disable-ppc-unaligned",
cl::desc("disable unaligned load/store generation on PPC"), cl::Hidden);

static cl::opt<bool> DisableSCO("disable-ppc-sco",
cl::desc("disable sibling call optimization on ppc"), cl::Hidden);

static cl::opt<bool> DisableInnermostLoopAlign32("disable-ppc-innermost-loop-align32",
cl::desc("don't always align innermost loop to 32 bytes on ppc"), cl::Hidden);

static cl::opt<bool> UseAbsoluteJumpTables("ppc-use-absolute-jumptables",
cl::desc("use absolute jump tables on ppc"), cl::Hidden);

static cl::opt<bool>
    DisablePerfectShuffle("ppc-disable-perfect-shuffle",
                          cl::desc("disable vector permute decomposition"),
                          cl::init(true), cl::Hidden);

cl::opt<bool> DisableAutoPairedVecSt(
    "disable-auto-paired-vec-st",
    cl::desc("disable automatically generated 32byte paired vector stores"),
    cl::init(true), cl::Hidden);

static cl::opt<unsigned> PPCMinimumJumpTableEntries(
    "ppc-min-jump-table-entries", cl::init(64), cl::Hidden,
    cl::desc("Set minimum number of entries to use a jump table on PPC"));

static cl::opt<unsigned> PPCMinimumBitTestCmps(
    "ppc-min-bit-test-cmps", cl::init(3), cl::Hidden,
    cl::desc("Set minimum of largest number of comparisons to use bit test for "
             "switch on PPC."));

static cl::opt<unsigned> PPCGatherAllAliasesMaxDepth(
    "ppc-gather-alias-max-depth", cl::init(18), cl::Hidden,
    cl::desc("max depth when checking alias info in GatherAllAliases()"));

static cl::opt<unsigned> PPCAIXTLSModelOptUseIEForLDLimit(
    "ppc-aix-shared-lib-tls-model-opt-limit", cl::init(1), cl::Hidden,
    cl::desc("Set inclusive limit count of TLS local-dynamic access(es) in a "
             "function to use initial-exec"));

STATISTIC(NumTailCalls, "Number of tail calls");
STATISTIC(NumSiblingCalls, "Number of sibling calls");
STATISTIC(ShufflesHandledWithVPERM,
          "Number of shuffles lowered to a VPERM or XXPERM");
STATISTIC(NumDynamicAllocaProbed, "Number of dynamic stack allocation probed");

static bool isNByteElemShuffleMask(ShuffleVectorSDNode *, unsigned, int);

static SDValue widenVec(SelectionDAG &DAG, SDValue Vec, const SDLoc &dl);

// A faster local-[exec|dynamic] TLS access sequence (enabled with the
// -maix-small-local-[exec|dynamic]-tls option) can be produced for TLS
// variables; consistent with the IBM XL compiler, we apply a max size of
// slightly under 32KB.
constexpr uint64_t AIXSmallTlsPolicySizeLimit = 32751;

// FIXME: Remove this once the bug has been fixed!
extern cl::opt<bool> ANDIGlueBug;

PPCTargetLowering::PPCTargetLowering(const PPCTargetMachine &TM,
                                     const PPCSubtarget &STI)
    : TargetLowering(TM, STI), Subtarget(STI) {
  // Initialize map that relates the PPC addressing modes to the computed flags
  // of a load/store instruction. The map is used to determine the optimal
  // addressing mode when selecting load and stores.
  initializeAddrModeMap();
  // On PPC32/64, arguments smaller than 4/8 bytes are extended, so all
  // arguments are at least 4/8 bytes aligned.
  bool isPPC64 = Subtarget.isPPC64();
  setMinStackArgumentAlignment(isPPC64 ? Align(8) : Align(4));
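  // For example (64-bit ELF ABIs): an i8 argument passed on the stack is
  // sign- or zero-extended by the caller and occupies a full 8-byte
  // doubleword in the parameter save area, which is why Align(8) is a safe
  // minimum here; the 32-bit ABIs use 4-byte words, hence Align(4).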
  const MVT RegVT = Subtarget.getScalarIntVT();

  // Set up the register classes.
  addRegisterClass(MVT::i32, &PPC::GPRCRegClass);
  if (!useSoftFloat()) {
    if (hasSPE()) {
      addRegisterClass(MVT::f32, &PPC::GPRCRegClass);
      // EFPU2 APU only supports f32
      if (!Subtarget.hasEFPU2())
        addRegisterClass(MVT::f64, &PPC::SPERCRegClass);
    } else {
      addRegisterClass(MVT::f32, &PPC::F4RCRegClass);
      addRegisterClass(MVT::f64, &PPC::F8RCRegClass);
    }
  }

  setOperationAction(ISD::UADDO, RegVT, Custom);
  setOperationAction(ISD::USUBO, RegVT, Custom);

  // PowerPC uses addo_carry, subo_carry to propagate carry.
  setOperationAction(ISD::UADDO_CARRY, RegVT, Custom);
  setOperationAction(ISD::USUBO_CARRY, RegVT, Custom);

  // On P10, the default lowering generates better code using the
  // setbc instruction.
  if (!Subtarget.hasP10Vector()) {
    setOperationAction(ISD::SSUBO, MVT::i32, Custom);
    setOperationAction(ISD::SADDO, MVT::i32, Custom);
    if (isPPC64) {
      setOperationAction(ISD::SSUBO, MVT::i64, Custom);
      setOperationAction(ISD::SADDO, MVT::i64, Custom);
    }
  }

  // Match BITREVERSE to customized fast code sequence in the td file.
  setOperationAction(ISD::BITREVERSE, MVT::i32, Legal);
  setOperationAction(ISD::BITREVERSE, MVT::i64, Legal);

  // Sub-word ATOMIC_CMP_SWAP needs to ensure that the input is zero-extended.
  setOperationAction(ISD::ATOMIC_CMP_SWAP, MVT::i32, Custom);

  // Custom lower inline assembly to check for special registers.
  setOperationAction(ISD::INLINEASM, MVT::Other, Custom);
  setOperationAction(ISD::INLINEASM_BR, MVT::Other, Custom);

  // PowerPC has an i16 but no i8 (or i1) SEXTLOAD.
  for (MVT VT : MVT::integer_valuetypes()) {
    setLoadExtAction(ISD::SEXTLOAD, VT, MVT::i1, Promote);
    setLoadExtAction(ISD::SEXTLOAD, VT, MVT::i8, Expand);
  }

  setTruncStoreAction(MVT::f128, MVT::f16, Expand);
  setOperationAction(ISD::FP_TO_FP16, MVT::f128, Expand);

  if (Subtarget.isISA3_0()) {
    setLoadExtAction(ISD::EXTLOAD, MVT::f128, MVT::f16, Legal);
    setLoadExtAction(ISD::EXTLOAD, MVT::f64, MVT::f16, Legal);
    setLoadExtAction(ISD::EXTLOAD, MVT::f32, MVT::f16, Legal);
    setTruncStoreAction(MVT::f64, MVT::f16, Legal);
    setTruncStoreAction(MVT::f32, MVT::f16, Legal);
  } else {
    // No extending loads from f16 or HW conversions back and forth.
    setLoadExtAction(ISD::EXTLOAD, MVT::f128, MVT::f16, Expand);
    setOperationAction(ISD::FP16_TO_FP, MVT::f128, Expand);
    setLoadExtAction(ISD::EXTLOAD, MVT::f64, MVT::f16, Expand);
    setOperationAction(ISD::FP16_TO_FP, MVT::f64, Expand);
    setOperationAction(ISD::FP_TO_FP16, MVT::f64, Expand);
    setLoadExtAction(ISD::EXTLOAD, MVT::f32, MVT::f16, Expand);
    setOperationAction(ISD::FP16_TO_FP, MVT::f32, Expand);
    setOperationAction(ISD::FP_TO_FP16, MVT::f32, Expand);
    setTruncStoreAction(MVT::f64, MVT::f16, Expand);
    setTruncStoreAction(MVT::f32, MVT::f16, Expand);
  }

  setTruncStoreAction(MVT::f64, MVT::f32, Expand);

  // PowerPC has pre-inc loads and stores.
  setIndexedLoadAction(ISD::PRE_INC, MVT::i1, Legal);
  setIndexedLoadAction(ISD::PRE_INC, MVT::i8, Legal);
  setIndexedLoadAction(ISD::PRE_INC, MVT::i16, Legal);
  setIndexedLoadAction(ISD::PRE_INC, MVT::i32, Legal);
  setIndexedLoadAction(ISD::PRE_INC, MVT::i64, Legal);
  setIndexedStoreAction(ISD::PRE_INC, MVT::i1, Legal);
  setIndexedStoreAction(ISD::PRE_INC, MVT::i8, Legal);
  setIndexedStoreAction(ISD::PRE_INC, MVT::i16, Legal);
  setIndexedStoreAction(ISD::PRE_INC, MVT::i32, Legal);
  setIndexedStoreAction(ISD::PRE_INC, MVT::i64, Legal);
  if (!Subtarget.hasSPE()) {
    setIndexedLoadAction(ISD::PRE_INC, MVT::f32, Legal);
    setIndexedLoadAction(ISD::PRE_INC, MVT::f64, Legal);
    setIndexedStoreAction(ISD::PRE_INC, MVT::f32, Legal);
    setIndexedStoreAction(ISD::PRE_INC, MVT::f64, Legal);
  }

  if (Subtarget.useCRBits()) {
    setOperationAction(ISD::SIGN_EXTEND_INREG, MVT::i1, Expand);

    if (isPPC64 || Subtarget.hasFPCVT()) {
      setOperationAction(ISD::STRICT_SINT_TO_FP, MVT::i1, Promote);
      AddPromotedToType(ISD::STRICT_SINT_TO_FP, MVT::i1, RegVT);
      setOperationAction(ISD::STRICT_UINT_TO_FP, MVT::i1, Promote);
      AddPromotedToType(ISD::STRICT_UINT_TO_FP, MVT::i1, RegVT);

      setOperationAction(ISD::SINT_TO_FP, MVT::i1, Promote);
      AddPromotedToType(ISD::SINT_TO_FP, MVT::i1, RegVT);
      setOperationAction(ISD::UINT_TO_FP, MVT::i1, Promote);
      AddPromotedToType(ISD::UINT_TO_FP, MVT::i1, RegVT);

      setOperationAction(ISD::STRICT_FP_TO_SINT, MVT::i1, Promote);
      AddPromotedToType(ISD::STRICT_FP_TO_SINT, MVT::i1, RegVT);
      setOperationAction(ISD::STRICT_FP_TO_UINT, MVT::i1, Promote);
      AddPromotedToType(ISD::STRICT_FP_TO_UINT, MVT::i1, RegVT);

      setOperationAction(ISD::FP_TO_SINT, MVT::i1, Promote);
      AddPromotedToType(ISD::FP_TO_SINT, MVT::i1, RegVT);
      setOperationAction(ISD::FP_TO_UINT, MVT::i1, Promote);
      AddPromotedToType(ISD::FP_TO_UINT, MVT::i1, RegVT);
    } else {
      setOperationAction(ISD::STRICT_SINT_TO_FP, MVT::i1, Custom);
      setOperationAction(ISD::STRICT_UINT_TO_FP, MVT::i1, Custom);
      setOperationAction(ISD::SINT_TO_FP, MVT::i1, Custom);
      setOperationAction(ISD::UINT_TO_FP, MVT::i1, Custom);
    }

    // PowerPC does not support direct load/store of condition registers.
    setOperationAction(ISD::LOAD, MVT::i1, Custom);
    setOperationAction(ISD::STORE, MVT::i1, Custom);

    // FIXME: Remove this once the ANDI glue bug is fixed:
    if (ANDIGlueBug)
      setOperationAction(ISD::TRUNCATE, MVT::i1, Custom);

    for (MVT VT : MVT::integer_valuetypes()) {
      setLoadExtAction(ISD::SEXTLOAD, VT, MVT::i1, Promote);
      setLoadExtAction(ISD::ZEXTLOAD, VT, MVT::i1, Promote);
      setTruncStoreAction(VT, MVT::i1, Expand);
    }

    addRegisterClass(MVT::i1, &PPC::CRBITRCRegClass);
  }

  // Expand ppcf128 to i32 by hand for the benefit of llvm-gcc bootstrap on
  // PPC (the libcall is not available).
  setOperationAction(ISD::FP_TO_SINT, MVT::ppcf128, Custom);
  setOperationAction(ISD::FP_TO_UINT, MVT::ppcf128, Custom);
  setOperationAction(ISD::STRICT_FP_TO_SINT, MVT::ppcf128, Custom);
  setOperationAction(ISD::STRICT_FP_TO_UINT, MVT::ppcf128, Custom);

  // We do not currently implement these libm ops for PowerPC.
  setOperationAction(ISD::FFLOOR, MVT::ppcf128, Expand);
  setOperationAction(ISD::FCEIL, MVT::ppcf128, Expand);
  setOperationAction(ISD::FTRUNC, MVT::ppcf128, Expand);
  setOperationAction(ISD::FRINT, MVT::ppcf128, Expand);
  setOperationAction(ISD::FNEARBYINT, MVT::ppcf128, Expand);
  setOperationAction(ISD::FREM, MVT::ppcf128, LibCall);

  // PowerPC has no SREM/UREM instructions unless we are on P9. On P9 we may
  // use a hardware instruction to compute the remainder. When the result of
  // both the remainder and the division is required, it is more efficient to
  // compute the remainder from the result of the division rather than use the
  // remainder instruction. The instructions are legalized directly because
  // the DivRemPairsPass performs the transformation at the IR level.
  if (Subtarget.isISA3_0()) {
    setOperationAction(ISD::SREM, MVT::i32, Legal);
    setOperationAction(ISD::UREM, MVT::i32, Legal);
    setOperationAction(ISD::SREM, MVT::i64, Legal);
    setOperationAction(ISD::UREM, MVT::i64, Legal);
  } else {
    setOperationAction(ISD::SREM, MVT::i32, Expand);
    setOperationAction(ISD::UREM, MVT::i32, Expand);
    setOperationAction(ISD::SREM, MVT::i64, Expand);
    setOperationAction(ISD::UREM, MVT::i64, Expand);
  }
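
  // Illustrative example (not exercised by the code above): with the Legal
  // actions, IR such as
  //   %r = srem i32 %a, %b
  // selects directly to the ISA 3.0 modulo instructions (modsw here; moduw,
  // modsd and modud cover the other cases). With Expand, a
  // divide/multiply/subtract sequence is emitted instead, and when both the
  // quotient and the remainder are live, DivRemPairsPass has already
  // rewritten the remainder as a - (a / b) * b at the IR level.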

  // Don't use SMUL_LOHI/UMUL_LOHI or SDIVREM/UDIVREM to lower SREM/UREM.
  setOperationAction(ISD::UMUL_LOHI, MVT::i32, Expand);
  setOperationAction(ISD::SMUL_LOHI, MVT::i32, Expand);
  setOperationAction(ISD::UMUL_LOHI, MVT::i64, Expand);
  setOperationAction(ISD::SMUL_LOHI, MVT::i64, Expand);
  setOperationAction(ISD::UDIVREM, MVT::i32, Expand);
  setOperationAction(ISD::SDIVREM, MVT::i32, Expand);
  setOperationAction(ISD::UDIVREM, MVT::i64, Expand);
  setOperationAction(ISD::SDIVREM, MVT::i64, Expand);

  // Handle constrained floating-point operations of scalar.
  // TODO: Handle SPE specific operation.
  setOperationAction(ISD::STRICT_FADD, MVT::f32, Legal);
  setOperationAction(ISD::STRICT_FSUB, MVT::f32, Legal);
  setOperationAction(ISD::STRICT_FMUL, MVT::f32, Legal);
  setOperationAction(ISD::STRICT_FDIV, MVT::f32, Legal);
  setOperationAction(ISD::STRICT_FP_ROUND, MVT::f32, Legal);

  setOperationAction(ISD::STRICT_FADD, MVT::f64, Legal);
  setOperationAction(ISD::STRICT_FSUB, MVT::f64, Legal);
  setOperationAction(ISD::STRICT_FMUL, MVT::f64, Legal);
  setOperationAction(ISD::STRICT_FDIV, MVT::f64, Legal);

  if (!Subtarget.hasSPE()) {
    setOperationAction(ISD::STRICT_FMA, MVT::f32, Legal);
    setOperationAction(ISD::STRICT_FMA, MVT::f64, Legal);
  }

  if (Subtarget.hasVSX()) {
    setOperationAction(ISD::STRICT_FRINT, MVT::f32, Legal);
    setOperationAction(ISD::STRICT_FRINT, MVT::f64, Legal);
  }

  if (Subtarget.hasFSQRT()) {
    setOperationAction(ISD::STRICT_FSQRT, MVT::f32, Legal);
    setOperationAction(ISD::STRICT_FSQRT, MVT::f64, Legal);
  }

  if (Subtarget.hasFPRND()) {
    setOperationAction(ISD::STRICT_FFLOOR, MVT::f32, Legal);
    setOperationAction(ISD::STRICT_FCEIL, MVT::f32, Legal);
    setOperationAction(ISD::STRICT_FTRUNC, MVT::f32, Legal);
    setOperationAction(ISD::STRICT_FROUND, MVT::f32, Legal);

    setOperationAction(ISD::STRICT_FFLOOR, MVT::f64, Legal);
    setOperationAction(ISD::STRICT_FCEIL, MVT::f64, Legal);
    setOperationAction(ISD::STRICT_FTRUNC, MVT::f64, Legal);
    setOperationAction(ISD::STRICT_FROUND, MVT::f64, Legal);
  }

  // We don't support sin/cos/sqrt/fmod/pow
  setOperationAction(ISD::FSIN , MVT::f64, Expand);
  setOperationAction(ISD::FCOS , MVT::f64, Expand);
  setOperationAction(ISD::FSINCOS, MVT::f64, Expand);
  setOperationAction(ISD::FREM, MVT::f64, LibCall);
  setOperationAction(ISD::FPOW , MVT::f64, Expand);
  setOperationAction(ISD::FSIN , MVT::f32, Expand);
  setOperationAction(ISD::FCOS , MVT::f32, Expand);
  setOperationAction(ISD::FSINCOS, MVT::f32, Expand);
  setOperationAction(ISD::FREM, MVT::f32, LibCall);
  setOperationAction(ISD::FPOW , MVT::f32, Expand);

  // MASS transformation for LLVM intrinsics with replicating fast-math flag,
  // to be consistent with the PPCGenScalarMASSEntries pass.
  if (TM.getOptLevel() == CodeGenOptLevel::Aggressive) {
    setOperationAction(ISD::FSIN , MVT::f64, Custom);
    setOperationAction(ISD::FCOS , MVT::f64, Custom);
    setOperationAction(ISD::FPOW , MVT::f64, Custom);
    setOperationAction(ISD::FLOG, MVT::f64, Custom);
    setOperationAction(ISD::FLOG10, MVT::f64, Custom);
    setOperationAction(ISD::FEXP, MVT::f64, Custom);
    setOperationAction(ISD::FSIN , MVT::f32, Custom);
    setOperationAction(ISD::FCOS , MVT::f32, Custom);
    setOperationAction(ISD::FPOW , MVT::f32, Custom);
    setOperationAction(ISD::FLOG, MVT::f32, Custom);
    setOperationAction(ISD::FLOG10, MVT::f32, Custom);
    setOperationAction(ISD::FEXP, MVT::f32, Custom);
  }

  if (Subtarget.hasSPE()) {
    setOperationAction(ISD::FMA , MVT::f64, Expand);
    setOperationAction(ISD::FMA , MVT::f32, Expand);
  } else {
    setOperationAction(ISD::FMA , MVT::f64, Legal);
    setOperationAction(ISD::FMA , MVT::f32, Legal);
    setOperationAction(ISD::GET_ROUNDING, MVT::i32, Custom);
    setOperationAction(ISD::SET_ROUNDING, MVT::Other, Custom);
  }
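
  // Rough sketch of what the Custom actions above translate to: GET_ROUNDING
  // (llvm.get.rounding) is lowered by reading the FPSCR with mffs and
  // remapping its RN (rounding-control) bits to the FLT_ROUNDS-style
  // encoding, while SET_ROUNDING rewrites the same RN field (e.g. with
  // mtfsb0/mtfsb1 or mtfsf) rather than calling into libc.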

  if (Subtarget.hasSPE())
    setLoadExtAction(ISD::EXTLOAD, MVT::f64, MVT::f32, Expand);

  // If we're enabling GP optimizations, use hardware square root
  if (!Subtarget.hasFSQRT() && !(Subtarget.hasFRSQRTE() && Subtarget.hasFRE()))
    setOperationAction(ISD::FSQRT, MVT::f64, Expand);

  if (!Subtarget.hasFSQRT() &&
      !(Subtarget.hasFRSQRTES() && Subtarget.hasFRES()))
    setOperationAction(ISD::FSQRT, MVT::f32, Expand);

  if (Subtarget.hasFCPSGN()) {
    setOperationAction(ISD::FCOPYSIGN, MVT::f64, Legal);
    setOperationAction(ISD::FCOPYSIGN, MVT::f32, Legal);
  } else {
    setOperationAction(ISD::FCOPYSIGN, MVT::f64, Expand);
    setOperationAction(ISD::FCOPYSIGN, MVT::f32, Expand);
  }

  if (Subtarget.hasFPRND()) {
    setOperationAction(ISD::FFLOOR, MVT::f64, Legal);
    setOperationAction(ISD::FCEIL, MVT::f64, Legal);
    setOperationAction(ISD::FTRUNC, MVT::f64, Legal);
    setOperationAction(ISD::FROUND, MVT::f64, Legal);

    setOperationAction(ISD::FFLOOR, MVT::f32, Legal);
    setOperationAction(ISD::FCEIL, MVT::f32, Legal);
    setOperationAction(ISD::FTRUNC, MVT::f32, Legal);
    setOperationAction(ISD::FROUND, MVT::f32, Legal);
  }

  // Prior to P10, PowerPC does not have BSWAP, but we can use vector BSWAP
  // instruction xxbrd to speed up scalar BSWAP64.
  if (Subtarget.isISA3_1()) {
    setOperationAction(ISD::BSWAP, MVT::i32, Legal);
    setOperationAction(ISD::BSWAP, MVT::i64, Legal);
  } else {
    setOperationAction(ISD::BSWAP, MVT::i32, Expand);
    setOperationAction(ISD::BSWAP, MVT::i64,
                       (Subtarget.hasP9Vector() && isPPC64) ? Custom : Expand);
  }
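
  // Rough sketch of the Custom case above: on a 64-bit P9 subtarget a scalar
  // BSWAP64 can be done by moving the GPR into a vector register (mtvsrd),
  // byte-reversing it with xxbrd, and moving it back (mfvsrd), which is
  // cheaper than the generic shift-and-mask expansion.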

  // CTPOP and CTTZ were introduced in P8 and P9, respectively.
  if (Subtarget.isISA3_0()) {
    setOperationAction(ISD::CTTZ , MVT::i32 , Legal);
    setOperationAction(ISD::CTTZ , MVT::i64 , Legal);
  } else {
    setOperationAction(ISD::CTTZ , MVT::i32 , Expand);
    setOperationAction(ISD::CTTZ , MVT::i64 , Expand);
  }

  if (Subtarget.hasPOPCNTD() == PPCSubtarget::POPCNTD_Fast) {
    setOperationAction(ISD::CTPOP, MVT::i32 , Legal);
    setOperationAction(ISD::CTPOP, MVT::i64 , Legal);
  } else {
    setOperationAction(ISD::CTPOP, MVT::i32 , Expand);
    setOperationAction(ISD::CTPOP, MVT::i64 , Expand);
  }

  // PowerPC does not have ROTR
  setOperationAction(ISD::ROTR, MVT::i32 , Expand);
  setOperationAction(ISD::ROTR, MVT::i64 , Expand);

  if (!Subtarget.useCRBits()) {
    // PowerPC does not have Select
    setOperationAction(ISD::SELECT, MVT::i32, Expand);
    setOperationAction(ISD::SELECT, MVT::i64, Expand);
    setOperationAction(ISD::SELECT, MVT::f32, Expand);
    setOperationAction(ISD::SELECT, MVT::f64, Expand);
  }

  // PowerPC wants to turn select_cc of FP into fsel when possible.
  setOperationAction(ISD::SELECT_CC, MVT::f32, Custom);
  setOperationAction(ISD::SELECT_CC, MVT::f64, Custom);

  // PowerPC wants to optimize integer setcc a bit
  if (!Subtarget.useCRBits())
    setOperationAction(ISD::SETCC, MVT::i32, Custom);

  if (Subtarget.hasFPU()) {
    setOperationAction(ISD::STRICT_FSETCC, MVT::f32, Legal);
    setOperationAction(ISD::STRICT_FSETCC, MVT::f64, Legal);
    setOperationAction(ISD::STRICT_FSETCC, MVT::f128, Legal);

    setOperationAction(ISD::STRICT_FSETCCS, MVT::f32, Legal);
    setOperationAction(ISD::STRICT_FSETCCS, MVT::f64, Legal);
    setOperationAction(ISD::STRICT_FSETCCS, MVT::f128, Legal);
  }

  // PowerPC does not have BRCOND which requires SetCC
  if (!Subtarget.useCRBits())
    setOperationAction(ISD::BRCOND, MVT::Other, Expand);

  setOperationAction(ISD::BR_JT, MVT::Other, Expand);

  if (Subtarget.hasSPE()) {
    // SPE has built-in conversions
    setOperationAction(ISD::STRICT_FP_TO_SINT, MVT::i32, Legal);
    setOperationAction(ISD::STRICT_SINT_TO_FP, MVT::i32, Legal);
    setOperationAction(ISD::STRICT_UINT_TO_FP, MVT::i32, Legal);
    setOperationAction(ISD::FP_TO_SINT, MVT::i32, Legal);
    setOperationAction(ISD::SINT_TO_FP, MVT::i32, Legal);
    setOperationAction(ISD::UINT_TO_FP, MVT::i32, Legal);

    // SPE supports signaling compare of f32/f64.
    setOperationAction(ISD::STRICT_FSETCCS, MVT::f32, Legal);
    setOperationAction(ISD::STRICT_FSETCCS, MVT::f64, Legal);
  } else {
    // PowerPC turns FP_TO_SINT into FCTIWZ and some load/stores.
    setOperationAction(ISD::STRICT_FP_TO_SINT, MVT::i32, Custom);
    setOperationAction(ISD::FP_TO_SINT, MVT::i32, Custom);

    // PowerPC does not have [U|S]INT_TO_FP
    setOperationAction(ISD::STRICT_SINT_TO_FP, MVT::i32, Expand);
    setOperationAction(ISD::STRICT_UINT_TO_FP, MVT::i32, Expand);
    setOperationAction(ISD::SINT_TO_FP, MVT::i32, Expand);
    setOperationAction(ISD::UINT_TO_FP, MVT::i32, Expand);
  }

  if (Subtarget.hasDirectMove() && isPPC64) {
    setOperationAction(ISD::BITCAST, MVT::f32, Legal);
    setOperationAction(ISD::BITCAST, MVT::i32, Legal);
    setOperationAction(ISD::BITCAST, MVT::i64, Legal);
    setOperationAction(ISD::BITCAST, MVT::f64, Legal);

    setOperationAction(ISD::STRICT_LRINT, MVT::f64, Custom);
    setOperationAction(ISD::STRICT_LRINT, MVT::f32, Custom);
    setOperationAction(ISD::STRICT_LLRINT, MVT::f64, Custom);
    setOperationAction(ISD::STRICT_LLRINT, MVT::f32, Custom);
    setOperationAction(ISD::STRICT_LROUND, MVT::f64, Custom);
    setOperationAction(ISD::STRICT_LROUND, MVT::f32, Custom);
    setOperationAction(ISD::STRICT_LLROUND, MVT::f64, Custom);
    setOperationAction(ISD::STRICT_LLROUND, MVT::f32, Custom);
  } else {
    setOperationAction(ISD::BITCAST, MVT::f32, Expand);
    setOperationAction(ISD::BITCAST, MVT::i32, Expand);
    setOperationAction(ISD::BITCAST, MVT::i64, Expand);
    setOperationAction(ISD::BITCAST, MVT::f64, Expand);
  }

  // We cannot sextinreg(i1). Expand to shifts.
  setOperationAction(ISD::SIGN_EXTEND_INREG, MVT::i1, Expand);

  // Custom handling for PowerPC ucmp instruction
  setOperationAction(ISD::UCMP, MVT::i32, Custom);
  setOperationAction(ISD::UCMP, MVT::i64, isPPC64 ? Custom : Expand);

  // NOTE: EH_SJLJ_SETJMP/_LONGJMP is NOT intended to support SjLj exception
  // handling here; it is a light-weight setjmp/longjmp replacement used to
  // support continuations, user-level threading, etc. As a result, no other
  // SjLj exception interfaces are implemented, so please don't build your own
  // exception handling on top of them.
  // LLVM/Clang supports zero-cost DWARF exception handling.
  setOperationAction(ISD::EH_SJLJ_SETJMP, MVT::i32, Custom);
  setOperationAction(ISD::EH_SJLJ_LONGJMP, MVT::Other, Custom);

  // We want to legalize GlobalAddress and ConstantPool nodes into the
  // appropriate instructions to materialize the address.
  setOperationAction(ISD::GlobalAddress, MVT::i32, Custom);
  setOperationAction(ISD::GlobalTLSAddress, MVT::i32, Custom);
  setOperationAction(ISD::BlockAddress, MVT::i32, Custom);
  setOperationAction(ISD::ConstantPool, MVT::i32, Custom);
  setOperationAction(ISD::JumpTable, MVT::i32, Custom);
  setOperationAction(ISD::GlobalAddress, MVT::i64, Custom);
  setOperationAction(ISD::GlobalTLSAddress, MVT::i64, Custom);
  setOperationAction(ISD::BlockAddress, MVT::i64, Custom);
  setOperationAction(ISD::ConstantPool, MVT::i64, Custom);
  setOperationAction(ISD::JumpTable, MVT::i64, Custom);

  // TRAP is legal.
  setOperationAction(ISD::TRAP, MVT::Other, Legal);

  // TRAMPOLINE is custom lowered.
  setOperationAction(ISD::INIT_TRAMPOLINE, MVT::Other, Custom);
  setOperationAction(ISD::ADJUST_TRAMPOLINE, MVT::Other, Custom);

  // VASTART needs to be custom lowered to use the VarArgsFrameIndex
  setOperationAction(ISD::VASTART , MVT::Other, Custom);

  if (Subtarget.is64BitELFABI()) {
    // VAARG always uses double-word chunks, so promote anything smaller.
    setOperationAction(ISD::VAARG, MVT::i1, Promote);
    AddPromotedToType(ISD::VAARG, MVT::i1, MVT::i64);
    setOperationAction(ISD::VAARG, MVT::i8, Promote);
    AddPromotedToType(ISD::VAARG, MVT::i8, MVT::i64);
    setOperationAction(ISD::VAARG, MVT::i16, Promote);
    AddPromotedToType(ISD::VAARG, MVT::i16, MVT::i64);
    setOperationAction(ISD::VAARG, MVT::i32, Promote);
    AddPromotedToType(ISD::VAARG, MVT::i32, MVT::i64);
    setOperationAction(ISD::VAARG, MVT::Other, Expand);
  } else if (Subtarget.is32BitELFABI()) {
    // VAARG is custom lowered with the 32-bit SVR4 ABI.
    setOperationAction(ISD::VAARG, MVT::Other, Custom);
    setOperationAction(ISD::VAARG, MVT::i64, Custom);
  } else
    setOperationAction(ISD::VAARG, MVT::Other, Expand);

  // VACOPY is custom lowered with the 32-bit SVR4 ABI.
  if (Subtarget.is32BitELFABI())
    setOperationAction(ISD::VACOPY , MVT::Other, Custom);
  else
    setOperationAction(ISD::VACOPY , MVT::Other, Expand);

  // Use the default implementation.
  setOperationAction(ISD::VAEND , MVT::Other, Expand);
  setOperationAction(ISD::STACKSAVE , MVT::Other, Expand);
  setOperationAction(ISD::STACKRESTORE , MVT::Other, Custom);
  setOperationAction(ISD::DYNAMIC_STACKALLOC, MVT::i32 , Custom);
  setOperationAction(ISD::DYNAMIC_STACKALLOC, MVT::i64 , Custom);
  setOperationAction(ISD::GET_DYNAMIC_AREA_OFFSET, MVT::i32, Custom);
  setOperationAction(ISD::GET_DYNAMIC_AREA_OFFSET, MVT::i64, Custom);
  setOperationAction(ISD::EH_DWARF_CFA, MVT::i32, Custom);
  setOperationAction(ISD::EH_DWARF_CFA, MVT::i64, Custom);

  if (Subtarget.isISA3_0() && isPPC64) {
    setOperationAction(ISD::VP_STORE, MVT::v16i1, Custom);
    setOperationAction(ISD::VP_STORE, MVT::v8i1, Custom);
    setOperationAction(ISD::VP_STORE, MVT::v4i1, Custom);
    setOperationAction(ISD::VP_STORE, MVT::v2i1, Custom);
    setOperationAction(ISD::VP_LOAD, MVT::v16i1, Custom);
    setOperationAction(ISD::VP_LOAD, MVT::v8i1, Custom);
    setOperationAction(ISD::VP_LOAD, MVT::v4i1, Custom);
    setOperationAction(ISD::VP_LOAD, MVT::v2i1, Custom);
  }

  // We want to custom lower some of our intrinsics.
  setOperationAction(ISD::INTRINSIC_WO_CHAIN, MVT::Other, Custom);
  setOperationAction(ISD::INTRINSIC_WO_CHAIN, MVT::f64, Custom);
  setOperationAction(ISD::INTRINSIC_WO_CHAIN, MVT::ppcf128, Custom);
  setOperationAction(ISD::INTRINSIC_WO_CHAIN, MVT::v4f32, Custom);
  setOperationAction(ISD::INTRINSIC_WO_CHAIN, MVT::v2f64, Custom);

  // To handle counter-based loop conditions.
  setOperationAction(ISD::INTRINSIC_W_CHAIN, MVT::i1, Custom);
  setOperationAction(ISD::INTRINSIC_W_CHAIN, MVT::Other, Custom);

  setOperationAction(ISD::INTRINSIC_VOID, MVT::i8, Custom);
  setOperationAction(ISD::INTRINSIC_VOID, MVT::i16, Custom);
  setOperationAction(ISD::INTRINSIC_VOID, MVT::i32, Custom);
  setOperationAction(ISD::INTRINSIC_VOID, MVT::Other, Custom);

  // Comparisons that require checking two conditions.
  if (Subtarget.hasSPE()) {
    setCondCodeAction(ISD::SETO, MVT::f32, Expand);
    setCondCodeAction(ISD::SETO, MVT::f64, Expand);
    setCondCodeAction(ISD::SETUO, MVT::f32, Expand);
    setCondCodeAction(ISD::SETUO, MVT::f64, Expand);
  }
  setCondCodeAction(ISD::SETULT, MVT::f32, Expand);
  setCondCodeAction(ISD::SETULT, MVT::f64, Expand);
  setCondCodeAction(ISD::SETUGT, MVT::f32, Expand);
  setCondCodeAction(ISD::SETUGT, MVT::f64, Expand);
  setCondCodeAction(ISD::SETUEQ, MVT::f32, Expand);
  setCondCodeAction(ISD::SETUEQ, MVT::f64, Expand);
  setCondCodeAction(ISD::SETOGE, MVT::f32, Expand);
  setCondCodeAction(ISD::SETOGE, MVT::f64, Expand);
  setCondCodeAction(ISD::SETOLE, MVT::f32, Expand);
  setCondCodeAction(ISD::SETOLE, MVT::f64, Expand);
  setCondCodeAction(ISD::SETONE, MVT::f32, Expand);
  setCondCodeAction(ISD::SETONE, MVT::f64, Expand);

  setOperationAction(ISD::STRICT_FP_EXTEND, MVT::f32, Legal);
  setOperationAction(ISD::STRICT_FP_EXTEND, MVT::f64, Legal);

  if (Subtarget.has64BitSupport()) {
    // They also have instructions for converting between i64 and fp.
    setOperationAction(ISD::STRICT_FP_TO_SINT, MVT::i64, Custom);
    setOperationAction(ISD::STRICT_FP_TO_UINT, MVT::i64, Expand);
    setOperationAction(ISD::STRICT_SINT_TO_FP, MVT::i64, Custom);
    setOperationAction(ISD::STRICT_UINT_TO_FP, MVT::i64, Expand);
    setOperationAction(ISD::FP_TO_SINT, MVT::i64, Custom);
    setOperationAction(ISD::FP_TO_UINT, MVT::i64, Expand);
    setOperationAction(ISD::SINT_TO_FP, MVT::i64, Custom);
    setOperationAction(ISD::UINT_TO_FP, MVT::i64, Expand);
    // This is just the low 32 bits of a (signed) fp->i64 conversion.
    // We cannot do this with Promote because i64 is not a legal type.
    setOperationAction(ISD::STRICT_FP_TO_UINT, MVT::i32, Custom);
    setOperationAction(ISD::FP_TO_UINT, MVT::i32, Custom);

    if (Subtarget.hasLFIWAX() || isPPC64) {
      setOperationAction(ISD::SINT_TO_FP, MVT::i32, Custom);
      setOperationAction(ISD::STRICT_SINT_TO_FP, MVT::i32, Custom);
    }
  } else {
    // PowerPC does not have FP_TO_UINT on 32-bit implementations.
    if (Subtarget.hasSPE()) {
      setOperationAction(ISD::STRICT_FP_TO_UINT, MVT::i32, Legal);
      setOperationAction(ISD::FP_TO_UINT, MVT::i32, Legal);
    } else {
      setOperationAction(ISD::STRICT_FP_TO_UINT, MVT::i32, Expand);
      setOperationAction(ISD::FP_TO_UINT, MVT::i32, Expand);
    }
  }

  // With the instructions enabled under FPCVT, we can do everything.
  if (Subtarget.hasFPCVT()) {
    if (Subtarget.has64BitSupport()) {
      setOperationAction(ISD::STRICT_FP_TO_SINT, MVT::i64, Custom);
      setOperationAction(ISD::STRICT_FP_TO_UINT, MVT::i64, Custom);
      setOperationAction(ISD::STRICT_SINT_TO_FP, MVT::i64, Custom);
      setOperationAction(ISD::STRICT_UINT_TO_FP, MVT::i64, Custom);
      setOperationAction(ISD::FP_TO_SINT, MVT::i64, Custom);
      setOperationAction(ISD::FP_TO_UINT, MVT::i64, Custom);
      setOperationAction(ISD::SINT_TO_FP, MVT::i64, Custom);
      setOperationAction(ISD::UINT_TO_FP, MVT::i64, Custom);
    }

    setOperationAction(ISD::STRICT_FP_TO_SINT, MVT::i32, Custom);
    setOperationAction(ISD::STRICT_FP_TO_UINT, MVT::i32, Custom);
    setOperationAction(ISD::STRICT_SINT_TO_FP, MVT::i32, Custom);
    setOperationAction(ISD::STRICT_UINT_TO_FP, MVT::i32, Custom);
    setOperationAction(ISD::FP_TO_SINT, MVT::i32, Custom);
    setOperationAction(ISD::FP_TO_UINT, MVT::i32, Custom);
    setOperationAction(ISD::SINT_TO_FP, MVT::i32, Custom);
    setOperationAction(ISD::UINT_TO_FP, MVT::i32, Custom);
  }
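
  // Note: FPCVT corresponds to the ISA 2.06 conversion instructions
  // (e.g. fctiwuz/fctiduz and fcfidu/fcfidus), which is what lets the
  // unsigned and 32-bit int<->fp conversions above be lowered directly
  // instead of requiring longer expansions.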

  if (Subtarget.use64BitRegs()) {
    // 64-bit PowerPC implementations can support i64 types directly
    addRegisterClass(MVT::i64, &PPC::G8RCRegClass);
    // BUILD_PAIR can't be handled natively, and should be expanded to shl/or
    setOperationAction(ISD::BUILD_PAIR, MVT::i64, Expand);
    // 64-bit PowerPC wants to expand i128 shifts itself.
    setOperationAction(ISD::SHL_PARTS, MVT::i64, Custom);
    setOperationAction(ISD::SRA_PARTS, MVT::i64, Custom);
    setOperationAction(ISD::SRL_PARTS, MVT::i64, Custom);
  } else {
    // 32-bit PowerPC wants to expand i64 shifts itself.
    setOperationAction(ISD::SHL_PARTS, MVT::i32, Custom);
    setOperationAction(ISD::SRA_PARTS, MVT::i32, Custom);
    setOperationAction(ISD::SRL_PARTS, MVT::i32, Custom);
  }

  // PowerPC has better expansions for funnel shifts than the generic
  // TargetLowering::expandFunnelShift.
  if (Subtarget.has64BitSupport()) {
    setOperationAction(ISD::FSHL, MVT::i64, Custom);
    setOperationAction(ISD::FSHR, MVT::i64, Custom);
  }
  setOperationAction(ISD::FSHL, MVT::i32, Custom);
  setOperationAction(ISD::FSHR, MVT::i32, Custom);

  if (Subtarget.hasVSX()) {
    setOperationAction(ISD::FMAXNUM_IEEE, MVT::f64, Legal);
    setOperationAction(ISD::FMAXNUM_IEEE, MVT::f32, Legal);
    setOperationAction(ISD::FMINNUM_IEEE, MVT::f64, Legal);
    setOperationAction(ISD::FMINNUM_IEEE, MVT::f32, Legal);
    setOperationAction(ISD::FMAXNUM, MVT::f64, Legal);
    setOperationAction(ISD::FMAXNUM, MVT::f32, Legal);
    setOperationAction(ISD::FMINNUM, MVT::f64, Legal);
    setOperationAction(ISD::FMINNUM, MVT::f32, Legal);
    setOperationAction(ISD::FCANONICALIZE, MVT::f64, Legal);
    setOperationAction(ISD::FCANONICALIZE, MVT::f32, Legal);
  }

  if (Subtarget.hasAltivec()) {
    for (MVT VT : { MVT::v16i8, MVT::v8i16, MVT::v4i32 }) {
      setOperationAction(ISD::AVGCEILS, VT, Legal);
      setOperationAction(ISD::AVGCEILU, VT, Legal);
      setOperationAction(ISD::SADDSAT, VT, Legal);
      setOperationAction(ISD::SSUBSAT, VT, Legal);
      setOperationAction(ISD::UADDSAT, VT, Legal);
      setOperationAction(ISD::USUBSAT, VT, Legal);
    }
    // First set operation action for all vector types to expand. Then we
    // will selectively turn on ones that can be effectively codegen'd.
    for (MVT VT : MVT::fixedlen_vector_valuetypes()) {
      // add/sub are legal for all supported vector VT's.
      setOperationAction(ISD::ADD, VT, Legal);
      setOperationAction(ISD::SUB, VT, Legal);

      // For v2i64, these are only valid with P8Vector. This is corrected after
      // the loop.
      if (VT.getSizeInBits() <= 128 && VT.getScalarSizeInBits() <= 64) {
        setOperationAction(ISD::SMAX, VT, Legal);
        setOperationAction(ISD::SMIN, VT, Legal);
        setOperationAction(ISD::UMAX, VT, Legal);
        setOperationAction(ISD::UMIN, VT, Legal);
      }
      else {
        setOperationAction(ISD::SMAX, VT, Expand);
        setOperationAction(ISD::SMIN, VT, Expand);
        setOperationAction(ISD::UMAX, VT, Expand);
        setOperationAction(ISD::UMIN, VT, Expand);
      }

      if (Subtarget.hasVSX()) {
        setOperationAction(ISD::FMAXNUM_IEEE, VT, Legal);
        setOperationAction(ISD::FMINNUM_IEEE, VT, Legal);
        setOperationAction(ISD::FMAXNUM, VT, Legal);
        setOperationAction(ISD::FMINNUM, VT, Legal);
        setOperationAction(ISD::FCANONICALIZE, VT, Legal);
      }

      // Vector instructions introduced in P8
      if (Subtarget.hasP8Altivec() && (VT.SimpleTy != MVT::v1i128)) {
        setOperationAction(ISD::CTPOP, VT, Legal);
        setOperationAction(ISD::CTLZ, VT, Legal);
      }
      else {
        setOperationAction(ISD::CTPOP, VT, Expand);
        setOperationAction(ISD::CTLZ, VT, Expand);
      }

      // Vector instructions introduced in P9
      if (Subtarget.hasP9Altivec() && (VT.SimpleTy != MVT::v1i128))
        setOperationAction(ISD::CTTZ, VT, Legal);
      else
        setOperationAction(ISD::CTTZ, VT, Expand);

      // We promote all shuffles to v16i8.
      setOperationAction(ISD::VECTOR_SHUFFLE, VT, Promote);
      AddPromotedToType (ISD::VECTOR_SHUFFLE, VT, MVT::v16i8);

      // We promote all non-typed operations to v4i32.
      setOperationAction(ISD::AND , VT, Promote);
      AddPromotedToType (ISD::AND , VT, MVT::v4i32);
      setOperationAction(ISD::OR , VT, Promote);
      AddPromotedToType (ISD::OR , VT, MVT::v4i32);
      setOperationAction(ISD::XOR , VT, Promote);
      AddPromotedToType (ISD::XOR , VT, MVT::v4i32);
      setOperationAction(ISD::LOAD , VT, Promote);
      AddPromotedToType (ISD::LOAD , VT, MVT::v4i32);
      setOperationAction(ISD::SELECT, VT, Promote);
      AddPromotedToType (ISD::SELECT, VT, MVT::v4i32);
      setOperationAction(ISD::VSELECT, VT, Legal);
      setOperationAction(ISD::SELECT_CC, VT, Promote);
      AddPromotedToType (ISD::SELECT_CC, VT, MVT::v4i32);
      setOperationAction(ISD::STORE, VT, Promote);
      AddPromotedToType (ISD::STORE, VT, MVT::v4i32);

      // No other operations are legal.
      setOperationAction(ISD::MUL , VT, Expand);
      setOperationAction(ISD::SDIV, VT, Expand);
      setOperationAction(ISD::SREM, VT, Expand);
      setOperationAction(ISD::UDIV, VT, Expand);
      setOperationAction(ISD::UREM, VT, Expand);
      setOperationAction(ISD::FDIV, VT, Expand);
      setOperationAction(ISD::FREM, VT, Expand);
      setOperationAction(ISD::FNEG, VT, Expand);
      setOperationAction(ISD::FSQRT, VT, Expand);
      setOperationAction(ISD::FLOG, VT, Expand);
      setOperationAction(ISD::FLOG10, VT, Expand);
      setOperationAction(ISD::FLOG2, VT, Expand);
      setOperationAction(ISD::FEXP, VT, Expand);
      setOperationAction(ISD::FEXP2, VT, Expand);
      setOperationAction(ISD::FSIN, VT, Expand);
      setOperationAction(ISD::FCOS, VT, Expand);
      setOperationAction(ISD::FABS, VT, Expand);
      setOperationAction(ISD::FFLOOR, VT, Expand);
      setOperationAction(ISD::FCEIL, VT, Expand);
      setOperationAction(ISD::FTRUNC, VT, Expand);
      setOperationAction(ISD::FRINT, VT, Expand);
      setOperationAction(ISD::FLDEXP, VT, Expand);
      setOperationAction(ISD::FNEARBYINT, VT, Expand);
      setOperationAction(ISD::EXTRACT_VECTOR_ELT, VT, Expand);
      setOperationAction(ISD::INSERT_VECTOR_ELT, VT, Expand);
      setOperationAction(ISD::BUILD_VECTOR, VT, Expand);
      setOperationAction(ISD::MULHU, VT, Expand);
      setOperationAction(ISD::MULHS, VT, Expand);
      setOperationAction(ISD::UMUL_LOHI, VT, Expand);
      setOperationAction(ISD::SMUL_LOHI, VT, Expand);
      setOperationAction(ISD::UDIVREM, VT, Expand);
      setOperationAction(ISD::SDIVREM, VT, Expand);
      setOperationAction(ISD::SCALAR_TO_VECTOR, VT, Expand);
      setOperationAction(ISD::FPOW, VT, Expand);
      setOperationAction(ISD::BSWAP, VT, Expand);
      setOperationAction(ISD::SIGN_EXTEND_INREG, VT, Expand);
      setOperationAction(ISD::ROTL, VT, Expand);
      setOperationAction(ISD::ROTR, VT, Expand);

      for (MVT InnerVT : MVT::fixedlen_vector_valuetypes()) {
        setTruncStoreAction(VT, InnerVT, Expand);
        setLoadExtAction(ISD::SEXTLOAD, VT, InnerVT, Expand);
        setLoadExtAction(ISD::ZEXTLOAD, VT, InnerVT, Expand);
        setLoadExtAction(ISD::EXTLOAD, VT, InnerVT, Expand);
      }
    }
    setOperationAction(ISD::SELECT_CC, MVT::v4i32, Expand);
    if (!Subtarget.hasP8Vector()) {
      setOperationAction(ISD::SMAX, MVT::v2i64, Expand);
      setOperationAction(ISD::SMIN, MVT::v2i64, Expand);
      setOperationAction(ISD::UMAX, MVT::v2i64, Expand);
      setOperationAction(ISD::UMIN, MVT::v2i64, Expand);
    }

    // We can custom expand all VECTOR_SHUFFLEs to VPERM, others we can handle
    // with merges, splats, etc.
    setOperationAction(ISD::VECTOR_SHUFFLE, MVT::v16i8, Custom);

    // Vector truncates to sub-word integers that fit in an Altivec/VSX
    // register are cheap, so handle them before they get expanded to scalar.
    setOperationAction(ISD::TRUNCATE, MVT::v8i8, Custom);
    setOperationAction(ISD::TRUNCATE, MVT::v4i8, Custom);
    setOperationAction(ISD::TRUNCATE, MVT::v2i8, Custom);
    setOperationAction(ISD::TRUNCATE, MVT::v4i16, Custom);
    setOperationAction(ISD::TRUNCATE, MVT::v2i16, Custom);
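    // Illustrative example: a truncate such as v8i16 -> v8i8 fits in the low
    // half of an Altivec register, so it can be done with a single vector
    // pack or permute (e.g. vpkuhum) rather than extracting and truncating
    // each element through GPRs, which is what the Custom actions above make
    // possible.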

    setOperationAction(ISD::AND , MVT::v4i32, Legal);
    setOperationAction(ISD::OR , MVT::v4i32, Legal);
    setOperationAction(ISD::XOR , MVT::v4i32, Legal);
    setOperationAction(ISD::LOAD , MVT::v4i32, Legal);
    setOperationAction(ISD::SELECT, MVT::v4i32,
                       Subtarget.useCRBits() ? Legal : Expand);
    setOperationAction(ISD::STORE , MVT::v4i32, Legal);
    setOperationAction(ISD::STRICT_FP_TO_SINT, MVT::v4i32, Legal);
    setOperationAction(ISD::STRICT_FP_TO_UINT, MVT::v4i32, Legal);
    setOperationAction(ISD::STRICT_SINT_TO_FP, MVT::v4i32, Legal);
    setOperationAction(ISD::STRICT_UINT_TO_FP, MVT::v4i32, Legal);
    setOperationAction(ISD::FP_TO_SINT, MVT::v4i32, Legal);
    setOperationAction(ISD::FP_TO_UINT, MVT::v4i32, Legal);
    setOperationAction(ISD::SINT_TO_FP, MVT::v4i32, Legal);
    setOperationAction(ISD::UINT_TO_FP, MVT::v4i32, Legal);
    setOperationAction(ISD::FFLOOR, MVT::v4f32, Legal);
    setOperationAction(ISD::FCEIL, MVT::v4f32, Legal);
    setOperationAction(ISD::FTRUNC, MVT::v4f32, Legal);
    setOperationAction(ISD::FNEARBYINT, MVT::v4f32, Legal);

    // Custom lowering ROTL v1i128 to VECTOR_SHUFFLE v16i8.
    setOperationAction(ISD::ROTL, MVT::v1i128, Custom);
    // With hasAltivec set, we can lower ISD::ROTL to vrl(b|h|w).
    if (Subtarget.hasAltivec())
      for (auto VT : {MVT::v4i32, MVT::v8i16, MVT::v16i8})
        setOperationAction(ISD::ROTL, VT, Legal);
    // With hasP8Altivec set, we can lower ISD::ROTL to vrld.
    if (Subtarget.hasP8Altivec())
      setOperationAction(ISD::ROTL, MVT::v2i64, Legal);

    addRegisterClass(MVT::v4f32, &PPC::VRRCRegClass);
    addRegisterClass(MVT::v4i32, &PPC::VRRCRegClass);
    addRegisterClass(MVT::v8i16, &PPC::VRRCRegClass);
    addRegisterClass(MVT::v16i8, &PPC::VRRCRegClass);

    setOperationAction(ISD::MUL, MVT::v4f32, Legal);
    setOperationAction(ISD::FMA, MVT::v4f32, Legal);

    if (Subtarget.hasVSX()) {
      setOperationAction(ISD::FDIV, MVT::v4f32, Legal);
      setOperationAction(ISD::FSQRT, MVT::v4f32, Legal);
      setOperationAction(ISD::INSERT_VECTOR_ELT, MVT::v2f64, Custom);
    }

    if (Subtarget.hasP8Altivec())
      setOperationAction(ISD::MUL, MVT::v4i32, Legal);
    else
      setOperationAction(ISD::MUL, MVT::v4i32, Custom);

    if (Subtarget.isISA3_1()) {
      setOperationAction(ISD::MUL, MVT::v2i64, Legal);
      setOperationAction(ISD::MULHS, MVT::v2i64, Legal);
      setOperationAction(ISD::MULHU, MVT::v2i64, Legal);
      setOperationAction(ISD::MULHS, MVT::v4i32, Legal);
      setOperationAction(ISD::MULHU, MVT::v4i32, Legal);
      setOperationAction(ISD::UDIV, MVT::v2i64, Legal);
      setOperationAction(ISD::SDIV, MVT::v2i64, Legal);
      setOperationAction(ISD::UDIV, MVT::v4i32, Legal);
      setOperationAction(ISD::SDIV, MVT::v4i32, Legal);
      setOperationAction(ISD::UREM, MVT::v2i64, Legal);
      setOperationAction(ISD::SREM, MVT::v2i64, Legal);
      setOperationAction(ISD::UREM, MVT::v4i32, Legal);
      setOperationAction(ISD::SREM, MVT::v4i32, Legal);
      setOperationAction(ISD::UREM, MVT::v1i128, Legal);
      setOperationAction(ISD::SREM, MVT::v1i128, Legal);
      setOperationAction(ISD::UDIV, MVT::v1i128, Legal);
      setOperationAction(ISD::SDIV, MVT::v1i128, Legal);
      setOperationAction(ISD::ROTL, MVT::v1i128, Legal);
    }

    setOperationAction(ISD::MUL, MVT::v8i16, Legal);
    setOperationAction(ISD::MUL, MVT::v16i8, Custom);

    setOperationAction(ISD::SCALAR_TO_VECTOR, MVT::v4f32, Custom);
    setOperationAction(ISD::SCALAR_TO_VECTOR, MVT::v4i32, Custom);
    // LE is P8+/64-bit so direct moves are supported and these operations
    // are legal. The custom transformation requires 64-bit since we need a
    // pair of stores that will cover a 128-bit load for P10.
    if (!DisableP10StoreForward && isPPC64 && !Subtarget.isLittleEndian()) {
      setOperationAction(ISD::SCALAR_TO_VECTOR, MVT::v2i64, Custom);
      setOperationAction(ISD::SCALAR_TO_VECTOR, MVT::v8i16, Custom);
      setOperationAction(ISD::SCALAR_TO_VECTOR, MVT::v16i8, Custom);
    }
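    // Sketch of the store-forward-friendly form: rather than using direct
    // moves, the scalar is stored into a 16-byte stack slot with a pair of
    // 8-byte stores and the vector is produced by a single 128-bit load from
    // that slot, a pattern P10's store forwarding handles well.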

    setOperationAction(ISD::BUILD_VECTOR, MVT::v16i8, Custom);
    setOperationAction(ISD::BUILD_VECTOR, MVT::v8i16, Custom);
    setOperationAction(ISD::BUILD_VECTOR, MVT::v4i32, Custom);
    setOperationAction(ISD::BUILD_VECTOR, MVT::v4f32, Custom);

    // Altivec does not contain unordered floating-point compare instructions
    setCondCodeAction(ISD::SETUO, MVT::v4f32, Expand);
    setCondCodeAction(ISD::SETUEQ, MVT::v4f32, Expand);
    setCondCodeAction(ISD::SETO, MVT::v4f32, Expand);
    setCondCodeAction(ISD::SETONE, MVT::v4f32, Expand);

    if (Subtarget.hasVSX()) {
      setOperationAction(ISD::SCALAR_TO_VECTOR, MVT::v2f64, Legal);
      setOperationAction(ISD::EXTRACT_VECTOR_ELT, MVT::v2f64, Legal);
      if (Subtarget.hasP8Vector()) {
        setOperationAction(ISD::SCALAR_TO_VECTOR, MVT::v4f32, Legal);
        setOperationAction(ISD::EXTRACT_VECTOR_ELT, MVT::v4f32, Legal);
      }
      if (Subtarget.hasDirectMove() && isPPC64) {
        setOperationAction(ISD::SCALAR_TO_VECTOR, MVT::v16i8, Legal);
        setOperationAction(ISD::SCALAR_TO_VECTOR, MVT::v8i16, Legal);
        setOperationAction(ISD::SCALAR_TO_VECTOR, MVT::v4i32, Legal);
        setOperationAction(ISD::SCALAR_TO_VECTOR, MVT::v2i64, Legal);
        setOperationAction(ISD::EXTRACT_VECTOR_ELT, MVT::v16i8, Legal);
        setOperationAction(ISD::EXTRACT_VECTOR_ELT, MVT::v8i16, Legal);
        setOperationAction(ISD::EXTRACT_VECTOR_ELT, MVT::v4i32, Legal);
        setOperationAction(ISD::EXTRACT_VECTOR_ELT, MVT::v2i64, Legal);
      }
      setOperationAction(ISD::EXTRACT_VECTOR_ELT, MVT::v2f64, Legal);

      // The nearbyint variants are not allowed to raise the inexact exception
      // so we can only code-gen them with fpexcept.ignore.
      setOperationAction(ISD::STRICT_FNEARBYINT, MVT::f64, Custom);
      setOperationAction(ISD::STRICT_FNEARBYINT, MVT::f32, Custom);
      setOperationAction(ISD::STRICT_FNEARBYINT, MVT::v2f64, Custom);
      setOperationAction(ISD::STRICT_FNEARBYINT, MVT::v4f32, Custom);
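      // For example, only a call such as
      //   @llvm.experimental.constrained.nearbyint.f64(double %x,
      //       metadata !"round.dynamic", metadata !"fpexcept.ignore")
      // can be turned into the rounding instruction here; with
      // "fpexcept.strict" it falls back to the nearbyint libcall, because the
      // instruction may set the inexact flag, which nearbyint must not do.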
1068
1069 setOperationAction(Op: ISD::FFLOOR, VT: MVT::v2f64, Action: Legal);
1070 setOperationAction(Op: ISD::FCEIL, VT: MVT::v2f64, Action: Legal);
1071 setOperationAction(Op: ISD::FTRUNC, VT: MVT::v2f64, Action: Legal);
1072 setOperationAction(Op: ISD::FRINT, VT: MVT::v2f64, Action: Legal);
1073 setOperationAction(Op: ISD::FROUND, VT: MVT::v2f64, Action: Legal);
1074 setOperationAction(Op: ISD::FROUND, VT: MVT::f64, Action: Legal);
1075 setOperationAction(Op: ISD::FRINT, VT: MVT::f64, Action: Legal);
1076
1077 setOperationAction(Op: ISD::FRINT, VT: MVT::v4f32, Action: Legal);
1078 setOperationAction(Op: ISD::FROUND, VT: MVT::v4f32, Action: Legal);
1079 setOperationAction(Op: ISD::FROUND, VT: MVT::f32, Action: Legal);
1080 setOperationAction(Op: ISD::FRINT, VT: MVT::f32, Action: Legal);
1081
1082 setOperationAction(Op: ISD::MUL, VT: MVT::v2f64, Action: Legal);
1083 setOperationAction(Op: ISD::FMA, VT: MVT::v2f64, Action: Legal);
1084
1085 setOperationAction(Op: ISD::FDIV, VT: MVT::v2f64, Action: Legal);
1086 setOperationAction(Op: ISD::FSQRT, VT: MVT::v2f64, Action: Legal);
1087
1088 // Share the Altivec comparison restrictions.
1089 setCondCodeAction(CCs: ISD::SETUO, VT: MVT::v2f64, Action: Expand);
1090 setCondCodeAction(CCs: ISD::SETUEQ, VT: MVT::v2f64, Action: Expand);
1091 setCondCodeAction(CCs: ISD::SETO, VT: MVT::v2f64, Action: Expand);
1092 setCondCodeAction(CCs: ISD::SETONE, VT: MVT::v2f64, Action: Expand);
1093
1094 setOperationAction(Op: ISD::LOAD, VT: MVT::v2f64, Action: Legal);
1095 setOperationAction(Op: ISD::STORE, VT: MVT::v2f64, Action: Legal);
1096
1097 setOperationAction(Op: ISD::VECTOR_SHUFFLE, VT: MVT::v2f64, Action: Custom);
1098
1099 if (Subtarget.hasP8Vector())
1100 addRegisterClass(VT: MVT::f32, RC: &PPC::VSSRCRegClass);
1101
1102 addRegisterClass(VT: MVT::f64, RC: &PPC::VSFRCRegClass);
1103
1104 addRegisterClass(VT: MVT::v4i32, RC: &PPC::VSRCRegClass);
1105 addRegisterClass(VT: MVT::v4f32, RC: &PPC::VSRCRegClass);
1106 addRegisterClass(VT: MVT::v2f64, RC: &PPC::VSRCRegClass);
1107
1108 if (Subtarget.hasP8Altivec()) {
1109 setOperationAction(Op: ISD::SHL, VT: MVT::v2i64, Action: Legal);
1110 setOperationAction(Op: ISD::SRA, VT: MVT::v2i64, Action: Legal);
1111 setOperationAction(Op: ISD::SRL, VT: MVT::v2i64, Action: Legal);
1112
1113 // 128-bit shifts can be accomplished via 3 instructions for SHL and
1114 // SRL, but not for SRA because of the instructions available:
1115 // VS{RL} and VS{RL}O. However, due to direct-move costs, it's not
1116 // worth doing, so expand these here.
1117 setOperationAction(Op: ISD::SHL, VT: MVT::v1i128, Action: Expand);
1118 setOperationAction(Op: ISD::SRL, VT: MVT::v1i128, Action: Expand);
1119 setOperationAction(Op: ISD::SRA, VT: MVT::v1i128, Action: Expand);
1120
1121 setOperationAction(Op: ISD::SETCC, VT: MVT::v2i64, Action: Legal);
1122 } else {
1124 setOperationAction(Op: ISD::SHL, VT: MVT::v2i64, Action: Expand);
1125 setOperationAction(Op: ISD::SRA, VT: MVT::v2i64, Action: Expand);
1126 setOperationAction(Op: ISD::SRL, VT: MVT::v2i64, Action: Expand);
1127
1128 setOperationAction(Op: ISD::SETCC, VT: MVT::v2i64, Action: Custom);
1129
1130 // VSX v2i64 only supports non-arithmetic operations.
1131 setOperationAction(Op: ISD::ADD, VT: MVT::v2i64, Action: Expand);
1132 setOperationAction(Op: ISD::SUB, VT: MVT::v2i64, Action: Expand);
1133 }
1134
1135 if (Subtarget.isISA3_1())
1136 setOperationAction(Op: ISD::SETCC, VT: MVT::v1i128, Action: Legal);
1137 else
1138 setOperationAction(Op: ISD::SETCC, VT: MVT::v1i128, Action: Expand);
1139
1140 setOperationAction(Op: ISD::LOAD, VT: MVT::v2i64, Action: Promote);
1141 AddPromotedToType (Opc: ISD::LOAD, OrigVT: MVT::v2i64, DestVT: MVT::v2f64);
1142 setOperationAction(Op: ISD::STORE, VT: MVT::v2i64, Action: Promote);
1143 AddPromotedToType (Opc: ISD::STORE, OrigVT: MVT::v2i64, DestVT: MVT::v2f64);
1144
1145 setOperationAction(Op: ISD::VECTOR_SHUFFLE, VT: MVT::v2i64, Action: Custom);
1146
1147 setOperationAction(Op: ISD::STRICT_SINT_TO_FP, VT: MVT::v2i64, Action: Legal);
1148 setOperationAction(Op: ISD::STRICT_UINT_TO_FP, VT: MVT::v2i64, Action: Legal);
1149 setOperationAction(Op: ISD::STRICT_FP_TO_SINT, VT: MVT::v2i64, Action: Legal);
1150 setOperationAction(Op: ISD::STRICT_FP_TO_UINT, VT: MVT::v2i64, Action: Legal);
1151 setOperationAction(Op: ISD::SINT_TO_FP, VT: MVT::v2i64, Action: Legal);
1152 setOperationAction(Op: ISD::UINT_TO_FP, VT: MVT::v2i64, Action: Legal);
1153 setOperationAction(Op: ISD::FP_TO_SINT, VT: MVT::v2i64, Action: Legal);
1154 setOperationAction(Op: ISD::FP_TO_UINT, VT: MVT::v2i64, Action: Legal);
1155
1156 // Custom handling for partial vectors of integers converted to
1157 // floating point. We already have optimal handling for v2i32 through
1158 // the DAG combine, so those aren't necessary.
1159 setOperationAction(Op: ISD::STRICT_UINT_TO_FP, VT: MVT::v2i8, Action: Custom);
1160 setOperationAction(Op: ISD::STRICT_UINT_TO_FP, VT: MVT::v4i8, Action: Custom);
1161 setOperationAction(Op: ISD::STRICT_UINT_TO_FP, VT: MVT::v2i16, Action: Custom);
1162 setOperationAction(Op: ISD::STRICT_UINT_TO_FP, VT: MVT::v4i16, Action: Custom);
1163 setOperationAction(Op: ISD::STRICT_SINT_TO_FP, VT: MVT::v2i8, Action: Custom);
1164 setOperationAction(Op: ISD::STRICT_SINT_TO_FP, VT: MVT::v4i8, Action: Custom);
1165 setOperationAction(Op: ISD::STRICT_SINT_TO_FP, VT: MVT::v2i16, Action: Custom);
1166 setOperationAction(Op: ISD::STRICT_SINT_TO_FP, VT: MVT::v4i16, Action: Custom);
1167 setOperationAction(Op: ISD::UINT_TO_FP, VT: MVT::v2i8, Action: Custom);
1168 setOperationAction(Op: ISD::UINT_TO_FP, VT: MVT::v4i8, Action: Custom);
1169 setOperationAction(Op: ISD::UINT_TO_FP, VT: MVT::v2i16, Action: Custom);
1170 setOperationAction(Op: ISD::UINT_TO_FP, VT: MVT::v4i16, Action: Custom);
1171 setOperationAction(Op: ISD::SINT_TO_FP, VT: MVT::v2i8, Action: Custom);
1172 setOperationAction(Op: ISD::SINT_TO_FP, VT: MVT::v4i8, Action: Custom);
1173 setOperationAction(Op: ISD::SINT_TO_FP, VT: MVT::v2i16, Action: Custom);
1174 setOperationAction(Op: ISD::SINT_TO_FP, VT: MVT::v4i16, Action: Custom);
1175
1176 setOperationAction(Op: ISD::FNEG, VT: MVT::v4f32, Action: Legal);
1177 setOperationAction(Op: ISD::FNEG, VT: MVT::v2f64, Action: Legal);
1178 setOperationAction(Op: ISD::FABS, VT: MVT::v4f32, Action: Legal);
1179 setOperationAction(Op: ISD::FABS, VT: MVT::v2f64, Action: Legal);
1180 setOperationAction(Op: ISD::FCOPYSIGN, VT: MVT::v4f32, Action: Legal);
1181 setOperationAction(Op: ISD::FCOPYSIGN, VT: MVT::v2f64, Action: Legal);
1182
1183 setOperationAction(Op: ISD::BUILD_VECTOR, VT: MVT::v2i64, Action: Custom);
1184 setOperationAction(Op: ISD::BUILD_VECTOR, VT: MVT::v2f64, Action: Custom);
1185
1186 // Handle constrained floating-point operations for vectors.
1187 // The predicate is `hasVSX` because Altivec instructions do not raise
1188 // floating-point exceptions, whereas VSX vector instructions do.
1189 setOperationAction(Op: ISD::STRICT_FADD, VT: MVT::v4f32, Action: Legal);
1190 setOperationAction(Op: ISD::STRICT_FSUB, VT: MVT::v4f32, Action: Legal);
1191 setOperationAction(Op: ISD::STRICT_FMUL, VT: MVT::v4f32, Action: Legal);
1192 setOperationAction(Op: ISD::STRICT_FDIV, VT: MVT::v4f32, Action: Legal);
1193 setOperationAction(Op: ISD::STRICT_FMA, VT: MVT::v4f32, Action: Legal);
1194 setOperationAction(Op: ISD::STRICT_FSQRT, VT: MVT::v4f32, Action: Legal);
1195 setOperationAction(Op: ISD::STRICT_FMAXNUM, VT: MVT::v4f32, Action: Legal);
1196 setOperationAction(Op: ISD::STRICT_FMINNUM, VT: MVT::v4f32, Action: Legal);
1197 setOperationAction(Op: ISD::STRICT_FRINT, VT: MVT::v4f32, Action: Legal);
1198 setOperationAction(Op: ISD::STRICT_FFLOOR, VT: MVT::v4f32, Action: Legal);
1199 setOperationAction(Op: ISD::STRICT_FCEIL, VT: MVT::v4f32, Action: Legal);
1200 setOperationAction(Op: ISD::STRICT_FTRUNC, VT: MVT::v4f32, Action: Legal);
1201 setOperationAction(Op: ISD::STRICT_FROUND, VT: MVT::v4f32, Action: Legal);
1202
1203 setOperationAction(Op: ISD::STRICT_FADD, VT: MVT::v2f64, Action: Legal);
1204 setOperationAction(Op: ISD::STRICT_FSUB, VT: MVT::v2f64, Action: Legal);
1205 setOperationAction(Op: ISD::STRICT_FMUL, VT: MVT::v2f64, Action: Legal);
1206 setOperationAction(Op: ISD::STRICT_FDIV, VT: MVT::v2f64, Action: Legal);
1207 setOperationAction(Op: ISD::STRICT_FMA, VT: MVT::v2f64, Action: Legal);
1208 setOperationAction(Op: ISD::STRICT_FSQRT, VT: MVT::v2f64, Action: Legal);
1209 setOperationAction(Op: ISD::STRICT_FMAXNUM, VT: MVT::v2f64, Action: Legal);
1210 setOperationAction(Op: ISD::STRICT_FMINNUM, VT: MVT::v2f64, Action: Legal);
1211 setOperationAction(Op: ISD::STRICT_FRINT, VT: MVT::v2f64, Action: Legal);
1212 setOperationAction(Op: ISD::STRICT_FFLOOR, VT: MVT::v2f64, Action: Legal);
1213 setOperationAction(Op: ISD::STRICT_FCEIL, VT: MVT::v2f64, Action: Legal);
1214 setOperationAction(Op: ISD::STRICT_FTRUNC, VT: MVT::v2f64, Action: Legal);
1215 setOperationAction(Op: ISD::STRICT_FROUND, VT: MVT::v2f64, Action: Legal);
1216
1217 addRegisterClass(VT: MVT::v2i64, RC: &PPC::VSRCRegClass);
1218 addRegisterClass(VT: MVT::f128, RC: &PPC::VRRCRegClass);
1219
1220 for (MVT FPT : MVT::fp_valuetypes())
1221 setLoadExtAction(ExtType: ISD::EXTLOAD, ValVT: MVT::f128, MemVT: FPT, Action: Expand);
1222
1223 // Expand the SELECT to SELECT_CC
1224 setOperationAction(Op: ISD::SELECT, VT: MVT::f128, Action: Expand);
1225
1226 setTruncStoreAction(ValVT: MVT::f128, MemVT: MVT::f64, Action: Expand);
1227 setTruncStoreAction(ValVT: MVT::f128, MemVT: MVT::f32, Action: Expand);
1228
1229 // No implementation for these ops for PowerPC.
1230 setOperationAction(Op: ISD::FSINCOS, VT: MVT::f128, Action: Expand);
1231 setOperationAction(Op: ISD::FSIN, VT: MVT::f128, Action: Expand);
1232 setOperationAction(Op: ISD::FCOS, VT: MVT::f128, Action: Expand);
1233 setOperationAction(Op: ISD::FPOW, VT: MVT::f128, Action: Expand);
1234 setOperationAction(Op: ISD::FPOWI, VT: MVT::f128, Action: Expand);
1235 setOperationAction(Op: ISD::FREM, VT: MVT::f128, Action: LibCall);
1236 }
1237
1238 if (Subtarget.hasP8Altivec()) {
1239 addRegisterClass(VT: MVT::v2i64, RC: &PPC::VRRCRegClass);
1240 addRegisterClass(VT: MVT::v1i128, RC: &PPC::VRRCRegClass);
1241 }
1242
1243 if (Subtarget.hasP9Vector()) {
1244 setOperationAction(Op: ISD::INSERT_VECTOR_ELT, VT: MVT::v4i32, Action: Custom);
1245 setOperationAction(Op: ISD::INSERT_VECTOR_ELT, VT: MVT::v4f32, Action: Custom);
1246
1247 // Test data class instructions store results in CR bits.
1248 if (Subtarget.useCRBits()) {
1249 setOperationAction(Op: ISD::IS_FPCLASS, VT: MVT::f32, Action: Custom);
1250 setOperationAction(Op: ISD::IS_FPCLASS, VT: MVT::f64, Action: Custom);
1251 setOperationAction(Op: ISD::IS_FPCLASS, VT: MVT::f128, Action: Custom);
1252 setOperationAction(Op: ISD::IS_FPCLASS, VT: MVT::ppcf128, Action: Custom);
1253 }
1254
1255 // 128-bit shifts can be accomplished via 3 instructions for SHL and
1256 // SRL, but not for SRA because of the instructions available:
1257 // VS{RL} and VS{RL}O.
1258 setOperationAction(Op: ISD::SHL, VT: MVT::v1i128, Action: Legal);
1259 setOperationAction(Op: ISD::SRL, VT: MVT::v1i128, Action: Legal);
1260 setOperationAction(Op: ISD::SRA, VT: MVT::v1i128, Action: Expand);
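
// Illustration (inferred from the comment above; the exact sequences live in
// the .td patterns): a variable 128-bit SHL/SRL is expected to lower to
// roughly a splat of the shift amount plus VS{L,R}O and VS{L,R} (shift by
// octets, then by the remaining bits). SRA has no such pair here, so it stays
// Expand until ISA 3.1 marks it Legal below.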
1261
1262 setOperationAction(Op: ISD::FADD, VT: MVT::f128, Action: Legal);
1263 setOperationAction(Op: ISD::FSUB, VT: MVT::f128, Action: Legal);
1264 setOperationAction(Op: ISD::FDIV, VT: MVT::f128, Action: Legal);
1265 setOperationAction(Op: ISD::FMUL, VT: MVT::f128, Action: Legal);
1266 setOperationAction(Op: ISD::FP_EXTEND, VT: MVT::f128, Action: Legal);
1267
1268 setOperationAction(Op: ISD::FMA, VT: MVT::f128, Action: Legal);
1269 setCondCodeAction(CCs: ISD::SETULT, VT: MVT::f128, Action: Expand);
1270 setCondCodeAction(CCs: ISD::SETUGT, VT: MVT::f128, Action: Expand);
1271 setCondCodeAction(CCs: ISD::SETUEQ, VT: MVT::f128, Action: Expand);
1272 setCondCodeAction(CCs: ISD::SETOGE, VT: MVT::f128, Action: Expand);
1273 setCondCodeAction(CCs: ISD::SETOLE, VT: MVT::f128, Action: Expand);
1274 setCondCodeAction(CCs: ISD::SETONE, VT: MVT::f128, Action: Expand);
1275
1276 setOperationAction(Op: ISD::FTRUNC, VT: MVT::f128, Action: Legal);
1277 setOperationAction(Op: ISD::FRINT, VT: MVT::f128, Action: Legal);
1278 setOperationAction(Op: ISD::FFLOOR, VT: MVT::f128, Action: Legal);
1279 setOperationAction(Op: ISD::FCEIL, VT: MVT::f128, Action: Legal);
1280 setOperationAction(Op: ISD::FNEARBYINT, VT: MVT::f128, Action: Legal);
1281 setOperationAction(Op: ISD::FROUND, VT: MVT::f128, Action: Legal);
1282
1283 setOperationAction(Op: ISD::FP_ROUND, VT: MVT::f64, Action: Legal);
1284 setOperationAction(Op: ISD::FP_ROUND, VT: MVT::f32, Action: Legal);
1285 setOperationAction(Op: ISD::BITCAST, VT: MVT::i128, Action: Custom);
1286
1287 // Handle constrained floating-point operations for fp128.
1288 setOperationAction(Op: ISD::STRICT_FADD, VT: MVT::f128, Action: Legal);
1289 setOperationAction(Op: ISD::STRICT_FSUB, VT: MVT::f128, Action: Legal);
1290 setOperationAction(Op: ISD::STRICT_FMUL, VT: MVT::f128, Action: Legal);
1291 setOperationAction(Op: ISD::STRICT_FDIV, VT: MVT::f128, Action: Legal);
1292 setOperationAction(Op: ISD::STRICT_FMA, VT: MVT::f128, Action: Legal);
1293 setOperationAction(Op: ISD::STRICT_FSQRT, VT: MVT::f128, Action: Legal);
1294 setOperationAction(Op: ISD::STRICT_FP_EXTEND, VT: MVT::f128, Action: Legal);
1295 setOperationAction(Op: ISD::STRICT_FP_ROUND, VT: MVT::f64, Action: Legal);
1296 setOperationAction(Op: ISD::STRICT_FP_ROUND, VT: MVT::f32, Action: Legal);
1297 setOperationAction(Op: ISD::STRICT_FRINT, VT: MVT::f128, Action: Legal);
1298 setOperationAction(Op: ISD::STRICT_FNEARBYINT, VT: MVT::f128, Action: Legal);
1299 setOperationAction(Op: ISD::STRICT_FFLOOR, VT: MVT::f128, Action: Legal);
1300 setOperationAction(Op: ISD::STRICT_FCEIL, VT: MVT::f128, Action: Legal);
1301 setOperationAction(Op: ISD::STRICT_FTRUNC, VT: MVT::f128, Action: Legal);
1302 setOperationAction(Op: ISD::STRICT_FROUND, VT: MVT::f128, Action: Legal);
1303 setOperationAction(Op: ISD::FP_EXTEND, VT: MVT::v2f32, Action: Custom);
1304 setOperationAction(Op: ISD::BSWAP, VT: MVT::v8i16, Action: Legal);
1305 setOperationAction(Op: ISD::BSWAP, VT: MVT::v4i32, Action: Legal);
1306 setOperationAction(Op: ISD::BSWAP, VT: MVT::v2i64, Action: Legal);
1307 setOperationAction(Op: ISD::BSWAP, VT: MVT::v1i128, Action: Legal);
1308 } else if (Subtarget.hasVSX()) {
1309 setOperationAction(Op: ISD::LOAD, VT: MVT::f128, Action: Promote);
1310 setOperationAction(Op: ISD::STORE, VT: MVT::f128, Action: Promote);
1311
1312 AddPromotedToType(Opc: ISD::LOAD, OrigVT: MVT::f128, DestVT: MVT::v4i32);
1313 AddPromotedToType(Opc: ISD::STORE, OrigVT: MVT::f128, DestVT: MVT::v4i32);
1314
1315 // Set FADD/FSUB as libcall to keep the legalizer from expanding the
1316 // fp_to_uint and int_to_fp.
1317 setOperationAction(Op: ISD::FADD, VT: MVT::f128, Action: LibCall);
1318 setOperationAction(Op: ISD::FSUB, VT: MVT::f128, Action: LibCall);
1319
1320 setOperationAction(Op: ISD::FMUL, VT: MVT::f128, Action: Expand);
1321 setOperationAction(Op: ISD::FDIV, VT: MVT::f128, Action: Expand);
1322 setOperationAction(Op: ISD::FNEG, VT: MVT::f128, Action: Expand);
1323 setOperationAction(Op: ISD::FABS, VT: MVT::f128, Action: Expand);
1324 setOperationAction(Op: ISD::FSQRT, VT: MVT::f128, Action: Expand);
1325 setOperationAction(Op: ISD::FMA, VT: MVT::f128, Action: Expand);
1326 setOperationAction(Op: ISD::FCOPYSIGN, VT: MVT::f128, Action: Expand);
1327
1328 // Expand the fp_extend if the target type is fp128.
1329 setOperationAction(Op: ISD::FP_EXTEND, VT: MVT::f128, Action: Expand);
1330 setOperationAction(Op: ISD::STRICT_FP_EXTEND, VT: MVT::f128, Action: Expand);
1331
1332 // Custom-lower the fp_round when the source type is fp128.
1333 for (MVT VT : {MVT::f32, MVT::f64}) {
1334 setOperationAction(Op: ISD::FP_ROUND, VT, Action: Custom);
1335 setOperationAction(Op: ISD::STRICT_FP_ROUND, VT, Action: Custom);
1336 }
1337
1338 setOperationAction(Op: ISD::SETCC, VT: MVT::f128, Action: Custom);
1339 setOperationAction(Op: ISD::STRICT_FSETCC, VT: MVT::f128, Action: Custom);
1340 setOperationAction(Op: ISD::STRICT_FSETCCS, VT: MVT::f128, Action: Custom);
1341 setOperationAction(Op: ISD::BR_CC, VT: MVT::f128, Action: Expand);
1342
1343 // Lower the following f128 select_cc pattern:
1344 // select_cc x, y, tv, fv, cc -> select_cc (setcc x, y, cc), 0, tv, fv, NE
1345 setOperationAction(Op: ISD::SELECT_CC, VT: MVT::f128, Action: Custom);
1346
1347 // We need to handle f128 SELECT_CC with integer result type.
1348 setOperationAction(Op: ISD::SELECT_CC, VT: MVT::i32, Action: Custom);
1349 setOperationAction(Op: ISD::SELECT_CC, VT: MVT::i64, Action: isPPC64 ? Custom : Expand);
1350 }
1351
1352 if (Subtarget.hasP9Altivec()) {
1353 if (Subtarget.isISA3_1()) {
1354 setOperationAction(Op: ISD::INSERT_VECTOR_ELT, VT: MVT::v2i64, Action: Legal);
1355 setOperationAction(Op: ISD::INSERT_VECTOR_ELT, VT: MVT::v8i16, Action: Legal);
1356 setOperationAction(Op: ISD::INSERT_VECTOR_ELT, VT: MVT::v16i8, Action: Legal);
1357 setOperationAction(Op: ISD::INSERT_VECTOR_ELT, VT: MVT::v4i32, Action: Legal);
1358 } else {
1359 setOperationAction(Op: ISD::INSERT_VECTOR_ELT, VT: MVT::v8i16, Action: Custom);
1360 setOperationAction(Op: ISD::INSERT_VECTOR_ELT, VT: MVT::v16i8, Action: Custom);
1361 }
1362 setOperationAction(Op: ISD::SIGN_EXTEND_INREG, VT: MVT::v4i8, Action: Legal);
1363 setOperationAction(Op: ISD::SIGN_EXTEND_INREG, VT: MVT::v4i16, Action: Legal);
1364 setOperationAction(Op: ISD::SIGN_EXTEND_INREG, VT: MVT::v4i32, Action: Legal);
1365 setOperationAction(Op: ISD::SIGN_EXTEND_INREG, VT: MVT::v2i8, Action: Legal);
1366 setOperationAction(Op: ISD::SIGN_EXTEND_INREG, VT: MVT::v2i16, Action: Legal);
1367 setOperationAction(Op: ISD::SIGN_EXTEND_INREG, VT: MVT::v2i32, Action: Legal);
1368 setOperationAction(Op: ISD::SIGN_EXTEND_INREG, VT: MVT::v2i64, Action: Legal);
1369
1370 setOperationAction(Op: ISD::ABDU, VT: MVT::v16i8, Action: Legal);
1371 setOperationAction(Op: ISD::ABDU, VT: MVT::v8i16, Action: Legal);
1372 setOperationAction(Op: ISD::ABDU, VT: MVT::v4i32, Action: Legal);
1373 setOperationAction(Op: ISD::ABDS, VT: MVT::v4i32, Action: Legal);
1374 }
1375
1376 if (Subtarget.hasP10Vector()) {
1377 setOperationAction(Op: ISD::SELECT_CC, VT: MVT::f128, Action: Custom);
1378 }
1379 }
1380
1381 if (Subtarget.pairedVectorMemops()) {
1382 addRegisterClass(VT: MVT::v256i1, RC: &PPC::VSRpRCRegClass);
1383 setOperationAction(Op: ISD::LOAD, VT: MVT::v256i1, Action: Custom);
1384 setOperationAction(Op: ISD::STORE, VT: MVT::v256i1, Action: Custom);
1385 }
1386 if (Subtarget.hasMMA()) {
1387 if (Subtarget.isISAFuture()) {
1388 addRegisterClass(VT: MVT::v512i1, RC: &PPC::WACCRCRegClass);
1389 addRegisterClass(VT: MVT::v1024i1, RC: &PPC::DMRRCRegClass);
1390 addRegisterClass(VT: MVT::v2048i1, RC: &PPC::DMRpRCRegClass);
1391 setOperationAction(Op: ISD::LOAD, VT: MVT::v1024i1, Action: Custom);
1392 setOperationAction(Op: ISD::STORE, VT: MVT::v1024i1, Action: Custom);
1393 setOperationAction(Op: ISD::LOAD, VT: MVT::v2048i1, Action: Custom);
1394 setOperationAction(Op: ISD::STORE, VT: MVT::v2048i1, Action: Custom);
1395 } else {
1396 addRegisterClass(VT: MVT::v512i1, RC: &PPC::UACCRCRegClass);
1397 }
1398 setOperationAction(Op: ISD::LOAD, VT: MVT::v512i1, Action: Custom);
1399 setOperationAction(Op: ISD::STORE, VT: MVT::v512i1, Action: Custom);
1400 setOperationAction(Op: ISD::BUILD_VECTOR, VT: MVT::v512i1, Action: Custom);
1401 }
1402
1403 if (Subtarget.has64BitSupport())
1404 setOperationAction(Op: ISD::PREFETCH, VT: MVT::Other, Action: Legal);
1405
1406 if (Subtarget.isISA3_1())
1407 setOperationAction(Op: ISD::SRA, VT: MVT::v1i128, Action: Legal);
1408
1409 setOperationAction(Op: ISD::READCYCLECOUNTER, VT: MVT::i64, Action: isPPC64 ? Legal : Custom);
1410
1411 if (!isPPC64) {
1412 setOperationAction(Op: ISD::ATOMIC_LOAD, VT: MVT::i64, Action: Expand);
1413 setOperationAction(Op: ISD::ATOMIC_STORE, VT: MVT::i64, Action: Expand);
1414 }
1415
1416 if (shouldInlineQuadwordAtomics()) {
1417 setOperationAction(Op: ISD::ATOMIC_LOAD, VT: MVT::i128, Action: Custom);
1418 setOperationAction(Op: ISD::ATOMIC_STORE, VT: MVT::i128, Action: Custom);
1419 setOperationAction(Op: ISD::INTRINSIC_VOID, VT: MVT::i128, Action: Custom);
1420 }
1421
1422 setBooleanContents(ZeroOrOneBooleanContent);
1423
1424 if (Subtarget.hasAltivec()) {
1425 // Altivec instructions set fields to all zeros or all ones.
1426 setBooleanVectorContents(ZeroOrNegativeOneBooleanContent);
1427 }
1428
1429 if (shouldInlineQuadwordAtomics())
1430 setMaxAtomicSizeInBitsSupported(128);
1431 else if (isPPC64)
1432 setMaxAtomicSizeInBitsSupported(64);
1433 else
1434 setMaxAtomicSizeInBitsSupported(32);
1435
1436 setStackPointerRegisterToSaveRestore(isPPC64 ? PPC::X1 : PPC::R1);
1437
1438 // We have target-specific DAG combine patterns for the following nodes:
1439 setTargetDAGCombine({ISD::AND, ISD::ADD, ISD::SHL, ISD::SRA, ISD::SRL,
1440 ISD::MUL, ISD::FMA, ISD::SINT_TO_FP, ISD::BUILD_VECTOR});
1441 if (Subtarget.hasFPCVT())
1442 setTargetDAGCombine(ISD::UINT_TO_FP);
1443 setTargetDAGCombine({ISD::LOAD, ISD::STORE, ISD::BR_CC});
1444 if (Subtarget.useCRBits())
1445 setTargetDAGCombine(ISD::BRCOND);
1446 setTargetDAGCombine({ISD::BSWAP, ISD::INTRINSIC_WO_CHAIN,
1447 ISD::INTRINSIC_W_CHAIN, ISD::INTRINSIC_VOID});
1448
1449 setTargetDAGCombine({ISD::SIGN_EXTEND, ISD::ZERO_EXTEND, ISD::ANY_EXTEND});
1450
1451 setTargetDAGCombine({ISD::TRUNCATE, ISD::VECTOR_SHUFFLE});
1452
1453 if (Subtarget.useCRBits()) {
1454 setTargetDAGCombine({ISD::TRUNCATE, ISD::SETCC, ISD::SELECT_CC});
1455 }
1456
1457 // With 32 condition bits, we don't need to sink (and duplicate) compares
1458 // aggressively in CodeGenPrepare.
1459 if (Subtarget.useCRBits()) {
1460 setJumpIsExpensive();
1461 }
1462
1463 // TODO: The default entry number is set to 64. This stops most jump table
1464 // generation on PPC. But it is good for current PPC hardware because the
1465 // indirect branch via mtctr to the jump table may lead to poor branch
1466 // prediction. Re-evaluate this value on future hardware that handles mtctr better.
1467 setMinimumJumpTableEntries(PPCMinimumJumpTableEntries);
1468
1469 // The minimum number of compares in a BitTest cluster defaults to 3.
1470 setMinimumBitTestCmps(PPCMinimumBitTestCmps);
1471
1472 setMinFunctionAlignment(Align(4));
1473 setMinCmpXchgSizeInBits(Subtarget.hasPartwordAtomics() ? 8 : 32);
1474
1475 auto CPUDirective = Subtarget.getCPUDirective();
1476 switch (CPUDirective) {
1477 default: break;
1478 case PPC::DIR_970:
1479 case PPC::DIR_A2:
1480 case PPC::DIR_E500:
1481 case PPC::DIR_E500mc:
1482 case PPC::DIR_E5500:
1483 case PPC::DIR_PWR4:
1484 case PPC::DIR_PWR5:
1485 case PPC::DIR_PWR5X:
1486 case PPC::DIR_PWR6:
1487 case PPC::DIR_PWR6X:
1488 case PPC::DIR_PWR7:
1489 case PPC::DIR_PWR8:
1490 case PPC::DIR_PWR9:
1491 case PPC::DIR_PWR10:
1492 case PPC::DIR_PWR11:
1493 case PPC::DIR_PWR_FUTURE:
1494 setPrefLoopAlignment(Align(16));
1495 setPrefFunctionAlignment(Align(16));
1496 break;
1497 }
1498
1499 if (Subtarget.enableMachineScheduler())
1500 setSchedulingPreference(Sched::Source);
1501 else
1502 setSchedulingPreference(Sched::Hybrid);
1503
1504 computeRegisterProperties(TRI: STI.getRegisterInfo());
1505
1506 // The Freescale cores do better with aggressive inlining of memcpy and
1507 // friends. GCC uses the same threshold of 128 bytes (= 32 word stores).
1508 if (CPUDirective == PPC::DIR_E500mc || CPUDirective == PPC::DIR_E5500) {
1509 MaxStoresPerMemset = 32;
1510 MaxStoresPerMemsetOptSize = 16;
1511 MaxStoresPerMemcpy = 32;
1512 MaxStoresPerMemcpyOptSize = 8;
1513 MaxStoresPerMemmove = 32;
1514 MaxStoresPerMemmoveOptSize = 8;
1515 } else if (CPUDirective == PPC::DIR_A2) {
1516 // The A2 also benefits from (very) aggressive inlining of memcpy and
1517 // friends. The overhead of the function call, even when warm, can be
1518 // over one hundred cycles.
1519 MaxStoresPerMemset = 128;
1520 MaxStoresPerMemcpy = 128;
1521 MaxStoresPerMemmove = 128;
1522 MaxLoadsPerMemcmp = 128;
1523 } else {
1524 MaxLoadsPerMemcmp = 8;
1525 MaxLoadsPerMemcmpOptSize = 4;
1526 }
1527
1528 // Enable generation of STXVP instructions by default for mcpu=future.
1529 if (CPUDirective == PPC::DIR_PWR_FUTURE &&
1530 DisableAutoPairedVecSt.getNumOccurrences() == 0)
1531 DisableAutoPairedVecSt = false;
1532
1533 IsStrictFPEnabled = true;
1534
1535 // Let the subtarget (CPU) decide if a predictable select is more expensive
1536 // than the corresponding branch. This information is used in CGP to decide
1537 // when to convert selects into branches.
1538 PredictableSelectIsExpensive = Subtarget.isPredictableSelectIsExpensive();
1539
1540 GatherAllAliasesMaxDepth = PPCGatherAllAliasesMaxDepth;
1541}
1542
1543// *********************************** NOTE ************************************
1544// For selecting load and store instructions, the addressing modes are defined
1545// as ComplexPatterns in PPCInstrInfo.td, which are then utilized in the TD
1546 // patterns to match the load and store instructions.
1547//
1548// The TD definitions for the addressing modes correspond to their respective
1549// Select<AddrMode>Form() function in PPCISelDAGToDAG.cpp. These functions rely
1550// on SelectOptimalAddrMode(), which calls computeMOFlags() to compute the
1551// address mode flags of a particular node. Afterwards, the computed address
1552// flags are passed into getAddrModeForFlags() in order to retrieve the optimal
1553// addressing mode. SelectOptimalAddrMode() then sets the Base and Displacement
1554// accordingly, based on the preferred addressing mode.
1555//
1556// Within PPCISelLowering.h, there are two enums: MemOpFlags and AddrMode.
1557// MemOpFlags contains all the possible flags that can be used to compute the
1558// optimal addressing mode for load and store instructions.
1559// AddrMode contains all the possible load and store addressing modes available
1560// on Power (such as DForm, DSForm, DQForm, XForm, etc.)
1561//
1562// When adding new load and store instructions, it is possible that new address
1563// flags may need to be added into MemOpFlags, and a new addressing mode will
1564// need to be added to AddrMode. An entry of the new addressing mode (consisting
1565// of the minimal and main distinguishing address flags for the new load/store
1566// instructions) will need to be added into initializeAddrModeMap() below.
1567 // Finally, when adding new addressing modes, getAddrModeForFlags() will
1568 // need to be updated to account for selecting the optimal addressing mode.
1569// *****************************************************************************
1570/// Initialize the map that relates the different addressing modes of the load
1571/// and store instructions to a set of flags. This ensures the load/store
1572/// instruction is correctly matched during instruction selection.
1573void PPCTargetLowering::initializeAddrModeMap() {
1574 AddrModesMap[PPC::AM_DForm] = {
1575 // LWZ, STW
1576 PPC::MOF_ZExt | PPC::MOF_RPlusSImm16 | PPC::MOF_WordInt,
1577 PPC::MOF_ZExt | PPC::MOF_RPlusLo | PPC::MOF_WordInt,
1578 PPC::MOF_ZExt | PPC::MOF_NotAddNorCst | PPC::MOF_WordInt,
1579 PPC::MOF_ZExt | PPC::MOF_AddrIsSImm32 | PPC::MOF_WordInt,
1580 // LBZ, LHZ, STB, STH
1581 PPC::MOF_ZExt | PPC::MOF_RPlusSImm16 | PPC::MOF_SubWordInt,
1582 PPC::MOF_ZExt | PPC::MOF_RPlusLo | PPC::MOF_SubWordInt,
1583 PPC::MOF_ZExt | PPC::MOF_NotAddNorCst | PPC::MOF_SubWordInt,
1584 PPC::MOF_ZExt | PPC::MOF_AddrIsSImm32 | PPC::MOF_SubWordInt,
1585 // LHA
1586 PPC::MOF_SExt | PPC::MOF_RPlusSImm16 | PPC::MOF_SubWordInt,
1587 PPC::MOF_SExt | PPC::MOF_RPlusLo | PPC::MOF_SubWordInt,
1588 PPC::MOF_SExt | PPC::MOF_NotAddNorCst | PPC::MOF_SubWordInt,
1589 PPC::MOF_SExt | PPC::MOF_AddrIsSImm32 | PPC::MOF_SubWordInt,
1590 // LFS, LFD, STFS, STFD
1591 PPC::MOF_RPlusSImm16 | PPC::MOF_ScalarFloat | PPC::MOF_SubtargetBeforeP9,
1592 PPC::MOF_RPlusLo | PPC::MOF_ScalarFloat | PPC::MOF_SubtargetBeforeP9,
1593 PPC::MOF_NotAddNorCst | PPC::MOF_ScalarFloat | PPC::MOF_SubtargetBeforeP9,
1594 PPC::MOF_AddrIsSImm32 | PPC::MOF_ScalarFloat | PPC::MOF_SubtargetBeforeP9,
1595 };
1596 AddrModesMap[PPC::AM_DSForm] = {
1597 // LWA
1598 PPC::MOF_SExt | PPC::MOF_RPlusSImm16Mult4 | PPC::MOF_WordInt,
1599 PPC::MOF_SExt | PPC::MOF_NotAddNorCst | PPC::MOF_WordInt,
1600 PPC::MOF_SExt | PPC::MOF_AddrIsSImm32 | PPC::MOF_WordInt,
1601 // LD, STD
1602 PPC::MOF_RPlusSImm16Mult4 | PPC::MOF_DoubleWordInt,
1603 PPC::MOF_NotAddNorCst | PPC::MOF_DoubleWordInt,
1604 PPC::MOF_AddrIsSImm32 | PPC::MOF_DoubleWordInt,
1605 // DFLOADf32, DFLOADf64, DSTOREf32, DSTOREf64
1606 PPC::MOF_RPlusSImm16Mult4 | PPC::MOF_ScalarFloat | PPC::MOF_SubtargetP9,
1607 PPC::MOF_NotAddNorCst | PPC::MOF_ScalarFloat | PPC::MOF_SubtargetP9,
1608 PPC::MOF_AddrIsSImm32 | PPC::MOF_ScalarFloat | PPC::MOF_SubtargetP9,
1609 };
1610 AddrModesMap[PPC::AM_DQForm] = {
1611 // LXV, STXV
1612 PPC::MOF_RPlusSImm16Mult16 | PPC::MOF_Vector | PPC::MOF_SubtargetP9,
1613 PPC::MOF_NotAddNorCst | PPC::MOF_Vector | PPC::MOF_SubtargetP9,
1614 PPC::MOF_AddrIsSImm32 | PPC::MOF_Vector | PPC::MOF_SubtargetP9,
1615 };
1616 AddrModesMap[PPC::AM_PrefixDForm] = {PPC::MOF_RPlusSImm34 |
1617 PPC::MOF_SubtargetP10};
1618 // TODO: Add mapping for quadword load/store.
1619}
1620
1621/// getMaxByValAlign - Helper for getByValTypeAlignment to determine
1622/// the desired ByVal argument alignment.
1623static void getMaxByValAlign(Type *Ty, Align &MaxAlign, Align MaxMaxAlign) {
1624 if (MaxAlign == MaxMaxAlign)
1625 return;
1626 if (VectorType *VTy = dyn_cast<VectorType>(Val: Ty)) {
1627 if (MaxMaxAlign >= 32 &&
1628 VTy->getPrimitiveSizeInBits().getFixedValue() >= 256)
1629 MaxAlign = Align(32);
1630 else if (VTy->getPrimitiveSizeInBits().getFixedValue() >= 128 &&
1631 MaxAlign < 16)
1632 MaxAlign = Align(16);
1633 } else if (ArrayType *ATy = dyn_cast<ArrayType>(Val: Ty)) {
1634 Align EltAlign;
1635 getMaxByValAlign(Ty: ATy->getElementType(), MaxAlign&: EltAlign, MaxMaxAlign);
1636 if (EltAlign > MaxAlign)
1637 MaxAlign = EltAlign;
1638 } else if (StructType *STy = dyn_cast<StructType>(Val: Ty)) {
1639 for (auto *EltTy : STy->elements()) {
1640 Align EltAlign;
1641 getMaxByValAlign(Ty: EltTy, MaxAlign&: EltAlign, MaxMaxAlign);
1642 if (EltAlign > MaxAlign)
1643 MaxAlign = EltAlign;
1644 if (MaxAlign == MaxMaxAlign)
1645 break;
1646 }
1647 }
1648}
1649
1650/// getByValTypeAlignment - Return the desired alignment for ByVal aggregate
1651/// function arguments in the caller parameter area.
1652Align PPCTargetLowering::getByValTypeAlignment(Type *Ty,
1653 const DataLayout &DL) const {
1654 // 16-byte and wider vectors are passed on a 16-byte boundary.
1655 // Everything else is aligned to an 8-byte boundary on PPC64 and a 4-byte boundary on PPC32.
1656 Align Alignment = Subtarget.isPPC64() ? Align(8) : Align(4);
1657 if (Subtarget.hasAltivec())
1658 getMaxByValAlign(Ty, MaxAlign&: Alignment, MaxMaxAlign: Align(16));
1659 return Alignment;
1660}
1661
1662bool PPCTargetLowering::useSoftFloat() const {
1663 return Subtarget.useSoftFloat();
1664}
1665
1666bool PPCTargetLowering::hasSPE() const {
1667 return Subtarget.hasSPE();
1668}
1669
1670bool PPCTargetLowering::preferIncOfAddToSubOfNot(EVT VT) const {
1671 return VT.isScalarInteger();
1672}
1673
1674bool PPCTargetLowering::shallExtractConstSplatVectorElementToStore(
1675 Type *VectorTy, unsigned ElemSizeInBits, unsigned &Index) const {
1676 if (!Subtarget.isPPC64() || !Subtarget.hasVSX())
1677 return false;
1678
1679 if (auto *VTy = dyn_cast<VectorType>(Val: VectorTy)) {
1680 if (VTy->getScalarType()->isIntegerTy()) {
1681 // Elements of 8/16 bits fit in an immediate field, so they are not handled here.
1682 if (ElemSizeInBits == 32) {
1683 Index = Subtarget.isLittleEndian() ? 2 : 1;
1684 return true;
1685 }
1686 if (ElemSizeInBits == 64) {
1687 Index = Subtarget.isLittleEndian() ? 1 : 0;
1688 return true;
1689 }
1690 }
1691 }
1692 return false;
1693}
1694
1695EVT PPCTargetLowering::getSetCCResultType(const DataLayout &DL, LLVMContext &C,
1696 EVT VT) const {
1697 if (!VT.isVector())
1698 return Subtarget.useCRBits() ? MVT::i1 : MVT::i32;
1699
1700 return VT.changeVectorElementTypeToInteger();
1701}
1702
1703bool PPCTargetLowering::enableAggressiveFMAFusion(EVT VT) const {
1704 assert(VT.isFloatingPoint() && "Non-floating-point FMA?");
1705 return true;
1706}
1707
1708//===----------------------------------------------------------------------===//
1709// Node matching predicates, for use by the tblgen matching code.
1710//===----------------------------------------------------------------------===//
1711
1712/// isFloatingPointZero - Return true if this is 0.0 or -0.0.
1713static bool isFloatingPointZero(SDValue Op) {
1714 if (ConstantFPSDNode *CFP = dyn_cast<ConstantFPSDNode>(Val&: Op))
1715 return CFP->getValueAPF().isZero();
1716 else if (ISD::isEXTLoad(N: Op.getNode()) || ISD::isNON_EXTLoad(N: Op.getNode())) {
1717 // Maybe this has already been legalized into the constant pool?
1718 if (ConstantPoolSDNode *CP = dyn_cast<ConstantPoolSDNode>(Val: Op.getOperand(i: 1)))
1719 if (const ConstantFP *CFP = dyn_cast<ConstantFP>(Val: CP->getConstVal()))
1720 return CFP->getValueAPF().isZero();
1721 }
1722 return false;
1723}
1724
1725/// isConstantOrUndef - Op is either an undef node or a ConstantSDNode. Return
1726/// true if Op is undef or if it matches the specified value.
1727static bool isConstantOrUndef(int Op, int Val) {
1728 return Op < 0 || Op == Val;
1729}
1730
1731/// isVPKUHUMShuffleMask - Return true if this is the shuffle mask for a
1732/// VPKUHUM instruction.
1733/// The ShuffleKind distinguishes between big-endian operations with
1734/// two different inputs (0), either-endian operations with two identical
1735/// inputs (1), and little-endian operations with two different inputs (2).
1736/// For the latter, the input operands are swapped (see PPCInstrAltivec.td).
1737bool PPC::isVPKUHUMShuffleMask(ShuffleVectorSDNode *N, unsigned ShuffleKind,
1738 SelectionDAG &DAG) {
1739 bool IsLE = DAG.getDataLayout().isLittleEndian();
1740 if (ShuffleKind == 0) {
1741 if (IsLE)
1742 return false;
1743 for (unsigned i = 0; i != 16; ++i)
1744 if (!isConstantOrUndef(Op: N->getMaskElt(Idx: i), Val: i*2+1))
1745 return false;
1746 } else if (ShuffleKind == 2) {
1747 if (!IsLE)
1748 return false;
1749 for (unsigned i = 0; i != 16; ++i)
1750 if (!isConstantOrUndef(Op: N->getMaskElt(Idx: i), Val: i*2))
1751 return false;
1752 } else if (ShuffleKind == 1) {
1753 unsigned j = IsLE ? 0 : 1;
1754 for (unsigned i = 0; i != 8; ++i)
1755 if (!isConstantOrUndef(Op: N->getMaskElt(Idx: i), Val: i*2+j) ||
1756 !isConstantOrUndef(Op: N->getMaskElt(Idx: i+8), Val: i*2+j))
1757 return false;
1758 }
1759 return true;
1760}
1761
1762/// isVPKUWUMShuffleMask - Return true if this is the shuffle mask for a
1763/// VPKUWUM instruction.
1764/// The ShuffleKind distinguishes between big-endian operations with
1765/// two different inputs (0), either-endian operations with two identical
1766/// inputs (1), and little-endian operations with two different inputs (2).
1767/// For the latter, the input operands are swapped (see PPCInstrAltivec.td).
1768bool PPC::isVPKUWUMShuffleMask(ShuffleVectorSDNode *N, unsigned ShuffleKind,
1769 SelectionDAG &DAG) {
1770 bool IsLE = DAG.getDataLayout().isLittleEndian();
1771 if (ShuffleKind == 0) {
1772 if (IsLE)
1773 return false;
1774 for (unsigned i = 0; i != 16; i += 2)
1775 if (!isConstantOrUndef(Op: N->getMaskElt(Idx: i ), Val: i*2+2) ||
1776 !isConstantOrUndef(Op: N->getMaskElt(Idx: i+1), Val: i*2+3))
1777 return false;
1778 } else if (ShuffleKind == 2) {
1779 if (!IsLE)
1780 return false;
1781 for (unsigned i = 0; i != 16; i += 2)
1782 if (!isConstantOrUndef(Op: N->getMaskElt(Idx: i ), Val: i*2) ||
1783 !isConstantOrUndef(Op: N->getMaskElt(Idx: i+1), Val: i*2+1))
1784 return false;
1785 } else if (ShuffleKind == 1) {
1786 unsigned j = IsLE ? 0 : 2;
1787 for (unsigned i = 0; i != 8; i += 2)
1788 if (!isConstantOrUndef(Op: N->getMaskElt(Idx: i ), Val: i*2+j) ||
1789 !isConstantOrUndef(Op: N->getMaskElt(Idx: i+1), Val: i*2+j+1) ||
1790 !isConstantOrUndef(Op: N->getMaskElt(Idx: i+8), Val: i*2+j) ||
1791 !isConstantOrUndef(Op: N->getMaskElt(Idx: i+9), Val: i*2+j+1))
1792 return false;
1793 }
1794 return true;
1795}
1796
1797/// isVPKUDUMShuffleMask - Return true if this is the shuffle mask for a
1798/// VPKUDUM instruction, AND the VPKUDUM instruction exists for the
1799/// current subtarget.
1800///
1801/// The ShuffleKind distinguishes between big-endian operations with
1802/// two different inputs (0), either-endian operations with two identical
1803/// inputs (1), and little-endian operations with two different inputs (2).
1804/// For the latter, the input operands are swapped (see PPCInstrAltivec.td).
1805bool PPC::isVPKUDUMShuffleMask(ShuffleVectorSDNode *N, unsigned ShuffleKind,
1806 SelectionDAG &DAG) {
1807 const PPCSubtarget &Subtarget = DAG.getSubtarget<PPCSubtarget>();
1808 if (!Subtarget.hasP8Vector())
1809 return false;
1810
1811 bool IsLE = DAG.getDataLayout().isLittleEndian();
1812 if (ShuffleKind == 0) {
1813 if (IsLE)
1814 return false;
1815 for (unsigned i = 0; i != 16; i += 4)
1816 if (!isConstantOrUndef(Op: N->getMaskElt(Idx: i ), Val: i*2+4) ||
1817 !isConstantOrUndef(Op: N->getMaskElt(Idx: i+1), Val: i*2+5) ||
1818 !isConstantOrUndef(Op: N->getMaskElt(Idx: i+2), Val: i*2+6) ||
1819 !isConstantOrUndef(Op: N->getMaskElt(Idx: i+3), Val: i*2+7))
1820 return false;
1821 } else if (ShuffleKind == 2) {
1822 if (!IsLE)
1823 return false;
1824 for (unsigned i = 0; i != 16; i += 4)
1825 if (!isConstantOrUndef(Op: N->getMaskElt(Idx: i ), Val: i*2) ||
1826 !isConstantOrUndef(Op: N->getMaskElt(Idx: i+1), Val: i*2+1) ||
1827 !isConstantOrUndef(Op: N->getMaskElt(Idx: i+2), Val: i*2+2) ||
1828 !isConstantOrUndef(Op: N->getMaskElt(Idx: i+3), Val: i*2+3))
1829 return false;
1830 } else if (ShuffleKind == 1) {
1831 unsigned j = IsLE ? 0 : 4;
1832 for (unsigned i = 0; i != 8; i += 4)
1833 if (!isConstantOrUndef(Op: N->getMaskElt(Idx: i ), Val: i*2+j) ||
1834 !isConstantOrUndef(Op: N->getMaskElt(Idx: i+1), Val: i*2+j+1) ||
1835 !isConstantOrUndef(Op: N->getMaskElt(Idx: i+2), Val: i*2+j+2) ||
1836 !isConstantOrUndef(Op: N->getMaskElt(Idx: i+3), Val: i*2+j+3) ||
1837 !isConstantOrUndef(Op: N->getMaskElt(Idx: i+8), Val: i*2+j) ||
1838 !isConstantOrUndef(Op: N->getMaskElt(Idx: i+9), Val: i*2+j+1) ||
1839 !isConstantOrUndef(Op: N->getMaskElt(Idx: i+10), Val: i*2+j+2) ||
1840 !isConstantOrUndef(Op: N->getMaskElt(Idx: i+11), Val: i*2+j+3))
1841 return false;
1842 }
1843 return true;
1844}
1845
1846/// isVMerge - Common function, used to match vmrg* shuffles.
1847///
1848static bool isVMerge(ShuffleVectorSDNode *N, unsigned UnitSize,
1849 unsigned LHSStart, unsigned RHSStart) {
1850 if (N->getValueType(ResNo: 0) != MVT::v16i8)
1851 return false;
1852 assert((UnitSize == 1 || UnitSize == 2 || UnitSize == 4) &&
1853 "Unsupported merge size!");
1854
1855 for (unsigned i = 0; i != 8/UnitSize; ++i) // Step over units
1856 for (unsigned j = 0; j != UnitSize; ++j) { // Step over bytes within unit
1857 if (!isConstantOrUndef(Op: N->getMaskElt(Idx: i*UnitSize*2+j),
1858 Val: LHSStart+j+i*UnitSize) ||
1859 !isConstantOrUndef(Op: N->getMaskElt(Idx: i*UnitSize*2+UnitSize+j),
1860 Val: RHSStart+j+i*UnitSize))
1861 return false;
1862 }
1863 return true;
1864}
1865
1866/// isVMRGLShuffleMask - Return true if this is a shuffle mask suitable for
1867/// a VMRGL* instruction with the specified unit size (1,2 or 4 bytes).
1868/// The ShuffleKind distinguishes between big-endian merges with two
1869/// different inputs (0), either-endian merges with two identical inputs (1),
1870/// and little-endian merges with two different inputs (2). For the latter,
1871/// the input operands are swapped (see PPCInstrAltivec.td).
1872bool PPC::isVMRGLShuffleMask(ShuffleVectorSDNode *N, unsigned UnitSize,
1873 unsigned ShuffleKind, SelectionDAG &DAG) {
1874 if (DAG.getDataLayout().isLittleEndian()) {
1875 if (ShuffleKind == 1) // unary
1876 return isVMerge(N, UnitSize, LHSStart: 0, RHSStart: 0);
1877 else if (ShuffleKind == 2) // swapped
1878 return isVMerge(N, UnitSize, LHSStart: 0, RHSStart: 16);
1879 else
1880 return false;
1881 } else {
1882 if (ShuffleKind == 1) // unary
1883 return isVMerge(N, UnitSize, LHSStart: 8, RHSStart: 8);
1884 else if (ShuffleKind == 0) // normal
1885 return isVMerge(N, UnitSize, LHSStart: 8, RHSStart: 24);
1886 else
1887 return false;
1888 }
1889}
1890
1891/// isVMRGHShuffleMask - Return true if this is a shuffle mask suitable for
1892/// a VMRGH* instruction with the specified unit size (1,2 or 4 bytes).
1893/// The ShuffleKind distinguishes between big-endian merges with two
1894/// different inputs (0), either-endian merges with two identical inputs (1),
1895/// and little-endian merges with two different inputs (2). For the latter,
1896/// the input operands are swapped (see PPCInstrAltivec.td).
1897bool PPC::isVMRGHShuffleMask(ShuffleVectorSDNode *N, unsigned UnitSize,
1898 unsigned ShuffleKind, SelectionDAG &DAG) {
1899 if (DAG.getDataLayout().isLittleEndian()) {
1900 if (ShuffleKind == 1) // unary
1901 return isVMerge(N, UnitSize, LHSStart: 8, RHSStart: 8);
1902 else if (ShuffleKind == 2) // swapped
1903 return isVMerge(N, UnitSize, LHSStart: 8, RHSStart: 24);
1904 else
1905 return false;
1906 } else {
1907 if (ShuffleKind == 1) // unary
1908 return isVMerge(N, UnitSize, LHSStart: 0, RHSStart: 0);
1909 else if (ShuffleKind == 0) // normal
1910 return isVMerge(N, UnitSize, LHSStart: 0, RHSStart: 16);
1911 else
1912 return false;
1913 }
1914}
1915
1916/**
1917 * Common function used to match vmrgew and vmrgow shuffles
1918 *
1919 * The indexOffset determines whether to look for even or odd words in
1920 * the shuffle mask. This is based on the endianness of the target
1921 * machine.
1922 * - Little Endian:
1923 * - Use offset of 0 to check for odd elements
1924 * - Use offset of 4 to check for even elements
1925 * - Big Endian:
1926 * - Use offset of 0 to check for even elements
1927 * - Use offset of 4 to check for odd elements
1928 * A detailed description of the vector element ordering for little endian and
1929 * big endian can be found at
1930 * http://www.ibm.com/developerworks/library/l-ibm-xl-c-cpp-compiler/index.html
1931 * Targeting your applications - what little endian and big endian IBM XL C/C++
1932 * compiler differences mean to you
1933 *
1934 * The mask to the shuffle vector instruction specifies the indices of the
1935 * elements from the two input vectors to place in the result. The elements are
1936 * numbered in array-access order, starting with the first vector. These vectors
1937 * are always of type v16i8, thus each vector will contain 16 elements of
1938 * 8 bits each. More info on the shuffle vector can be found in the
1939 * http://llvm.org/docs/LangRef.html#shufflevector-instruction
1940 * Language Reference.
1941 *
1942 * The RHSStartValue indicates whether the same input vectors are used (unary)
1943 * or two different input vectors are used, based on the following:
1944 * - If the instruction uses the same vector for both inputs, the range of the
1945 * indices will be 0 to 15. In this case, the RHSStart value passed should
1946 * be 0.
1947 * - If the instruction has two different vectors then the range of the
1948 * indices will be 0 to 31. In this case, the RHSStart value passed should
1949 * be 16 (indices 0-15 specify elements in the first vector while indices 16
1950 * to 31 specify elements in the second vector).
1951 *
1952 * \param[in] N The shuffle vector SD Node to analyze
1953 * \param[in] IndexOffset Specifies whether to look for even or odd elements
1954 * \param[in] RHSStartValue Specifies the starting index for the righthand input
1955 * vector to the shuffle_vector instruction
1956 * \return true iff this shuffle vector represents an even or odd word merge
1957 */
1958static bool isVMerge(ShuffleVectorSDNode *N, unsigned IndexOffset,
1959 unsigned RHSStartValue) {
1960 if (N->getValueType(ResNo: 0) != MVT::v16i8)
1961 return false;
1962
1963 for (unsigned i = 0; i < 2; ++i)
1964 for (unsigned j = 0; j < 4; ++j)
1965 if (!isConstantOrUndef(Op: N->getMaskElt(Idx: i*4+j),
1966 Val: i*RHSStartValue+j+IndexOffset) ||
1967 !isConstantOrUndef(Op: N->getMaskElt(Idx: i*4+j+8),
1968 Val: i*RHSStartValue+j+IndexOffset+8))
1969 return false;
1970 return true;
1971}
1972
1973/**
1974 * Determine if the specified shuffle mask is suitable for the vmrgew or
1975 * vmrgow instructions.
1976 *
1977 * \param[in] N The shuffle vector SD Node to analyze
1978 * \param[in] CheckEven Check for an even merge (true) or an odd merge (false)
1979 * \param[in] ShuffleKind Identify the type of merge:
1980 * - 0 = big-endian merge with two different inputs;
1981 * - 1 = either-endian merge with two identical inputs;
1982 * - 2 = little-endian merge with two different inputs (inputs are swapped for
1983 * little-endian merges).
1984 * \param[in] DAG The current SelectionDAG
1985 * \return true iff this shuffle mask is suitable for a vmrgew or vmrgow instruction
1986 */
1987bool PPC::isVMRGEOShuffleMask(ShuffleVectorSDNode *N, bool CheckEven,
1988 unsigned ShuffleKind, SelectionDAG &DAG) {
1989 if (DAG.getDataLayout().isLittleEndian()) {
1990 unsigned indexOffset = CheckEven ? 4 : 0;
1991 if (ShuffleKind == 1) // Unary
1992 return isVMerge(N, IndexOffset: indexOffset, RHSStartValue: 0);
1993 else if (ShuffleKind == 2) // swapped
1994 return isVMerge(N, IndexOffset: indexOffset, RHSStartValue: 16);
1995 else
1996 return false;
1997 }
1998 else {
1999 unsigned indexOffset = CheckEven ? 0 : 4;
2000 if (ShuffleKind == 1) // Unary
2001 return isVMerge(N, IndexOffset: indexOffset, RHSStartValue: 0);
2002 else if (ShuffleKind == 0) // Normal
2003 return isVMerge(N, IndexOffset: indexOffset, RHSStartValue: 16);
2004 else
2005 return false;
2006 }
2007 return false;
2008}
2009
2010/// isVSLDOIShuffleMask - If this is a vsldoi shuffle mask, return the shift
2011/// amount, otherwise return -1.
2012/// The ShuffleKind distinguishes between big-endian operations with two
2013/// different inputs (0), either-endian operations with two identical inputs
2014/// (1), and little-endian operations with two different inputs (2). For the
2015/// latter, the input operands are swapped (see PPCInstrAltivec.td).
2016int PPC::isVSLDOIShuffleMask(SDNode *N, unsigned ShuffleKind,
2017 SelectionDAG &DAG) {
2018 if (N->getValueType(ResNo: 0) != MVT::v16i8)
2019 return -1;
2020
2021 ShuffleVectorSDNode *SVOp = cast<ShuffleVectorSDNode>(Val: N);
2022
2023 // Find the first non-undef value in the shuffle mask.
2024 unsigned i;
2025 for (i = 0; i != 16 && SVOp->getMaskElt(Idx: i) < 0; ++i)
2026 /*search*/;
2027
2028 if (i == 16) return -1; // all undef.
2029
2030 // Otherwise, check to see if the rest of the elements are consecutively
2031 // numbered from this value.
2032 unsigned ShiftAmt = SVOp->getMaskElt(Idx: i);
2033 if (ShiftAmt < i) return -1;
2034
2035 ShiftAmt -= i;
2036 bool isLE = DAG.getDataLayout().isLittleEndian();
2037
2038 if ((ShuffleKind == 0 && !isLE) || (ShuffleKind == 2 && isLE)) {
2039 // Check the rest of the elements to see if they are consecutive.
2040 for (++i; i != 16; ++i)
2041 if (!isConstantOrUndef(Op: SVOp->getMaskElt(Idx: i), Val: ShiftAmt+i))
2042 return -1;
2043 } else if (ShuffleKind == 1) {
2044 // Check the rest of the elements to see if they are consecutive.
2045 for (++i; i != 16; ++i)
2046 if (!isConstantOrUndef(Op: SVOp->getMaskElt(Idx: i), Val: (ShiftAmt+i) & 15))
2047 return -1;
2048 } else
2049 return -1;
2050
2051 if (isLE)
2052 ShiftAmt = 16 - ShiftAmt;
2053
2054 return ShiftAmt;
2055}
2056
2057/// isSplatShuffleMask - Return true if the specified VECTOR_SHUFFLE operand
2058/// specifies a splat of a single element that is suitable for input to
2059/// one of the splat operations (VSPLTB/VSPLTH/VSPLTW/XXSPLTW/LXVDSX/etc.).
2060bool PPC::isSplatShuffleMask(ShuffleVectorSDNode *N, unsigned EltSize) {
2061 EVT VT = N->getValueType(ResNo: 0);
2062 if (VT == MVT::v2i64 || VT == MVT::v2f64)
2063 return EltSize == 8 && N->getMaskElt(Idx: 0) == N->getMaskElt(Idx: 1);
2064
2065 assert(VT == MVT::v16i8 && isPowerOf2_32(EltSize) &&
2066 EltSize <= 8 && "Can only handle 1,2,4,8 byte element sizes");
2067
2068 // The consecutive indices need to specify an element, not part of two
2069 // different elements. So abandon ship early if this isn't the case.
2070 if (N->getMaskElt(Idx: 0) % EltSize != 0)
2071 return false;
2072
2073 // This is a splat operation if each element of the permute is the same, and
2074 // if the value doesn't reference the second vector.
2075 unsigned ElementBase = N->getMaskElt(Idx: 0);
2076
2077 // FIXME: Handle UNDEF elements too!
2078 if (ElementBase >= 16)
2079 return false;
2080
2081 // Check that the indices are consecutive, in the case of a multi-byte element
2082 // splatted with a v16i8 mask.
2083 for (unsigned i = 1; i != EltSize; ++i)
2084 if (N->getMaskElt(Idx: i) < 0 || N->getMaskElt(Idx: i) != (int)(i+ElementBase))
2085 return false;
2086
2087 for (unsigned i = EltSize, e = 16; i != e; i += EltSize) {
2088 // An UNDEF element is a sequence of UNDEF bytes.
2089 if (N->getMaskElt(Idx: i) < 0) {
2090 for (unsigned j = 1; j != EltSize; ++j)
2091 if (N->getMaskElt(Idx: i + j) >= 0)
2092 return false;
2093 } else
2094 for (unsigned j = 0; j != EltSize; ++j)
2095 if (N->getMaskElt(Idx: i + j) != N->getMaskElt(Idx: j))
2096 return false;
2097 }
2098 return true;
2099}
2100
2101/// Check that the mask is shuffling N byte elements. Within each N byte
2102/// element of the mask, the indices could be either in increasing or
2103/// decreasing order as long as they are consecutive.
2104/// \param[in] N the shuffle vector SD Node to analyze
2105/// \param[in] Width the element width in bytes, could be 2/4/8/16 (HalfWord/
2106/// Word/DoubleWord/QuadWord).
2107 /// \param[in] StepLen the delta between consecutive indices within each N-byte
2108 /// element: 1 if the mask is in increasing order, -1 if decreasing.
2109/// \return true iff the mask is shuffling N byte elements.
2110static bool isNByteElemShuffleMask(ShuffleVectorSDNode *N, unsigned Width,
2111 int StepLen) {
2112 assert((Width == 2 || Width == 4 || Width == 8 || Width == 16) &&
2113 "Unexpected element width.");
2114 assert((StepLen == 1 || StepLen == -1) && "Unexpected step length.");
2115
2116 unsigned NumOfElem = 16 / Width;
2117 unsigned MaskVal[16]; // Width is never greater than 16
2118 for (unsigned i = 0; i < NumOfElem; ++i) {
2119 MaskVal[0] = N->getMaskElt(Idx: i * Width);
2120 if ((StepLen == 1) && (MaskVal[0] % Width)) {
2121 return false;
2122 } else if ((StepLen == -1) && ((MaskVal[0] + 1) % Width)) {
2123 return false;
2124 }
2125
2126 for (unsigned int j = 1; j < Width; ++j) {
2127 MaskVal[j] = N->getMaskElt(Idx: i * Width + j);
2128 if (MaskVal[j] != MaskVal[j-1] + StepLen) {
2129 return false;
2130 }
2131 }
2132 }
2133
2134 return true;
2135}
2136
2137bool PPC::isXXINSERTWMask(ShuffleVectorSDNode *N, unsigned &ShiftElts,
2138 unsigned &InsertAtByte, bool &Swap, bool IsLE) {
2139 if (!isNByteElemShuffleMask(N, Width: 4, StepLen: 1))
2140 return false;
2141
2142 // Now we look at mask elements 0,4,8,12
2143 unsigned M0 = N->getMaskElt(Idx: 0) / 4;
2144 unsigned M1 = N->getMaskElt(Idx: 4) / 4;
2145 unsigned M2 = N->getMaskElt(Idx: 8) / 4;
2146 unsigned M3 = N->getMaskElt(Idx: 12) / 4;
2147 unsigned LittleEndianShifts[] = { 2, 1, 0, 3 };
2148 unsigned BigEndianShifts[] = { 3, 0, 1, 2 };
2149
2150 // Below, let H and L be arbitrary elements of the shuffle mask
2151 // where H is in the range [4,7] and L is in the range [0,3].
2152 // H, 1, 2, 3 or L, 5, 6, 7
2153 if ((M0 > 3 && M1 == 1 && M2 == 2 && M3 == 3) ||
2154 (M0 < 4 && M1 == 5 && M2 == 6 && M3 == 7)) {
2155 ShiftElts = IsLE ? LittleEndianShifts[M0 & 0x3] : BigEndianShifts[M0 & 0x3];
2156 InsertAtByte = IsLE ? 12 : 0;
2157 Swap = M0 < 4;
2158 return true;
2159 }
2160 // 0, H, 2, 3 or 4, L, 6, 7
2161 if ((M1 > 3 && M0 == 0 && M2 == 2 && M3 == 3) ||
2162 (M1 < 4 && M0 == 4 && M2 == 6 && M3 == 7)) {
2163 ShiftElts = IsLE ? LittleEndianShifts[M1 & 0x3] : BigEndianShifts[M1 & 0x3];
2164 InsertAtByte = IsLE ? 8 : 4;
2165 Swap = M1 < 4;
2166 return true;
2167 }
2168 // 0, 1, H, 3 or 4, 5, L, 7
2169 if ((M2 > 3 && M0 == 0 && M1 == 1 && M3 == 3) ||
2170 (M2 < 4 && M0 == 4 && M1 == 5 && M3 == 7)) {
2171 ShiftElts = IsLE ? LittleEndianShifts[M2 & 0x3] : BigEndianShifts[M2 & 0x3];
2172 InsertAtByte = IsLE ? 4 : 8;
2173 Swap = M2 < 4;
2174 return true;
2175 }
2176 // 0, 1, 2, H or 4, 5, 6, L
2177 if ((M3 > 3 && M0 == 0 && M1 == 1 && M2 == 2) ||
2178 (M3 < 4 && M0 == 4 && M1 == 5 && M2 == 6)) {
2179 ShiftElts = IsLE ? LittleEndianShifts[M3 & 0x3] : BigEndianShifts[M3 & 0x3];
2180 InsertAtByte = IsLE ? 0 : 12;
2181 Swap = M3 < 4;
2182 return true;
2183 }
2184
2185 // If both vector operands for the shuffle are the same vector, the mask will
2186 // contain only elements from the first one and the second one will be undef.
2187 if (N->getOperand(Num: 1).isUndef()) {
2188 ShiftElts = 0;
2189 Swap = true;
2190 unsigned XXINSERTWSrcElem = IsLE ? 2 : 1;
2191 if (M0 == XXINSERTWSrcElem && M1 == 1 && M2 == 2 && M3 == 3) {
2192 InsertAtByte = IsLE ? 12 : 0;
2193 return true;
2194 }
2195 if (M0 == 0 && M1 == XXINSERTWSrcElem && M2 == 2 && M3 == 3) {
2196 InsertAtByte = IsLE ? 8 : 4;
2197 return true;
2198 }
2199 if (M0 == 0 && M1 == 1 && M2 == XXINSERTWSrcElem && M3 == 3) {
2200 InsertAtByte = IsLE ? 4 : 8;
2201 return true;
2202 }
2203 if (M0 == 0 && M1 == 1 && M2 == 2 && M3 == XXINSERTWSrcElem) {
2204 InsertAtByte = IsLE ? 0 : 12;
2205 return true;
2206 }
2207 }
2208
2209 return false;
2210}
2211
2212bool PPC::isXXSLDWIShuffleMask(ShuffleVectorSDNode *N, unsigned &ShiftElts,
2213 bool &Swap, bool IsLE) {
2214 assert(N->getValueType(0) == MVT::v16i8 && "Shuffle vector expects v16i8");
2215 // Ensure each byte index of the word is consecutive.
2216 if (!isNByteElemShuffleMask(N, Width: 4, StepLen: 1))
2217 return false;
2218
2219 // Now we look at mask elements 0,4,8,12, which are the beginning of words.
2220 unsigned M0 = N->getMaskElt(Idx: 0) / 4;
2221 unsigned M1 = N->getMaskElt(Idx: 4) / 4;
2222 unsigned M2 = N->getMaskElt(Idx: 8) / 4;
2223 unsigned M3 = N->getMaskElt(Idx: 12) / 4;
2224
2225 // If both vector operands for the shuffle are the same vector, the mask will
2226 // contain only elements from the first one and the second one will be undef.
2227 if (N->getOperand(Num: 1).isUndef()) {
2228 assert(M0 < 4 && "Indexing into an undef vector?");
2229 if (M1 != (M0 + 1) % 4 || M2 != (M1 + 1) % 4 || M3 != (M2 + 1) % 4)
2230 return false;
2231
2232 ShiftElts = IsLE ? (4 - M0) % 4 : M0;
2233 Swap = false;
2234 return true;
2235 }
2236
2237 // Ensure each word index of the ShuffleVector Mask is consecutive.
2238 if (M1 != (M0 + 1) % 8 || M2 != (M1 + 1) % 8 || M3 != (M2 + 1) % 8)
2239 return false;
2240
2241 if (IsLE) {
2242 if (M0 == 0 || M0 == 7 || M0 == 6 || M0 == 5) {
2243 // Input vectors don't need to be swapped if the leading element
2244 // of the result is one of the 3 left elements of the second vector
2245 // (or if there is no shift to be done at all).
2246 Swap = false;
2247 ShiftElts = (8 - M0) % 8;
2248 } else if (M0 == 4 || M0 == 3 || M0 == 2 || M0 == 1) {
2249 // Input vectors need to be swapped if the leading element
2250 // of the result is one of the 3 left elements of the first vector
2251 // (or if we're shifting by 4 - thereby simply swapping the vectors).
2252 Swap = true;
2253 ShiftElts = (4 - M0) % 4;
2254 }
2255
2256 return true;
2257 } else { // BE
2258 if (M0 == 0 || M0 == 1 || M0 == 2 || M0 == 3) {
2259 // Input vectors don't need to be swapped if the leading element
2260 // of the result is one of the 4 elements of the first vector.
2261 Swap = false;
2262 ShiftElts = M0;
2263 } else if (M0 == 4 || M0 == 5 || M0 == 6 || M0 == 7) {
2264 // Input vectors need to be swapped if the leading element
2265 // of the result is one of the 4 elements of the right vector.
2266 Swap = true;
2267 ShiftElts = M0 - 4;
2268 }
2269
2270 return true;
2271 }
2272}
2273
2274bool static isXXBRShuffleMaskHelper(ShuffleVectorSDNode *N, int Width) {
2275 assert(N->getValueType(0) == MVT::v16i8 && "Shuffle vector expects v16i8");
2276
2277 if (!isNByteElemShuffleMask(N, Width, StepLen: -1))
2278 return false;
2279
2280 for (int i = 0; i < 16; i += Width)
2281 if (N->getMaskElt(Idx: i) != i + Width - 1)
2282 return false;
2283
2284 return true;
2285}
2286
2287bool PPC::isXXBRHShuffleMask(ShuffleVectorSDNode *N) {
2288 return isXXBRShuffleMaskHelper(N, Width: 2);
2289}
2290
2291bool PPC::isXXBRWShuffleMask(ShuffleVectorSDNode *N) {
2292 return isXXBRShuffleMaskHelper(N, Width: 4);
2293}
2294
2295bool PPC::isXXBRDShuffleMask(ShuffleVectorSDNode *N) {
2296 return isXXBRShuffleMaskHelper(N, Width: 8);
2297}
2298
2299bool PPC::isXXBRQShuffleMask(ShuffleVectorSDNode *N) {
2300 return isXXBRShuffleMaskHelper(N, Width: 16);
2301}
2302
2303/// Can node \p N be lowered to an XXPERMDI instruction? If so, set \p Swap
2304/// if the inputs to the instruction should be swapped and set \p DM to the
2305/// value for the immediate.
2306/// Specifically, set \p Swap to true only if \p N can be lowered to XXPERMDI
2307/// AND element 0 of the result comes from the first input (LE) or second input
2308/// (BE). Set \p DM to the calculated result (0-3) only if \p N can be lowered.
2309/// \return true iff the given mask of shuffle node \p N is an XXPERMDI shuffle
2310/// mask.
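/// For example (an illustrative trace of the code below): with \p IsLE set and
/// an undef second operand, doubleword indices M0 == 1 and M1 == 0 yield
/// DM == (((~0) & 1) << 1) + ((~1) & 1) == 2 and \p Swap == false.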
2311bool PPC::isXXPERMDIShuffleMask(ShuffleVectorSDNode *N, unsigned &DM,
2312 bool &Swap, bool IsLE) {
2313 assert(N->getValueType(0) == MVT::v16i8 && "Shuffle vector expects v16i8");
2314
2315 // Ensure each byte index of the double word is consecutive.
2316 if (!isNByteElemShuffleMask(N, Width: 8, StepLen: 1))
2317 return false;
2318
2319 unsigned M0 = N->getMaskElt(Idx: 0) / 8;
2320 unsigned M1 = N->getMaskElt(Idx: 8) / 8;
2321 assert(((M0 | M1) < 4) && "A mask element out of bounds?");
2322
2323 // If both vector operands for the shuffle are the same vector, the mask will
2324 // contain only elements from the first one and the second one will be undef.
2325 if (N->getOperand(Num: 1).isUndef()) {
2326 if ((M0 | M1) < 2) {
2327 DM = IsLE ? (((~M1) & 1) << 1) + ((~M0) & 1) : (M0 << 1) + (M1 & 1);
2328 Swap = false;
2329 return true;
2330 } else
2331 return false;
2332 }
2333
2334 if (IsLE) {
2335 if (M0 > 1 && M1 < 2) {
2336 Swap = false;
2337 } else if (M0 < 2 && M1 > 1) {
2338 M0 = (M0 + 2) % 4;
2339 M1 = (M1 + 2) % 4;
2340 Swap = true;
2341 } else
2342 return false;
2343
2344 // Note: if control flow reaches this point, Swap has already been set above.
2345 DM = (((~M1) & 1) << 1) + ((~M0) & 1);
2346 return true;
2347 } else { // BE
2348 if (M0 < 2 && M1 > 1) {
2349 Swap = false;
2350 } else if (M0 > 1 && M1 < 2) {
2351 M0 = (M0 + 2) % 4;
2352 M1 = (M1 + 2) % 4;
2353 Swap = true;
2354 } else
2355 return false;
2356
2357 // Note: if control flow reaches this point, Swap has already been set above.
2358 DM = (M0 << 1) + (M1 & 1);
2359 return true;
2360 }
2361}
2362
2363
2364/// getSplatIdxForPPCMnemonics - Return the splat index as a value that is
2365/// appropriate for PPC mnemonics (which have a big endian bias - namely
2366/// elements are counted from the left of the vector register).
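/// For example (illustrative), a v4i32 splat of vector element 0
/// (EltSize == 4) yields 3 on little-endian subtargets and 0 on big-endian
/// subtargets.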
2367unsigned PPC::getSplatIdxForPPCMnemonics(SDNode *N, unsigned EltSize,
2368 SelectionDAG &DAG) {
2369 ShuffleVectorSDNode *SVOp = cast<ShuffleVectorSDNode>(Val: N);
2370 assert(isSplatShuffleMask(SVOp, EltSize));
2371 EVT VT = SVOp->getValueType(ResNo: 0);
2372
2373 if (VT == MVT::v2i64 || VT == MVT::v2f64)
2374 return DAG.getDataLayout().isLittleEndian() ? 1 - SVOp->getMaskElt(Idx: 0)
2375 : SVOp->getMaskElt(Idx: 0);
2376
2377 if (DAG.getDataLayout().isLittleEndian())
2378 return (16 / EltSize) - 1 - (SVOp->getMaskElt(Idx: 0) / EltSize);
2379 else
2380 return SVOp->getMaskElt(Idx: 0) / EltSize;
2381}
2382
2383/// get_VSPLTI_elt - If this is a build_vector of constants which can be formed
2384/// by using a vspltis[bhw] instruction of the specified element size, return
2385/// the constant being splatted. The ByteSize field indicates the number of
2386/// bytes of each element [124] -> [bhw].
2387SDValue PPC::get_VSPLTI_elt(SDNode *N, unsigned ByteSize, SelectionDAG &DAG) {
2388 SDValue OpVal;
2389
2390 // If ByteSize of the splat is bigger than the element size of the
2391 // build_vector, then we have a case where we are checking for a splat where
2392 // multiple elements of the buildvector are folded together into a single
2393 // logical element of the splat (e.g. "vspltish 1" to splat {0,1}*8).
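  // For instance (an illustrative trace): a v8i16 build_vector repeating the
  // halfword pair {0, 5}, queried with ByteSize == 4, gives EltSize == 2 and
  // Multiple == 2; the leading chunk entry is 0 and the final entry is 5, so
  // the constant 5 is returned below.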
2394 unsigned EltSize = 16/N->getNumOperands();
2395 if (EltSize < ByteSize) {
2396 unsigned Multiple = ByteSize/EltSize; // Number of BV entries per spltval.
2397 SDValue UniquedVals[4];
2398 assert(Multiple > 1 && Multiple <= 4 && "How can this happen?");
2399
2400 // See if the corresponding elements in each chunk of the buildvector agree.
2401 for (unsigned i = 0, e = N->getNumOperands(); i != e; ++i) {
2402 if (N->getOperand(Num: i).isUndef()) continue;
2403 // If the element isn't a constant, bail fully out.
2404 if (!isa<ConstantSDNode>(Val: N->getOperand(Num: i))) return SDValue();
2405
2406 if (!UniquedVals[i&(Multiple-1)].getNode())
2407 UniquedVals[i&(Multiple-1)] = N->getOperand(Num: i);
2408 else if (UniquedVals[i&(Multiple-1)] != N->getOperand(Num: i))
2409 return SDValue(); // no match.
2410 }
2411
2412 // Okay, if we reached this point, UniquedVals[0..Multiple-1] contains
2413 // either constant or undef values that are identical for each chunk. See
2414 // if these chunks can form into a larger vspltis*.
2415
2416 // Check to see if all of the leading entries are either 0 or -1. If
2417 // neither, then this won't fit into the immediate field.
2418 bool LeadingZero = true;
2419 bool LeadingOnes = true;
2420 for (unsigned i = 0; i != Multiple-1; ++i) {
2421 if (!UniquedVals[i].getNode()) continue; // Must have been undefs.
2422
2423 LeadingZero &= isNullConstant(V: UniquedVals[i]);
2424 LeadingOnes &= isAllOnesConstant(V: UniquedVals[i]);
2425 }
2426 // Finally, check the least significant entry.
2427 if (LeadingZero) {
2428 if (!UniquedVals[Multiple-1].getNode())
2429 return DAG.getTargetConstant(Val: 0, DL: SDLoc(N), VT: MVT::i32); // 0,0,0,undef
2430 int Val = UniquedVals[Multiple - 1]->getAsZExtVal();
2431 if (Val < 16) // 0,0,0,4 -> vspltisw(4)
2432 return DAG.getTargetConstant(Val, DL: SDLoc(N), VT: MVT::i32);
2433 }
2434 if (LeadingOnes) {
2435 if (!UniquedVals[Multiple-1].getNode())
2436 return DAG.getTargetConstant(Val: ~0U, DL: SDLoc(N), VT: MVT::i32); // -1,-1,-1,undef
2437 int Val = cast<ConstantSDNode>(Val&: UniquedVals[Multiple-1])->getSExtValue();
2438 if (Val >= -16) // -1,-1,-1,-2 -> vspltisw(-2)
2439 return DAG.getTargetConstant(Val, DL: SDLoc(N), VT: MVT::i32);
2440 }
2441
2442 return SDValue();
2443 }
2444
2445 // Check to see if this buildvec has a single non-undef value in its elements.
2446 for (unsigned i = 0, e = N->getNumOperands(); i != e; ++i) {
2447 if (N->getOperand(Num: i).isUndef()) continue;
2448 if (!OpVal.getNode())
2449 OpVal = N->getOperand(Num: i);
2450 else if (OpVal != N->getOperand(Num: i))
2451 return SDValue();
2452 }
2453
2454 if (!OpVal.getNode()) return SDValue(); // All UNDEF: use implicit def.
2455
2456 unsigned ValSizeInBytes = EltSize;
2457 uint64_t Value = 0;
2458 if (ConstantSDNode *CN = dyn_cast<ConstantSDNode>(Val&: OpVal)) {
2459 Value = CN->getZExtValue();
2460 } else if (ConstantFPSDNode *CN = dyn_cast<ConstantFPSDNode>(Val&: OpVal)) {
2461 assert(CN->getValueType(0) == MVT::f32 && "Only one legal FP vector type!");
2462 Value = llvm::bit_cast<uint32_t>(from: CN->getValueAPF().convertToFloat());
2463 }
2464
2465 // If the splat value is larger than the element value, then we can never do
2466 // this splat. The only case where the replicated bits could fit into our
2467 // immediate field would be zero, and we prefer to use vxor for that.
2468 if (ValSizeInBytes < ByteSize) return SDValue();
2469
2470 // If the element value is larger than the splat value, check if it consists
2471 // of a repeated bit pattern of size ByteSize.
2472 if (!APInt(ValSizeInBytes * 8, Value).isSplat(SplatSizeInBits: ByteSize * 8))
2473 return SDValue();
2474
2475 // Properly sign extend the value.
2476 int MaskVal = SignExtend32(X: Value, B: ByteSize * 8);
2477
2478 // If this is zero, don't match; zero matches ISD::isBuildVectorAllZeros.
2479 if (MaskVal == 0) return SDValue();
2480
2481 // Finally, if this value fits in a 5 bit sext field, return it
2482 if (SignExtend32<5>(X: MaskVal) == MaskVal)
2483 return DAG.getSignedTargetConstant(Val: MaskVal, DL: SDLoc(N), VT: MVT::i32);
2484 return SDValue();
2485}
2486
2487//===----------------------------------------------------------------------===//
2488// Addressing Mode Selection
2489//===----------------------------------------------------------------------===//
2490
2491/// isIntS16Immediate - This method tests whether the node is a 32-bit or
2492/// 64-bit constant and whether its value can be accurately represented as a
2493/// sign extension from a 16-bit value. If so, this returns true and sets Imm
2494/// to the immediate.
2495bool llvm::isIntS16Immediate(SDNode *N, int16_t &Imm) {
2496 if (!isa<ConstantSDNode>(Val: N))
2497 return false;
2498
2499 Imm = (int16_t)N->getAsZExtVal();
2500 if (N->getValueType(ResNo: 0) == MVT::i32)
2501 return Imm == (int32_t)N->getAsZExtVal();
2502 else
2503 return Imm == (int64_t)N->getAsZExtVal();
2504}
2505bool llvm::isIntS16Immediate(SDValue Op, int16_t &Imm) {
2506 return isIntS16Immediate(N: Op.getNode(), Imm);
2507}
2508
2509/// Used when computing address flags for selecting loads and stores.
2510/// If we have an OR, check if the LHS and RHS are provably disjoint.
2511/// An OR of two provably disjoint values is equivalent to an ADD.
2512/// Most PPC load/store instructions compute the effective address as a sum,
2513/// so doing this conversion is useful.
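/// For example (illustrative), when the low four bits of the LHS are known to
/// be zero, (X << 4) | 3 computes the same value as (X << 4) + 3.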
2514static bool provablyDisjointOr(SelectionDAG &DAG, const SDValue &N) {
2515 if (N.getOpcode() != ISD::OR)
2516 return false;
2517 KnownBits LHSKnown = DAG.computeKnownBits(Op: N.getOperand(i: 0));
2518 if (!LHSKnown.Zero.getBoolValue())
2519 return false;
2520 KnownBits RHSKnown = DAG.computeKnownBits(Op: N.getOperand(i: 1));
2521 return (~(LHSKnown.Zero | RHSKnown.Zero) == 0);
2522}
2523
2524/// SelectAddressEVXRegReg - Given the specified address, check to see if it can
2525/// be represented as an indexed [r+r] operation.
2526bool PPCTargetLowering::SelectAddressEVXRegReg(SDValue N, SDValue &Base,
2527 SDValue &Index,
2528 SelectionDAG &DAG) const {
2529 for (SDNode *U : N->users()) {
2530 if (MemSDNode *Memop = dyn_cast<MemSDNode>(Val: U)) {
2531 if (Memop->getMemoryVT() == MVT::f64) {
2532 Base = N.getOperand(i: 0);
2533 Index = N.getOperand(i: 1);
2534 return true;
2535 }
2536 }
2537 }
2538 return false;
2539}
2540
2541/// isIntS34Immediate - This method tests whether the value of the given node
2542/// can be accurately represented as a sign extension from a 34-bit value. If
2543/// so, this returns true and sets Imm to the immediate.
2544bool llvm::isIntS34Immediate(SDNode *N, int64_t &Imm) {
2545 if (!isa<ConstantSDNode>(Val: N))
2546 return false;
2547
2548 Imm = cast<ConstantSDNode>(Val: N)->getSExtValue();
2549 return isInt<34>(x: Imm);
2550}
2551bool llvm::isIntS34Immediate(SDValue Op, int64_t &Imm) {
2552 return isIntS34Immediate(N: Op.getNode(), Imm);
2553}
2554
2555/// SelectAddressRegReg - Given the specified address, check to see if it
2556/// can be represented as an indexed [r+r] operation. Returns false if it
2557/// can be more efficiently represented as [r+imm]. If \p EncodingAlignment is
2558/// non-zero and N can be represented by a base register plus a signed 16-bit
2559/// displacement, make a more precise judgement by checking (displacement % \p
2560/// EncodingAlignment).
2561bool PPCTargetLowering::SelectAddressRegReg(
2562 SDValue N, SDValue &Base, SDValue &Index, SelectionDAG &DAG,
2563 MaybeAlign EncodingAlignment) const {
2564 // If we have a PC Relative target flag don't select as [reg+reg]. It will be
2565 // a [pc+imm].
2566 if (SelectAddressPCRel(N, Base))
2567 return false;
2568
2569 int16_t Imm = 0;
2570 if (N.getOpcode() == ISD::ADD) {
2571 // SPE f64 load/store cannot handle a 16-bit offset; SPE load/store only
2572 // handles 8-bit offsets, so check for that case first.
2573 if (hasSPE() && SelectAddressEVXRegReg(N, Base, Index, DAG))
2574 return true;
2575 if (isIntS16Immediate(Op: N.getOperand(i: 1), Imm) &&
2576 (!EncodingAlignment || isAligned(Lhs: *EncodingAlignment, SizeInBytes: Imm)))
2577 return false; // r+i
2578 if (N.getOperand(i: 1).getOpcode() == PPCISD::Lo)
2579 return false; // r+i
2580
2581 Base = N.getOperand(i: 0);
2582 Index = N.getOperand(i: 1);
2583 return true;
2584 } else if (N.getOpcode() == ISD::OR) {
2585 if (isIntS16Immediate(Op: N.getOperand(i: 1), Imm) &&
2586 (!EncodingAlignment || isAligned(Lhs: *EncodingAlignment, SizeInBytes: Imm)))
2587 return false; // Prefer r+i if we can fold the immediate.
2588
2589 // If this is an or of disjoint bitfields, we can codegen this as an add
2590 // (for better address arithmetic) if the LHS and RHS of the OR are provably
2591 // disjoint.
2592 KnownBits LHSKnown = DAG.computeKnownBits(Op: N.getOperand(i: 0));
2593
2594 if (LHSKnown.Zero.getBoolValue()) {
2595 KnownBits RHSKnown = DAG.computeKnownBits(Op: N.getOperand(i: 1));
2596 // If all of the bits are known zero on the LHS or RHS, the add won't
2597 // carry.
2598 if (~(LHSKnown.Zero | RHSKnown.Zero) == 0) {
2599 Base = N.getOperand(i: 0);
2600 Index = N.getOperand(i: 1);
2601 return true;
2602 }
2603 }
2604 }
2605
2606 return false;
2607}
2608
2609// If we happen to be doing an i64 load or store into a stack slot that has
2610// less than a 4-byte alignment, then the frame-index elimination may need to
2611// use an indexed load or store instruction (because the offset may not be a
2612// multiple of 4). The extra register needed to hold the offset comes from the
2613// register scavenger, and it is possible that the scavenger will need to use
2614// an emergency spill slot. As a result, we need to make sure that a spill slot
2615// is allocated when doing an i64 load/store into a less-than-4-byte-aligned
2616// stack slot.
2617static void fixupFuncForFI(SelectionDAG &DAG, int FrameIdx, EVT VT) {
2618 // FIXME: This does not handle the LWA case.
2619 if (VT != MVT::i64)
2620 return;
2621
2622 // NOTE: We'll exclude negative FIs here, which come from argument
2623 // lowering, because there are no known test cases triggering this problem
2624 // using packed structures (or similar). We can remove this exclusion if
2625 // we find such a test case. The reason why this is so test-case driven is
2626 // because this entire 'fixup' is only to prevent crashes (from the
2627 // register scavenger) on not-really-valid inputs. For example, if we have:
2628 // %a = alloca i1
2629 // %b = bitcast i1* %a to i64*
2630 // store i64 0, i64* %b
2631 // then the store should really be marked as 'align 1', but is not. If it
2632 // were marked as 'align 1' then the indexed form would have been
2633 // instruction-selected initially, and the problem this 'fixup' is preventing
2634 // won't happen regardless.
2635 if (FrameIdx < 0)
2636 return;
2637
2638 MachineFunction &MF = DAG.getMachineFunction();
2639 MachineFrameInfo &MFI = MF.getFrameInfo();
2640
2641 if (MFI.getObjectAlign(ObjectIdx: FrameIdx) >= Align(4))
2642 return;
2643
2644 PPCFunctionInfo *FuncInfo = MF.getInfo<PPCFunctionInfo>();
2645 FuncInfo->setHasNonRISpills();
2646}
2647
2648/// Returns true if the address N can be represented by a base register plus
2649/// a signed 16-bit displacement [r+imm], and if it is not better
2650/// represented as reg+reg. If \p EncodingAlignment is non-zero, only accept
2651/// displacements that are multiples of that value.
2652bool PPCTargetLowering::SelectAddressRegImm(
2653 SDValue N, SDValue &Disp, SDValue &Base, SelectionDAG &DAG,
2654 MaybeAlign EncodingAlignment) const {
2655 // FIXME dl should come from parent load or store, not from address
2656 SDLoc dl(N);
2657
2658 // If we have a PC Relative target flag don't select as [reg+imm]. It will be
2659 // a [pc+imm].
2660 if (SelectAddressPCRel(N, Base))
2661 return false;
2662
2663 // If this can be more profitably realized as r+r, fail.
2664 if (SelectAddressRegReg(N, Base&: Disp, Index&: Base, DAG, EncodingAlignment))
2665 return false;
2666
2667 if (N.getOpcode() == ISD::ADD) {
2668 int16_t imm = 0;
2669 if (isIntS16Immediate(Op: N.getOperand(i: 1), Imm&: imm) &&
2670 (!EncodingAlignment || isAligned(Lhs: *EncodingAlignment, SizeInBytes: imm))) {
2671 Disp = DAG.getSignedTargetConstant(Val: imm, DL: dl, VT: N.getValueType());
2672 if (FrameIndexSDNode *FI = dyn_cast<FrameIndexSDNode>(Val: N.getOperand(i: 0))) {
2673 Base = DAG.getTargetFrameIndex(FI: FI->getIndex(), VT: N.getValueType());
2674 fixupFuncForFI(DAG, FrameIdx: FI->getIndex(), VT: N.getValueType());
2675 } else {
2676 Base = N.getOperand(i: 0);
2677 }
2678 return true; // [r+i]
2679 } else if (N.getOperand(i: 1).getOpcode() == PPCISD::Lo) {
2680 // Match LOAD (ADD (X, Lo(G))).
2681 assert(!N.getOperand(1).getConstantOperandVal(1) &&
2682 "Cannot handle constant offsets yet!");
2683 Disp = N.getOperand(i: 1).getOperand(i: 0); // The global address.
2684 assert(Disp.getOpcode() == ISD::TargetGlobalAddress ||
2685 Disp.getOpcode() == ISD::TargetGlobalTLSAddress ||
2686 Disp.getOpcode() == ISD::TargetConstantPool ||
2687 Disp.getOpcode() == ISD::TargetJumpTable);
2688 Base = N.getOperand(i: 0);
2689 return true; // [&g+r]
2690 }
2691 } else if (N.getOpcode() == ISD::OR) {
2692 int16_t imm = 0;
2693 if (isIntS16Immediate(Op: N.getOperand(i: 1), Imm&: imm) &&
2694 (!EncodingAlignment || isAligned(Lhs: *EncodingAlignment, SizeInBytes: imm))) {
2695 // If this is an or of disjoint bitfields, we can codegen this as an add
2696 // (for better address arithmetic) if the LHS and RHS of the OR are
2697 // provably disjoint.
2698 KnownBits LHSKnown = DAG.computeKnownBits(Op: N.getOperand(i: 0));
2699
2700 if ((LHSKnown.Zero.getZExtValue()|~(uint64_t)imm) == ~0ULL) {
2701 // If all of the bits are known zero on the LHS or RHS, the add won't
2702 // carry.
2703 if (FrameIndexSDNode *FI =
2704 dyn_cast<FrameIndexSDNode>(Val: N.getOperand(i: 0))) {
2705 Base = DAG.getTargetFrameIndex(FI: FI->getIndex(), VT: N.getValueType());
2706 fixupFuncForFI(DAG, FrameIdx: FI->getIndex(), VT: N.getValueType());
2707 } else {
2708 Base = N.getOperand(i: 0);
2709 }
2710 Disp = DAG.getTargetConstant(Val: imm, DL: dl, VT: N.getValueType());
2711 return true;
2712 }
2713 }
2714 } else if (ConstantSDNode *CN = dyn_cast<ConstantSDNode>(Val&: N)) {
2715 // Loading from a constant address.
2716
2717 // If this address fits entirely in a 16-bit sext immediate field, codegen
2718 // this as "d, 0"
2719 int16_t Imm;
2720 if (isIntS16Immediate(N: CN, Imm) &&
2721 (!EncodingAlignment || isAligned(Lhs: *EncodingAlignment, SizeInBytes: Imm))) {
2722 Disp = DAG.getTargetConstant(Val: Imm, DL: dl, VT: CN->getValueType(ResNo: 0));
2723 Base = DAG.getRegister(Reg: Subtarget.isPPC64() ? PPC::ZERO8 : PPC::ZERO,
2724 VT: CN->getValueType(ResNo: 0));
2725 return true;
2726 }
2727
2728 // Handle 32-bit sext immediates with LIS + addr mode.
2729 if ((CN->getValueType(ResNo: 0) == MVT::i32 ||
2730 (int64_t)CN->getZExtValue() == (int)CN->getZExtValue()) &&
2731 (!EncodingAlignment ||
2732 isAligned(Lhs: *EncodingAlignment, SizeInBytes: CN->getZExtValue()))) {
2733 int Addr = (int)CN->getZExtValue();
2734
2735 // Otherwise, break this down into an LIS + disp.
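    // For example (illustrative): Addr == 0x12348000 gives Disp == -32768 and
    // Addr - (signed short)Addr == 0x12350000, so the LIS immediate is 0x1235;
    // LIS 0x1235 followed by a -32768 displacement reconstructs the original
    // address.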
2736 Disp = DAG.getTargetConstant(Val: (short)Addr, DL: dl, VT: MVT::i32);
2737
2738 Base = DAG.getTargetConstant(Val: (Addr - (signed short)Addr) >> 16, DL: dl,
2739 VT: MVT::i32);
2740 unsigned Opc = CN->getValueType(ResNo: 0) == MVT::i32 ? PPC::LIS : PPC::LIS8;
2741 Base = SDValue(DAG.getMachineNode(Opcode: Opc, dl, VT: CN->getValueType(ResNo: 0), Op1: Base), 0);
2742 return true;
2743 }
2744 }
2745
2746 Disp = DAG.getTargetConstant(Val: 0, DL: dl, VT: getPointerTy(DL: DAG.getDataLayout()));
2747 if (FrameIndexSDNode *FI = dyn_cast<FrameIndexSDNode>(Val&: N)) {
2748 Base = DAG.getTargetFrameIndex(FI: FI->getIndex(), VT: N.getValueType());
2749 fixupFuncForFI(DAG, FrameIdx: FI->getIndex(), VT: N.getValueType());
2750 } else
2751 Base = N;
2752 return true; // [r+0]
2753}
2754
2755/// Similar to the 16-bit case but for instructions that take a 34-bit
2756/// displacement field (prefixed loads/stores).
2757bool PPCTargetLowering::SelectAddressRegImm34(SDValue N, SDValue &Disp,
2758 SDValue &Base,
2759 SelectionDAG &DAG) const {
2760 // Only on 64-bit targets.
2761 if (N.getValueType() != MVT::i64)
2762 return false;
2763
2764 SDLoc dl(N);
2765 int64_t Imm = 0;
2766
2767 if (N.getOpcode() == ISD::ADD) {
2768 if (!isIntS34Immediate(Op: N.getOperand(i: 1), Imm))
2769 return false;
2770 Disp = DAG.getSignedTargetConstant(Val: Imm, DL: dl, VT: N.getValueType());
2771 if (FrameIndexSDNode *FI = dyn_cast<FrameIndexSDNode>(Val: N.getOperand(i: 0)))
2772 Base = DAG.getTargetFrameIndex(FI: FI->getIndex(), VT: N.getValueType());
2773 else
2774 Base = N.getOperand(i: 0);
2775 return true;
2776 }
2777
2778 if (N.getOpcode() == ISD::OR) {
2779 if (!isIntS34Immediate(Op: N.getOperand(i: 1), Imm))
2780 return false;
2781 // If this is an or of disjoint bitfields, we can codegen this as an add
2782 // (for better address arithmetic) if the LHS and RHS of the OR are
2783 // provably disjoint.
2784 KnownBits LHSKnown = DAG.computeKnownBits(Op: N.getOperand(i: 0));
2785 if ((LHSKnown.Zero.getZExtValue() | ~(uint64_t)Imm) != ~0ULL)
2786 return false;
2787 if (FrameIndexSDNode *FI = dyn_cast<FrameIndexSDNode>(Val: N.getOperand(i: 0)))
2788 Base = DAG.getTargetFrameIndex(FI: FI->getIndex(), VT: N.getValueType());
2789 else
2790 Base = N.getOperand(i: 0);
2791 Disp = DAG.getSignedTargetConstant(Val: Imm, DL: dl, VT: N.getValueType());
2792 return true;
2793 }
2794
2795 if (isIntS34Immediate(Op: N, Imm)) { // If the address is a 34-bit const.
2796 Disp = DAG.getSignedTargetConstant(Val: Imm, DL: dl, VT: N.getValueType());
2797 Base = DAG.getRegister(Reg: PPC::ZERO8, VT: N.getValueType());
2798 return true;
2799 }
2800
2801 return false;
2802}
2803
2804/// SelectAddressRegRegOnly - Given the specified address, force it to be
2805/// represented as an indexed [r+r] operation.
2806bool PPCTargetLowering::SelectAddressRegRegOnly(SDValue N, SDValue &Base,
2807 SDValue &Index,
2808 SelectionDAG &DAG) const {
2809 // Check to see if we can easily represent this as an [r+r] address. This
2810 // will fail if it thinks that the address is more profitably represented as
2811 // reg+imm, e.g. where imm = 0.
2812 if (SelectAddressRegReg(N, Base, Index, DAG))
2813 return true;
2814
2815 // If the address is the result of an add, we will utilize the fact that the
2816 // address calculation includes an implicit add. However, we can reduce
2817 // register pressure if we do not materialize a constant just for use as the
2818 // index register. Therefore we only fold the add when it is not an add of a
2819 // value and a 16-bit signed constant where both operands have a single use.
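  // For example (illustrative), for (add X, (shl Y, 2)) we return Base == X
  // and Index == (shl Y, 2), so the implicit add of the [r+r] form does the
  // work and no separate add instruction is needed.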
2820 int16_t imm = 0;
2821 if (N.getOpcode() == ISD::ADD &&
2822 (!isIntS16Immediate(Op: N.getOperand(i: 1), Imm&: imm) ||
2823 !N.getOperand(i: 1).hasOneUse() || !N.getOperand(i: 0).hasOneUse())) {
2824 Base = N.getOperand(i: 0);
2825 Index = N.getOperand(i: 1);
2826 return true;
2827 }
2828
2829 // Otherwise, do it the hard way, using R0 as the base register.
2830 Base = DAG.getRegister(Reg: Subtarget.isPPC64() ? PPC::ZERO8 : PPC::ZERO,
2831 VT: N.getValueType());
2832 Index = N;
2833 return true;
2834}
2835
2836template <typename Ty> static bool isValidPCRelNode(SDValue N) {
2837 Ty *PCRelCand = dyn_cast<Ty>(N);
2838 return PCRelCand && (PPCInstrInfo::hasPCRelFlag(TF: PCRelCand->getTargetFlags()));
2839}
2840
2841/// Returns true if this address is a PC Relative address.
2842/// PC Relative addresses are marked with the flag PPCII::MO_PCREL_FLAG
2843/// or if the node opcode is PPCISD::MAT_PCREL_ADDR.
2844bool PPCTargetLowering::SelectAddressPCRel(SDValue N, SDValue &Base) const {
2845 // This is a materialize PC Relative node. Always select this as PC Relative.
2846 Base = N;
2847 if (N.getOpcode() == PPCISD::MAT_PCREL_ADDR)
2848 return true;
2849 if (isValidPCRelNode<ConstantPoolSDNode>(N) ||
2850 isValidPCRelNode<GlobalAddressSDNode>(N) ||
2851 isValidPCRelNode<JumpTableSDNode>(N) ||
2852 isValidPCRelNode<BlockAddressSDNode>(N))
2853 return true;
2854 return false;
2855}
2856
2857/// Returns true if we should use a direct load-to-vector instruction
2858/// (such as lxsd or lfd) instead of a load-into-GPR + direct-move sequence.
2859static bool usePartialVectorLoads(SDNode *N, const PPCSubtarget &ST) {
2860
2861 // If there are any uses other than scalar-to-vector, then we should keep
2862 // the value as a scalar load -> direct move pattern to prevent multiple
2863 // loads.
2864 LoadSDNode *LD = dyn_cast<LoadSDNode>(Val: N);
2865 if (!LD)
2866 return false;
2867
2868 EVT MemVT = LD->getMemoryVT();
2869 if (!MemVT.isSimple())
2870 return false;
2871 switch (MemVT.getSimpleVT().SimpleTy) {
2872 case MVT::i64:
2873 break;
2874 case MVT::i32:
2875 if (!ST.hasP8Vector())
2876 return false;
2877 break;
2878 case MVT::i16:
2879 case MVT::i8:
2880 if (!ST.hasP9Vector())
2881 return false;
2882 break;
2883 default:
2884 return false;
2885 }
2886
2887 SDValue LoadedVal(N, 0);
2888 if (!LoadedVal.hasOneUse())
2889 return false;
2890
2891 for (SDUse &Use : LD->uses())
2892 if (Use.getResNo() == 0 &&
2893 Use.getUser()->getOpcode() != ISD::SCALAR_TO_VECTOR &&
2894 Use.getUser()->getOpcode() != PPCISD::SCALAR_TO_VECTOR_PERMUTED)
2895 return false;
2896
2897 return true;
2898}
2899
2900/// getPreIndexedAddressParts - Returns true, and sets the base pointer,
2901/// offset pointer, and addressing mode by reference, if the node's address
2902/// can be legally represented as a pre-indexed load/store address.
2903bool PPCTargetLowering::getPreIndexedAddressParts(SDNode *N, SDValue &Base,
2904 SDValue &Offset,
2905 ISD::MemIndexedMode &AM,
2906 SelectionDAG &DAG) const {
2907 if (DisablePPCPreinc) return false;
2908
2909 bool isLoad = true;
2910 SDValue Ptr;
2911 EVT VT;
2912 Align Alignment;
2913 if (LoadSDNode *LD = dyn_cast<LoadSDNode>(Val: N)) {
2914 Ptr = LD->getBasePtr();
2915 VT = LD->getMemoryVT();
2916 Alignment = LD->getAlign();
2917 } else if (StoreSDNode *ST = dyn_cast<StoreSDNode>(Val: N)) {
2918 Ptr = ST->getBasePtr();
2919 VT = ST->getMemoryVT();
2920 Alignment = ST->getAlign();
2921 isLoad = false;
2922 } else
2923 return false;
2924
2925 // Do not generate pre-inc forms for specific loads that feed scalar_to_vector
2926 // instructions because we can fold these into a more efficient instruction
2927 // instead (such as LXSD).
2928 if (isLoad && usePartialVectorLoads(N, ST: Subtarget)) {
2929 return false;
2930 }
2931
2932 // PowerPC doesn't have preinc load/store instructions for vectors
2933 if (VT.isVector())
2934 return false;
2935
2936 if (SelectAddressRegReg(N: Ptr, Base, Index&: Offset, DAG)) {
2937 // Common code will reject creating a pre-inc form if the base pointer
2938 // is a frame index, or if N is a store and the base pointer is either
2939 // the same as or a predecessor of the value being stored. Check for
2940 // those situations here, and try with swapped Base/Offset instead.
2941 bool Swap = false;
2942
2943 if (isa<FrameIndexSDNode>(Val: Base) || isa<RegisterSDNode>(Val: Base))
2944 Swap = true;
2945 else if (!isLoad) {
2946 SDValue Val = cast<StoreSDNode>(Val: N)->getValue();
2947 if (Val == Base || Base.getNode()->isPredecessorOf(N: Val.getNode()))
2948 Swap = true;
2949 }
2950
2951 if (Swap)
2952 std::swap(a&: Base, b&: Offset);
2953
2954 AM = ISD::PRE_INC;
2955 return true;
2956 }
2957
2958 // LDU/STU can only handle immediates that are a multiple of 4.
2959 if (VT != MVT::i64) {
2960 if (!SelectAddressRegImm(N: Ptr, Disp&: Offset, Base, DAG, EncodingAlignment: std::nullopt))
2961 return false;
2962 } else {
2963 // LDU/STU need an address with at least 4-byte alignment.
2964 if (Alignment < Align(4))
2965 return false;
2966
2967 if (!SelectAddressRegImm(N: Ptr, Disp&: Offset, Base, DAG, EncodingAlignment: Align(4)))
2968 return false;
2969 }
2970
2971 if (LoadSDNode *LD = dyn_cast<LoadSDNode>(Val: N)) {
2972 // PPC64 doesn't have lwau, but it does have lwaux. Reject preinc load of
2973 // sext i32 to i64 when addr mode is r+i.
2974 if (LD->getValueType(ResNo: 0) == MVT::i64 && LD->getMemoryVT() == MVT::i32 &&
2975 LD->getExtensionType() == ISD::SEXTLOAD &&
2976 isa<ConstantSDNode>(Val: Offset))
2977 return false;
2978 }
2979
2980 AM = ISD::PRE_INC;
2981 return true;
2982}
2983
2984//===----------------------------------------------------------------------===//
2985// LowerOperation implementation
2986//===----------------------------------------------------------------------===//
2987
2988/// Set HiOpFlags and LoOpFlags to the target MO flags used when referencing
2989/// a label, selecting the PIC variants when generating position-independent code.
2990static void getLabelAccessInfo(bool IsPIC, const PPCSubtarget &Subtarget,
2991 unsigned &HiOpFlags, unsigned &LoOpFlags,
2992 const GlobalValue *GV = nullptr) {
2993 HiOpFlags = PPCII::MO_HA;
2994 LoOpFlags = PPCII::MO_LO;
2995
2996 // Use the PIC-specific flags only when in the PIC relocation model.
2997 if (IsPIC) {
2998 HiOpFlags = PPCII::MO_PIC_HA_FLAG;
2999 LoOpFlags = PPCII::MO_PIC_LO_FLAG;
3000 }
3001}
3002
3003static SDValue LowerLabelRef(SDValue HiPart, SDValue LoPart, bool isPIC,
3004 SelectionDAG &DAG) {
3005 SDLoc DL(HiPart);
3006 EVT PtrVT = HiPart.getValueType();
3007 SDValue Zero = DAG.getConstant(Val: 0, DL, VT: PtrVT);
3008
3009 SDValue Hi = DAG.getNode(Opcode: PPCISD::Hi, DL, VT: PtrVT, N1: HiPart, N2: Zero);
3010 SDValue Lo = DAG.getNode(Opcode: PPCISD::Lo, DL, VT: PtrVT, N1: LoPart, N2: Zero);
3011
3012 // With PIC, the first instruction is actually "GR+hi(&G)".
3013 if (isPIC)
3014 Hi = DAG.getNode(Opcode: ISD::ADD, DL, VT: PtrVT,
3015 N1: DAG.getNode(Opcode: PPCISD::GlobalBaseReg, DL, VT: PtrVT), N2: Hi);
3016
3017 // Generate non-pic code that has direct accesses to the constant pool.
3018 // The address of the global is just (hi(&g)+lo(&g)).
3019 return DAG.getNode(Opcode: ISD::ADD, DL, VT: PtrVT, N1: Hi, N2: Lo);
3020}
3021
3022static void setUsesTOCBasePtr(MachineFunction &MF) {
3023 PPCFunctionInfo *FuncInfo = MF.getInfo<PPCFunctionInfo>();
3024 FuncInfo->setUsesTOCBasePtr();
3025}
3026
3027static void setUsesTOCBasePtr(SelectionDAG &DAG) {
3028 setUsesTOCBasePtr(DAG.getMachineFunction());
3029}
3030
3031SDValue PPCTargetLowering::getTOCEntry(SelectionDAG &DAG, const SDLoc &dl,
3032 SDValue GA) const {
3033 EVT VT = Subtarget.getScalarIntVT();
3034 SDValue Reg = Subtarget.isPPC64() ? DAG.getRegister(Reg: PPC::X2, VT)
3035 : Subtarget.isAIXABI()
3036 ? DAG.getRegister(Reg: PPC::R2, VT)
3037 : DAG.getNode(Opcode: PPCISD::GlobalBaseReg, DL: dl, VT);
3038 SDValue Ops[] = { GA, Reg };
3039 return DAG.getMemIntrinsicNode(
3040 Opcode: PPCISD::TOC_ENTRY, dl, VTList: DAG.getVTList(VT1: VT, VT2: MVT::Other), Ops, MemVT: VT,
3041 PtrInfo: MachinePointerInfo::getGOT(MF&: DAG.getMachineFunction()), Alignment: std::nullopt,
3042 Flags: MachineMemOperand::MOLoad);
3043}
3044
3045SDValue PPCTargetLowering::LowerConstantPool(SDValue Op,
3046 SelectionDAG &DAG) const {
3047 EVT PtrVT = Op.getValueType();
3048 ConstantPoolSDNode *CP = cast<ConstantPoolSDNode>(Val&: Op);
3049 const Constant *C = CP->getConstVal();
3050
3051 // 64-bit SVR4 ABI and AIX ABI code are always position-independent.
3052 // The actual address of the constant pool entry is stored in the TOC.
3053 if (Subtarget.is64BitELFABI() || Subtarget.isAIXABI()) {
3054 if (Subtarget.isUsingPCRelativeCalls()) {
3055 SDLoc DL(CP);
3056 EVT Ty = getPointerTy(DL: DAG.getDataLayout());
3057 SDValue ConstPool = DAG.getTargetConstantPool(
3058 C, VT: Ty, Align: CP->getAlign(), Offset: CP->getOffset(), TargetFlags: PPCII::MO_PCREL_FLAG);
3059 return DAG.getNode(Opcode: PPCISD::MAT_PCREL_ADDR, DL, VT: Ty, Operand: ConstPool);
3060 }
3061 setUsesTOCBasePtr(DAG);
3062 SDValue GA = DAG.getTargetConstantPool(C, VT: PtrVT, Align: CP->getAlign(), Offset: 0);
3063 return getTOCEntry(DAG, dl: SDLoc(CP), GA);
3064 }
3065
3066 unsigned MOHiFlag, MOLoFlag;
3067 bool IsPIC = isPositionIndependent();
3068 getLabelAccessInfo(IsPIC, Subtarget, HiOpFlags&: MOHiFlag, LoOpFlags&: MOLoFlag);
3069
3070 if (IsPIC && Subtarget.isSVR4ABI()) {
3071 SDValue GA =
3072 DAG.getTargetConstantPool(C, VT: PtrVT, Align: CP->getAlign(), Offset: PPCII::MO_PIC_FLAG);
3073 return getTOCEntry(DAG, dl: SDLoc(CP), GA);
3074 }
3075
3076 SDValue CPIHi =
3077 DAG.getTargetConstantPool(C, VT: PtrVT, Align: CP->getAlign(), Offset: 0, TargetFlags: MOHiFlag);
3078 SDValue CPILo =
3079 DAG.getTargetConstantPool(C, VT: PtrVT, Align: CP->getAlign(), Offset: 0, TargetFlags: MOLoFlag);
3080 return LowerLabelRef(HiPart: CPIHi, LoPart: CPILo, isPIC: IsPIC, DAG);
3081}
3082
3083// For 64-bit PowerPC, prefer the more compact relative encodings.
3084// This trades 32 bits per jump table entry for one or two instructions
3085// at the jump site.
3086unsigned PPCTargetLowering::getJumpTableEncoding() const {
3087 if (isJumpTableRelative())
3088 return MachineJumpTableInfo::EK_LabelDifference32;
3089
3090 return TargetLowering::getJumpTableEncoding();
3091}
3092
3093bool PPCTargetLowering::isJumpTableRelative() const {
3094 if (UseAbsoluteJumpTables)
3095 return false;
3096 if (Subtarget.isPPC64() || Subtarget.isAIXABI())
3097 return true;
3098 return TargetLowering::isJumpTableRelative();
3099}
3100
3101SDValue PPCTargetLowering::getPICJumpTableRelocBase(SDValue Table,
3102 SelectionDAG &DAG) const {
3103 if (!Subtarget.isPPC64() || Subtarget.isAIXABI())
3104 return TargetLowering::getPICJumpTableRelocBase(Table, DAG);
3105
3106 switch (getTargetMachine().getCodeModel()) {
3107 case CodeModel::Small:
3108 case CodeModel::Medium:
3109 return TargetLowering::getPICJumpTableRelocBase(Table, DAG);
3110 default:
3111 return DAG.getNode(Opcode: PPCISD::GlobalBaseReg, DL: SDLoc(),
3112 VT: getPointerTy(DL: DAG.getDataLayout()));
3113 }
3114}
3115
3116const MCExpr *
3117PPCTargetLowering::getPICJumpTableRelocBaseExpr(const MachineFunction *MF,
3118 unsigned JTI,
3119 MCContext &Ctx) const {
3120 if (!Subtarget.isPPC64() || Subtarget.isAIXABI())
3121 return TargetLowering::getPICJumpTableRelocBaseExpr(MF, JTI, Ctx);
3122
3123 switch (getTargetMachine().getCodeModel()) {
3124 case CodeModel::Small:
3125 case CodeModel::Medium:
3126 return TargetLowering::getPICJumpTableRelocBaseExpr(MF, JTI, Ctx);
3127 default:
3128 return MCSymbolRefExpr::create(Symbol: MF->getPICBaseSymbol(), Ctx);
3129 }
3130}
3131
3132SDValue PPCTargetLowering::LowerJumpTable(SDValue Op, SelectionDAG &DAG) const {
3133 EVT PtrVT = Op.getValueType();
3134 JumpTableSDNode *JT = cast<JumpTableSDNode>(Val&: Op);
3135
3136 // isUsingPCRelativeCalls() returns true when PCRelative is enabled
3137 if (Subtarget.isUsingPCRelativeCalls()) {
3138 SDLoc DL(JT);
3139 EVT Ty = getPointerTy(DL: DAG.getDataLayout());
3140 SDValue GA =
3141 DAG.getTargetJumpTable(JTI: JT->getIndex(), VT: Ty, TargetFlags: PPCII::MO_PCREL_FLAG);
3142 SDValue MatAddr = DAG.getNode(Opcode: PPCISD::MAT_PCREL_ADDR, DL, VT: Ty, Operand: GA);
3143 return MatAddr;
3144 }
3145
3146 // 64-bit SVR4 ABI and AIX ABI code are always position-independent.
3147 // The actual address of the jump table is stored in the TOC.
3148 if (Subtarget.is64BitELFABI() || Subtarget.isAIXABI()) {
3149 setUsesTOCBasePtr(DAG);
3150 SDValue GA = DAG.getTargetJumpTable(JTI: JT->getIndex(), VT: PtrVT);
3151 return getTOCEntry(DAG, dl: SDLoc(JT), GA);
3152 }
3153
3154 unsigned MOHiFlag, MOLoFlag;
3155 bool IsPIC = isPositionIndependent();
3156 getLabelAccessInfo(IsPIC, Subtarget, HiOpFlags&: MOHiFlag, LoOpFlags&: MOLoFlag);
3157
3158 if (IsPIC && Subtarget.isSVR4ABI()) {
3159 SDValue GA = DAG.getTargetJumpTable(JTI: JT->getIndex(), VT: PtrVT,
3160 TargetFlags: PPCII::MO_PIC_FLAG);
3161 return getTOCEntry(DAG, dl: SDLoc(GA), GA);
3162 }
3163
3164 SDValue JTIHi = DAG.getTargetJumpTable(JTI: JT->getIndex(), VT: PtrVT, TargetFlags: MOHiFlag);
3165 SDValue JTILo = DAG.getTargetJumpTable(JTI: JT->getIndex(), VT: PtrVT, TargetFlags: MOLoFlag);
3166 return LowerLabelRef(HiPart: JTIHi, LoPart: JTILo, isPIC: IsPIC, DAG);
3167}
3168
3169SDValue PPCTargetLowering::LowerBlockAddress(SDValue Op,
3170 SelectionDAG &DAG) const {
3171 EVT PtrVT = Op.getValueType();
3172 BlockAddressSDNode *BASDN = cast<BlockAddressSDNode>(Val&: Op);
3173 const BlockAddress *BA = BASDN->getBlockAddress();
3174
3175 // isUsingPCRelativeCalls() returns true when PCRelative is enabled
3176 if (Subtarget.isUsingPCRelativeCalls()) {
3177 SDLoc DL(BASDN);
3178 EVT Ty = getPointerTy(DL: DAG.getDataLayout());
3179 SDValue GA = DAG.getTargetBlockAddress(BA, VT: Ty, Offset: BASDN->getOffset(),
3180 TargetFlags: PPCII::MO_PCREL_FLAG);
3181 SDValue MatAddr = DAG.getNode(Opcode: PPCISD::MAT_PCREL_ADDR, DL, VT: Ty, Operand: GA);
3182 return MatAddr;
3183 }
3184
3185 // 64-bit SVR4 ABI and AIX ABI code are always position-independent.
3186 // The actual BlockAddress is stored in the TOC.
3187 if (Subtarget.is64BitELFABI() || Subtarget.isAIXABI()) {
3188 setUsesTOCBasePtr(DAG);
3189 SDValue GA = DAG.getTargetBlockAddress(BA, VT: PtrVT, Offset: BASDN->getOffset());
3190 return getTOCEntry(DAG, dl: SDLoc(BASDN), GA);
3191 }
3192
3193 // 32-bit position-independent ELF stores the BlockAddress in the .got.
3194 if (Subtarget.is32BitELFABI() && isPositionIndependent())
3195 return getTOCEntry(
3196 DAG, dl: SDLoc(BASDN),
3197 GA: DAG.getTargetBlockAddress(BA, VT: PtrVT, Offset: BASDN->getOffset()));
3198
3199 unsigned MOHiFlag, MOLoFlag;
3200 bool IsPIC = isPositionIndependent();
3201 getLabelAccessInfo(IsPIC, Subtarget, HiOpFlags&: MOHiFlag, LoOpFlags&: MOLoFlag);
3202 SDValue TgtBAHi = DAG.getTargetBlockAddress(BA, VT: PtrVT, Offset: 0, TargetFlags: MOHiFlag);
3203 SDValue TgtBALo = DAG.getTargetBlockAddress(BA, VT: PtrVT, Offset: 0, TargetFlags: MOLoFlag);
3204 return LowerLabelRef(HiPart: TgtBAHi, LoPart: TgtBALo, isPIC: IsPIC, DAG);
3205}
3206
3207SDValue PPCTargetLowering::LowerGlobalTLSAddress(SDValue Op,
3208 SelectionDAG &DAG) const {
3209 if (Subtarget.isAIXABI())
3210 return LowerGlobalTLSAddressAIX(Op, DAG);
3211
3212 return LowerGlobalTLSAddressLinux(Op, DAG);
3213}
3214
3215/// updateForAIXShLibTLSModelOpt - Helper to initialize TLS model opt settings,
3216/// and then apply the update.
3217static void updateForAIXShLibTLSModelOpt(TLSModel::Model &Model,
3218 SelectionDAG &DAG,
3219 const TargetMachine &TM) {
3220 // Initialize TLS model opt setting lazily:
3221 // (1) Use initial-exec for single TLS var references within current function.
3222 // (2) Use local-dynamic for multiple TLS var references within current
3223 // function.
3224 PPCFunctionInfo *FuncInfo =
3225 DAG.getMachineFunction().getInfo<PPCFunctionInfo>();
3226 if (!FuncInfo->isAIXFuncTLSModelOptInitDone()) {
3227 SmallPtrSet<const GlobalValue *, 8> TLSGV;
3228 // Iterate over all instructions within the current function and collect all
3229 // TLS global variables (global variables passed as the first parameter to
3230 // Intrinsic::threadlocal_address).
3231 const Function &Func = DAG.getMachineFunction().getFunction();
3232 for (const BasicBlock &BB : Func)
3233 for (const Instruction &I : BB)
3234 if (I.getOpcode() == Instruction::Call)
3235 if (const CallInst *CI = dyn_cast<const CallInst>(Val: &I))
3236 if (Function *CF = CI->getCalledFunction())
3237 if (CF->isDeclaration() &&
3238 CF->getIntrinsicID() == Intrinsic::threadlocal_address)
3239 if (const GlobalValue *GV =
3240 dyn_cast<GlobalValue>(Val: I.getOperand(i: 0))) {
3241 TLSModel::Model GVModel = TM.getTLSModel(GV);
3242 if (GVModel == TLSModel::LocalDynamic)
3243 TLSGV.insert(Ptr: GV);
3244 }
3245
3246 unsigned TLSGVCnt = TLSGV.size();
3247 LLVM_DEBUG(dbgs() << format("LocalDynamic TLSGV count:%d\n", TLSGVCnt));
3248 if (TLSGVCnt <= PPCAIXTLSModelOptUseIEForLDLimit)
3249 FuncInfo->setAIXFuncUseTLSIEForLD();
3250 FuncInfo->setAIXFuncTLSModelOptInitDone();
3251 }
3252
3253 if (FuncInfo->isAIXFuncUseTLSIEForLD()) {
3254 LLVM_DEBUG(
3255 dbgs() << DAG.getMachineFunction().getName()
3256 << " function is using the TLS-IE model for TLS-LD access.\n");
3257 Model = TLSModel::InitialExec;
3258 }
3259}
3260
3261SDValue PPCTargetLowering::LowerGlobalTLSAddressAIX(SDValue Op,
3262 SelectionDAG &DAG) const {
3263 GlobalAddressSDNode *GA = cast<GlobalAddressSDNode>(Val&: Op);
3264
3265 if (DAG.getTarget().useEmulatedTLS())
3266 report_fatal_error(reason: "Emulated TLS is not yet supported on AIX");
3267
3268 SDLoc dl(GA);
3269 const GlobalValue *GV = GA->getGlobal();
3270 EVT PtrVT = getPointerTy(DL: DAG.getDataLayout());
3271 bool Is64Bit = Subtarget.isPPC64();
3272 TLSModel::Model Model = getTargetMachine().getTLSModel(GV);
3273
3274 // Apply update to the TLS model.
3275 if (Subtarget.hasAIXShLibTLSModelOpt())
3276 updateForAIXShLibTLSModelOpt(Model, DAG, TM: getTargetMachine());
3277
3278 // TLS variables are accessed through TOC entries.
3279 // To support this, set the DAG to use the TOC base pointer.
3280 setUsesTOCBasePtr(DAG);
3281
3282 bool IsTLSLocalExecModel = Model == TLSModel::LocalExec;
3283
3284 if (IsTLSLocalExecModel || Model == TLSModel::InitialExec) {
3285 bool HasAIXSmallLocalExecTLS = Subtarget.hasAIXSmallLocalExecTLS();
3286 bool HasAIXSmallTLSGlobalAttr = false;
3287 SDValue VariableOffsetTGA =
3288 DAG.getTargetGlobalAddress(GV, DL: dl, VT: PtrVT, offset: 0, TargetFlags: PPCII::MO_TPREL_FLAG);
3289 SDValue VariableOffset = getTOCEntry(DAG, dl, GA: VariableOffsetTGA);
3290 SDValue TLSReg;
3291
3292 if (const GlobalVariable *GVar = dyn_cast<GlobalVariable>(Val: GV))
3293 if (GVar->hasAttribute(Kind: "aix-small-tls"))
3294 HasAIXSmallTLSGlobalAttr = true;
3295
3296 if (Is64Bit) {
3297 // For local-exec and initial-exec on AIX (64-bit), the sequence generated
3298 // involves a load of the variable offset (from the TOC), followed by an
3299 // add of the loaded variable offset to R13 (the thread pointer).
3300 // This code sequence looks like:
3301 // ld reg1,var[TC](2)
3302 // add reg2, reg1, r13 // r13 contains the thread pointer
3303 TLSReg = DAG.getRegister(Reg: PPC::X13, VT: MVT::i64);
3304
3305 // With the -maix-small-local-exec-tls option, or with the "aix-small-tls"
3306 // global variable attribute, produce a faster access sequence for
3307 // local-exec TLS variables where the offset from the TLS base is encoded
3308 // as an immediate operand.
3309 //
3310 // We only utilize the faster local-exec access sequence when the TLS
3311 // variable has a size within the policy limit. We treat types that are
3312 // not sized or are empty as being over the policy size limit.
3313 if ((HasAIXSmallLocalExecTLS || HasAIXSmallTLSGlobalAttr) &&
3314 IsTLSLocalExecModel) {
3315 Type *GVType = GV->getValueType();
3316 if (GVType->isSized() && !GVType->isEmptyTy() &&
3317 GV->getDataLayout().getTypeAllocSize(Ty: GVType) <=
3318 AIXSmallTlsPolicySizeLimit)
3319 return DAG.getNode(Opcode: PPCISD::Lo, DL: dl, VT: PtrVT, N1: VariableOffsetTGA, N2: TLSReg);
3320 }
3321 } else {
3322 // For local-exec and initial-exec on AIX (32-bit), the sequence generated
3323 // involves loading the variable offset from the TOC, generating a call to
3324 // .__get_tpointer to get the thread pointer (which will be in R3), and
3325 // adding the two together:
3326 // lwz reg1,var[TC](2)
3327 // bla .__get_tpointer
3328 // add reg2, reg1, r3
3329 TLSReg = DAG.getNode(Opcode: PPCISD::GET_TPOINTER, DL: dl, VT: PtrVT);
3330
3331 // We do not implement the 32-bit version of the faster access sequence
3332 // for local-exec that is controlled by the -maix-small-local-exec-tls
3333 // option, or the "aix-small-tls" global variable attribute.
3334 if (HasAIXSmallLocalExecTLS || HasAIXSmallTLSGlobalAttr)
3335 report_fatal_error(reason: "The small-local-exec TLS access sequence is "
3336 "currently only supported on AIX (64-bit mode).");
3337 }
3338 return DAG.getNode(Opcode: PPCISD::ADD_TLS, DL: dl, VT: PtrVT, N1: TLSReg, N2: VariableOffset);
3339 }
3340
3341 if (Model == TLSModel::LocalDynamic) {
3342 bool HasAIXSmallLocalDynamicTLS = Subtarget.hasAIXSmallLocalDynamicTLS();
3343
3344 // We do not implement the 32-bit version of the faster access sequence
3345 // for local-dynamic that is controlled by -maix-small-local-dynamic-tls.
3346 if (!Is64Bit && HasAIXSmallLocalDynamicTLS)
3347 report_fatal_error(reason: "The small-local-dynamic TLS access sequence is "
3348 "currently only supported on AIX (64-bit mode).");
3349
3350 // For local-dynamic on AIX, we need to generate one TOC entry for each
3351 // variable offset, and a single module-handle TOC entry for the entire
3352 // file.
3353
3354 SDValue VariableOffsetTGA =
3355 DAG.getTargetGlobalAddress(GV, DL: dl, VT: PtrVT, offset: 0, TargetFlags: PPCII::MO_TLSLD_FLAG);
3356 SDValue VariableOffset = getTOCEntry(DAG, dl, GA: VariableOffsetTGA);
3357
3358 Module *M = DAG.getMachineFunction().getFunction().getParent();
3359 GlobalVariable *TLSGV =
3360 dyn_cast_or_null<GlobalVariable>(Val: M->getOrInsertGlobal(
3361 Name: StringRef("_$TLSML"), Ty: PointerType::getUnqual(C&: *DAG.getContext())));
3362 assert(TLSGV && "Not able to create GV for _$TLSML.");
3363 TLSGV->setThreadLocalMode(GlobalVariable::LocalDynamicTLSModel);
3364 SDValue ModuleHandleTGA =
3365 DAG.getTargetGlobalAddress(GV: TLSGV, DL: dl, VT: PtrVT, offset: 0, TargetFlags: PPCII::MO_TLSLDM_FLAG);
3366 SDValue ModuleHandleTOC = getTOCEntry(DAG, dl, GA: ModuleHandleTGA);
3367 SDValue ModuleHandle =
3368 DAG.getNode(Opcode: PPCISD::TLSLD_AIX, DL: dl, VT: PtrVT, Operand: ModuleHandleTOC);
3369
3370 // With the -maix-small-local-dynamic-tls option, produce a faster access
3371 // sequence for local-dynamic TLS variables where the offset from the
3372 // module-handle is encoded as an immediate operand.
3373 //
3374 // We only utilize the faster local-dynamic access sequence when the TLS
3375 // variable has a size within the policy limit. We treat types that are
3376 // not sized or are empty as being over the policy size limit.
3377 if (HasAIXSmallLocalDynamicTLS) {
3378 Type *GVType = GV->getValueType();
3379 if (GVType->isSized() && !GVType->isEmptyTy() &&
3380 GV->getDataLayout().getTypeAllocSize(Ty: GVType) <=
3381 AIXSmallTlsPolicySizeLimit)
3382 return DAG.getNode(Opcode: PPCISD::Lo, DL: dl, VT: PtrVT, N1: VariableOffsetTGA,
3383 N2: ModuleHandle);
3384 }
3385
3386 return DAG.getNode(Opcode: ISD::ADD, DL: dl, VT: PtrVT, N1: ModuleHandle, N2: VariableOffset);
3387 }
3388
3389 // If Local- or Initial-exec or Local-dynamic is not possible or specified,
3390 // all GlobalTLSAddress nodes are lowered using the general-dynamic model. We
3391 // need to generate two TOC entries, one for the variable offset, one for the
3392 // region handle. The global address for the TOC entry of the region handle is
3393 // created with the MO_TLSGDM_FLAG flag and the global address for the TOC
3394 // entry of the variable offset is created with MO_TLSGD_FLAG.
3395 SDValue VariableOffsetTGA =
3396 DAG.getTargetGlobalAddress(GV, DL: dl, VT: PtrVT, offset: 0, TargetFlags: PPCII::MO_TLSGD_FLAG);
3397 SDValue RegionHandleTGA =
3398 DAG.getTargetGlobalAddress(GV, DL: dl, VT: PtrVT, offset: 0, TargetFlags: PPCII::MO_TLSGDM_FLAG);
3399 SDValue VariableOffset = getTOCEntry(DAG, dl, GA: VariableOffsetTGA);
3400 SDValue RegionHandle = getTOCEntry(DAG, dl, GA: RegionHandleTGA);
3401 return DAG.getNode(Opcode: PPCISD::TLSGD_AIX, DL: dl, VT: PtrVT, N1: VariableOffset,
3402 N2: RegionHandle);
3403}
3404
3405SDValue PPCTargetLowering::LowerGlobalTLSAddressLinux(SDValue Op,
3406 SelectionDAG &DAG) const {
3407 // FIXME: TLS addresses currently use medium model code sequences,
3408 // which is the most useful form. Eventually support for small and
3409 // large models could be added if users need it, at the cost of
3410 // additional complexity.
3411 GlobalAddressSDNode *GA = cast<GlobalAddressSDNode>(Val&: Op);
3412 if (DAG.getTarget().useEmulatedTLS())
3413 return LowerToTLSEmulatedModel(GA, DAG);
3414
3415 SDLoc dl(GA);
3416 const GlobalValue *GV = GA->getGlobal();
3417 EVT PtrVT = getPointerTy(DL: DAG.getDataLayout());
3418 bool is64bit = Subtarget.isPPC64();
3419 const Module *M = DAG.getMachineFunction().getFunction().getParent();
3420 PICLevel::Level picLevel = M->getPICLevel();
3421
3422 const TargetMachine &TM = getTargetMachine();
3423 TLSModel::Model Model = TM.getTLSModel(GV);
3424
3425 if (Model == TLSModel::LocalExec) {
3426 if (Subtarget.isUsingPCRelativeCalls()) {
3427 SDValue TLSReg = DAG.getRegister(Reg: PPC::X13, VT: MVT::i64);
3428 SDValue TGA = DAG.getTargetGlobalAddress(GV, DL: dl, VT: PtrVT, offset: 0,
3429 TargetFlags: PPCII::MO_TPREL_PCREL_FLAG);
3430 SDValue MatAddr =
3431 DAG.getNode(Opcode: PPCISD::TLS_LOCAL_EXEC_MAT_ADDR, DL: dl, VT: PtrVT, Operand: TGA);
3432 return DAG.getNode(Opcode: PPCISD::ADD_TLS, DL: dl, VT: PtrVT, N1: TLSReg, N2: MatAddr);
3433 }
3434
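    // Without PC-relative addressing, this is expected to lower to the usual
    // thread-pointer-relative pair, roughly (with r13 as the 64-bit thread
    // pointer and r2 on 32-bit):
    //   addis reg, r13, var@tprel@ha
    //   addi  reg, reg, var@tprel@l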
3435 SDValue TGAHi = DAG.getTargetGlobalAddress(GV, DL: dl, VT: PtrVT, offset: 0,
3436 TargetFlags: PPCII::MO_TPREL_HA);
3437 SDValue TGALo = DAG.getTargetGlobalAddress(GV, DL: dl, VT: PtrVT, offset: 0,
3438 TargetFlags: PPCII::MO_TPREL_LO);
3439 SDValue TLSReg = is64bit ? DAG.getRegister(Reg: PPC::X13, VT: MVT::i64)
3440 : DAG.getRegister(Reg: PPC::R2, VT: MVT::i32);
3441
3442 SDValue Hi = DAG.getNode(Opcode: PPCISD::Hi, DL: dl, VT: PtrVT, N1: TGAHi, N2: TLSReg);
3443 return DAG.getNode(Opcode: PPCISD::Lo, DL: dl, VT: PtrVT, N1: TGALo, N2: Hi);
3444 }
3445
3446 if (Model == TLSModel::InitialExec) {
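    // In the non-PC-relative case this corresponds to loading the
    // thread-pointer offset from the GOT and adding it to the thread pointer,
    // roughly (64-bit, illustrative):
    //   addis reg, r2, var@got@tprel@ha
    //   ld    reg, var@got@tprel@l(reg)
    //   add   reg, reg, var@tls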
3447 bool IsPCRel = Subtarget.isUsingPCRelativeCalls();
3448 SDValue TGA = DAG.getTargetGlobalAddress(
3449 GV, DL: dl, VT: PtrVT, offset: 0, TargetFlags: IsPCRel ? PPCII::MO_GOT_TPREL_PCREL_FLAG : 0);
3450 SDValue TGATLS = DAG.getTargetGlobalAddress(
3451 GV, DL: dl, VT: PtrVT, offset: 0, TargetFlags: IsPCRel ? PPCII::MO_TLS_PCREL_FLAG : PPCII::MO_TLS);
3452 SDValue TPOffset;
3453 if (IsPCRel) {
3454 SDValue MatPCRel = DAG.getNode(Opcode: PPCISD::MAT_PCREL_ADDR, DL: dl, VT: PtrVT, Operand: TGA);
3455 TPOffset = DAG.getLoad(VT: MVT::i64, dl, Chain: DAG.getEntryNode(), Ptr: MatPCRel,
3456 PtrInfo: MachinePointerInfo());
3457 } else {
3458 SDValue GOTPtr;
3459 if (is64bit) {
3460 setUsesTOCBasePtr(DAG);
3461 SDValue GOTReg = DAG.getRegister(Reg: PPC::X2, VT: MVT::i64);
3462 GOTPtr =
3463 DAG.getNode(Opcode: PPCISD::ADDIS_GOT_TPREL_HA, DL: dl, VT: PtrVT, N1: GOTReg, N2: TGA);
3464 } else {
3465 if (!TM.isPositionIndependent())
3466 GOTPtr = DAG.getNode(Opcode: PPCISD::PPC32_GOT, DL: dl, VT: PtrVT);
3467 else if (picLevel == PICLevel::SmallPIC)
3468 GOTPtr = DAG.getNode(Opcode: PPCISD::GlobalBaseReg, DL: dl, VT: PtrVT);
3469 else
3470 GOTPtr = DAG.getNode(Opcode: PPCISD::PPC32_PICGOT, DL: dl, VT: PtrVT);
3471 }
3472 TPOffset = DAG.getNode(Opcode: PPCISD::LD_GOT_TPREL_L, DL: dl, VT: PtrVT, N1: TGA, N2: GOTPtr);
3473 }
3474 return DAG.getNode(Opcode: PPCISD::ADD_TLS, DL: dl, VT: PtrVT, N1: TPOffset, N2: TGATLS);
3475 }
3476
3477 if (Model == TLSModel::GeneralDynamic) {
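    // In the non-PC-relative case this becomes a call to __tls_get_addr,
    // roughly (64-bit, illustrative):
    //   addis r3, r2, var@got@tlsgd@ha
    //   addi  r3, r3, var@got@tlsgd@l
    //   bl    __tls_get_addr(var@tlsgd)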
3478 if (Subtarget.isUsingPCRelativeCalls()) {
3479 SDValue TGA = DAG.getTargetGlobalAddress(GV, DL: dl, VT: PtrVT, offset: 0,
3480 TargetFlags: PPCII::MO_GOT_TLSGD_PCREL_FLAG);
3481 return DAG.getNode(Opcode: PPCISD::TLS_DYNAMIC_MAT_PCREL_ADDR, DL: dl, VT: PtrVT, Operand: TGA);
3482 }
3483
3484 SDValue TGA = DAG.getTargetGlobalAddress(GV, DL: dl, VT: PtrVT, offset: 0, TargetFlags: 0);
3485 SDValue GOTPtr;
3486 if (is64bit) {
3487 setUsesTOCBasePtr(DAG);
3488 SDValue GOTReg = DAG.getRegister(Reg: PPC::X2, VT: MVT::i64);
3489 GOTPtr = DAG.getNode(Opcode: PPCISD::ADDIS_TLSGD_HA, DL: dl, VT: PtrVT,
3490 N1: GOTReg, N2: TGA);
3491 } else {
3492 if (picLevel == PICLevel::SmallPIC)
3493 GOTPtr = DAG.getNode(Opcode: PPCISD::GlobalBaseReg, DL: dl, VT: PtrVT);
3494 else
3495 GOTPtr = DAG.getNode(Opcode: PPCISD::PPC32_PICGOT, DL: dl, VT: PtrVT);
3496 }
3497 return DAG.getNode(Opcode: PPCISD::ADDI_TLSGD_L_ADDR, DL: dl, VT: PtrVT,
3498 N1: GOTPtr, N2: TGA, N3: TGA);
3499 }
3500
3501 if (Model == TLSModel::LocalDynamic) {
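    // In the non-PC-relative case this becomes a call to __tls_get_addr for
    // the module base followed by a DTPREL offset, roughly (64-bit,
    // illustrative):
    //   addis r3, r2, var@got@tlsld@ha
    //   addi  r3, r3, var@got@tlsld@l
    //   bl    __tls_get_addr(var@tlsld)
    //   addis reg, r3, var@dtprel@ha
    //   addi  reg, reg, var@dtprel@l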
3502 if (Subtarget.isUsingPCRelativeCalls()) {
3503 SDValue TGA = DAG.getTargetGlobalAddress(GV, DL: dl, VT: PtrVT, offset: 0,
3504 TargetFlags: PPCII::MO_GOT_TLSLD_PCREL_FLAG);
3505 SDValue MatPCRel =
3506 DAG.getNode(Opcode: PPCISD::TLS_DYNAMIC_MAT_PCREL_ADDR, DL: dl, VT: PtrVT, Operand: TGA);
3507 return DAG.getNode(Opcode: PPCISD::PADDI_DTPREL, DL: dl, VT: PtrVT, N1: MatPCRel, N2: TGA);
3508 }
3509
3510 SDValue TGA = DAG.getTargetGlobalAddress(GV, DL: dl, VT: PtrVT, offset: 0, TargetFlags: 0);
3511 SDValue GOTPtr;
3512 if (is64bit) {
3513 setUsesTOCBasePtr(DAG);
3514 SDValue GOTReg = DAG.getRegister(Reg: PPC::X2, VT: MVT::i64);
3515 GOTPtr = DAG.getNode(Opcode: PPCISD::ADDIS_TLSLD_HA, DL: dl, VT: PtrVT,
3516 N1: GOTReg, N2: TGA);
3517 } else {
3518 if (picLevel == PICLevel::SmallPIC)
3519 GOTPtr = DAG.getNode(Opcode: PPCISD::GlobalBaseReg, DL: dl, VT: PtrVT);
3520 else
3521 GOTPtr = DAG.getNode(Opcode: PPCISD::PPC32_PICGOT, DL: dl, VT: PtrVT);
3522 }
3523 SDValue TLSAddr = DAG.getNode(Opcode: PPCISD::ADDI_TLSLD_L_ADDR, DL: dl,
3524 VT: PtrVT, N1: GOTPtr, N2: TGA, N3: TGA);
3525 SDValue DtvOffsetHi = DAG.getNode(Opcode: PPCISD::ADDIS_DTPREL_HA, DL: dl,
3526 VT: PtrVT, N1: TLSAddr, N2: TGA);
3527 return DAG.getNode(Opcode: PPCISD::ADDI_DTPREL_L, DL: dl, VT: PtrVT, N1: DtvOffsetHi, N2: TGA);
3528 }
3529
3530 llvm_unreachable("Unknown TLS model!");
3531}
3532
3533SDValue PPCTargetLowering::LowerGlobalAddress(SDValue Op,
3534 SelectionDAG &DAG) const {
3535 EVT PtrVT = Op.getValueType();
3536 GlobalAddressSDNode *GSDN = cast<GlobalAddressSDNode>(Val&: Op);
3537 SDLoc DL(GSDN);
3538 const GlobalValue *GV = GSDN->getGlobal();
3539
3540 // 64-bit SVR4 ABI & AIX ABI code is always position-independent.
3541 // The actual address of the GlobalValue is stored in the TOC.
3542 if (Subtarget.is64BitELFABI() || Subtarget.isAIXABI()) {
3543 if (Subtarget.isUsingPCRelativeCalls()) {
3544 EVT Ty = getPointerTy(DL: DAG.getDataLayout());
3545 if (isAccessedAsGotIndirect(N: Op)) {
3546 SDValue GA = DAG.getTargetGlobalAddress(GV, DL, VT: Ty, offset: GSDN->getOffset(),
3547 TargetFlags: PPCII::MO_GOT_PCREL_FLAG);
3548 SDValue MatPCRel = DAG.getNode(Opcode: PPCISD::MAT_PCREL_ADDR, DL, VT: Ty, Operand: GA);
3549 SDValue Load = DAG.getLoad(VT: MVT::i64, dl: DL, Chain: DAG.getEntryNode(), Ptr: MatPCRel,
3550 PtrInfo: MachinePointerInfo());
3551 return Load;
3552 } else {
3553 SDValue GA = DAG.getTargetGlobalAddress(GV, DL, VT: Ty, offset: GSDN->getOffset(),
3554 TargetFlags: PPCII::MO_PCREL_FLAG);
3555 return DAG.getNode(Opcode: PPCISD::MAT_PCREL_ADDR, DL, VT: Ty, Operand: GA);
3556 }
3557 }
3558 setUsesTOCBasePtr(DAG);
3559 SDValue GA = DAG.getTargetGlobalAddress(GV, DL, VT: PtrVT, offset: GSDN->getOffset());
3560 return getTOCEntry(DAG, dl: DL, GA);
3561 }
3562
3563 unsigned MOHiFlag, MOLoFlag;
3564 bool IsPIC = isPositionIndependent();
3565 getLabelAccessInfo(IsPIC, Subtarget, HiOpFlags&: MOHiFlag, LoOpFlags&: MOLoFlag, GV);
3566
3567 if (IsPIC && Subtarget.isSVR4ABI()) {
3568 SDValue GA = DAG.getTargetGlobalAddress(GV, DL, VT: PtrVT,
3569 offset: GSDN->getOffset(),
3570 TargetFlags: PPCII::MO_PIC_FLAG);
3571 return getTOCEntry(DAG, dl: DL, GA);
3572 }
3573
3574 SDValue GAHi =
3575 DAG.getTargetGlobalAddress(GV, DL, VT: PtrVT, offset: GSDN->getOffset(), TargetFlags: MOHiFlag);
3576 SDValue GALo =
3577 DAG.getTargetGlobalAddress(GV, DL, VT: PtrVT, offset: GSDN->getOffset(), TargetFlags: MOLoFlag);
3578
3579 return LowerLabelRef(HiPart: GAHi, LoPart: GALo, isPIC: IsPIC, DAG);
3580}
3581
3582SDValue PPCTargetLowering::LowerSETCC(SDValue Op, SelectionDAG &DAG) const {
3583 bool IsStrict = Op->isStrictFPOpcode();
3584 ISD::CondCode CC =
3585 cast<CondCodeSDNode>(Val: Op.getOperand(i: IsStrict ? 3 : 2))->get();
3586 SDValue LHS = Op.getOperand(i: IsStrict ? 1 : 0);
3587 SDValue RHS = Op.getOperand(i: IsStrict ? 2 : 1);
3588 SDValue Chain = IsStrict ? Op.getOperand(i: 0) : SDValue();
3589 EVT LHSVT = LHS.getValueType();
3590 SDLoc dl(Op);
3591
3592 // Soften the setcc with a libcall if it is fp128.
3593 if (LHSVT == MVT::f128) {
3594 assert(!Subtarget.hasP9Vector() &&
3595 "SETCC for f128 is already legal under Power9!");
3596 softenSetCCOperands(DAG, VT: LHSVT, NewLHS&: LHS, NewRHS&: RHS, CCCode&: CC, DL: dl, OldLHS: LHS, OldRHS: RHS, Chain,
3597 IsSignaling: Op->getOpcode() == ISD::STRICT_FSETCCS);
3598 if (RHS.getNode())
3599 LHS = DAG.getNode(Opcode: ISD::SETCC, DL: dl, VT: Op.getValueType(), N1: LHS, N2: RHS,
3600 N3: DAG.getCondCode(Cond: CC));
3601 if (IsStrict)
3602 return DAG.getMergeValues(Ops: {LHS, Chain}, dl);
3603 return LHS;
3604 }
3605
3606 assert(!IsStrict && "Don't know how to handle STRICT_FSETCC!");
3607
3608 if (Op.getValueType() == MVT::v2i64) {
3609 // When the operands themselves are v2i64 values, we need to do something
3610 // special because VSX has no underlying comparison operations for these.
3611 if (LHS.getValueType() == MVT::v2i64) {
3612 // Equality can be handled by casting to the legal type for Altivec
3613 // comparisons, everything else needs to be expanded.
3614 if (CC != ISD::SETEQ && CC != ISD::SETNE)
3615 return SDValue();
3616 SDValue SetCC32 = DAG.getSetCC(
3617 DL: dl, VT: MVT::v4i32, LHS: DAG.getNode(Opcode: ISD::BITCAST, DL: dl, VT: MVT::v4i32, Operand: LHS),
3618 RHS: DAG.getNode(Opcode: ISD::BITCAST, DL: dl, VT: MVT::v4i32, Operand: RHS), Cond: CC);
3619 int ShuffV[] = {1, 0, 3, 2};
3620 SDValue Shuff =
3621 DAG.getVectorShuffle(VT: MVT::v4i32, dl, N1: SetCC32, N2: SetCC32, Mask: ShuffV);
3622 return DAG.getBitcast(VT: MVT::v2i64,
3623 V: DAG.getNode(Opcode: CC == ISD::SETEQ ? ISD::AND : ISD::OR,
3624 DL: dl, VT: MVT::v4i32, N1: Shuff, N2: SetCC32));
3625 }
3626
3627 // We handle most of these in the usual way.
3628 return Op;
3629 }
3630
3631 // If we're comparing for equality to zero, expose the fact that this is
3632 // implemented as a ctlz/srl pair on ppc, so that the dag combiner can
3633 // fold the new nodes.
3634 if (SDValue V = lowerCmpEqZeroToCtlzSrl(Op, DAG))
3635 return V;
3636
3637 if (ConstantSDNode *C = dyn_cast<ConstantSDNode>(Val&: RHS)) {
3638 // Leave comparisons against 0 and -1 alone for now, since they're usually
3639 // optimized. FIXME: revisit this when we can custom lower all setcc
3640 // optimizations.
3641 if (C->isAllOnes() || C->isZero())
3642 return SDValue();
3643 }
3644
3645 // If we have an integer seteq/setne, turn it into a compare against zero
3646 // by xor'ing the rhs with the lhs, which is faster than setting a
3647 // condition register, reading it back out, and masking the correct bit. The
3648 // normal approach here uses sub to do this instead of xor. Using xor exposes
3649 // the result to other bit-twiddling opportunities.
3650 if (LHSVT.isInteger() && (CC == ISD::SETEQ || CC == ISD::SETNE)) {
3651 EVT VT = Op.getValueType();
3652 SDValue Sub = DAG.getNode(Opcode: ISD::XOR, DL: dl, VT: LHSVT, N1: LHS, N2: RHS);
3653 return DAG.getSetCC(DL: dl, VT, LHS: Sub, RHS: DAG.getConstant(Val: 0, DL: dl, VT: LHSVT), Cond: CC);
3654 }
3655 return SDValue();
3656}
3657
3658SDValue PPCTargetLowering::LowerVAARG(SDValue Op, SelectionDAG &DAG) const {
3659 SDNode *Node = Op.getNode();
3660 EVT VT = Node->getValueType(ResNo: 0);
3661 EVT PtrVT = getPointerTy(DL: DAG.getDataLayout());
3662 SDValue InChain = Node->getOperand(Num: 0);
3663 SDValue VAListPtr = Node->getOperand(Num: 1);
3664 const Value *SV = cast<SrcValueSDNode>(Val: Node->getOperand(Num: 2))->getValue();
3665 SDLoc dl(Node);
3666
3667 assert(!Subtarget.isPPC64() && "LowerVAARG is PPC32 only");
3668
3669 // gpr_index
3670 SDValue GprIndex = DAG.getExtLoad(ExtType: ISD::ZEXTLOAD, dl, VT: MVT::i32, Chain: InChain,
3671 Ptr: VAListPtr, PtrInfo: MachinePointerInfo(SV), MemVT: MVT::i8);
3672 InChain = GprIndex.getValue(R: 1);
3673
3674 if (VT == MVT::i64) {
3675 // Check if GprIndex is even
3676 SDValue GprAnd = DAG.getNode(Opcode: ISD::AND, DL: dl, VT: MVT::i32, N1: GprIndex,
3677 N2: DAG.getConstant(Val: 1, DL: dl, VT: MVT::i32));
3678 SDValue CC64 = DAG.getSetCC(DL: dl, VT: MVT::i32, LHS: GprAnd,
3679 RHS: DAG.getConstant(Val: 0, DL: dl, VT: MVT::i32), Cond: ISD::SETNE);
3680 SDValue GprIndexPlusOne = DAG.getNode(Opcode: ISD::ADD, DL: dl, VT: MVT::i32, N1: GprIndex,
3681 N2: DAG.getConstant(Val: 1, DL: dl, VT: MVT::i32));
3682 // Align GprIndex to be even if it isn't
3683 GprIndex = DAG.getNode(Opcode: ISD::SELECT, DL: dl, VT: MVT::i32, N1: CC64, N2: GprIndexPlusOne,
3684 N3: GprIndex);
3685 }
3686
3687 // fpr index is 1 byte after gpr
3688 SDValue FprPtr = DAG.getNode(Opcode: ISD::ADD, DL: dl, VT: PtrVT, N1: VAListPtr,
3689 N2: DAG.getConstant(Val: 1, DL: dl, VT: MVT::i32));
3690
3691 // fpr
3692 SDValue FprIndex = DAG.getExtLoad(ExtType: ISD::ZEXTLOAD, dl, VT: MVT::i32, Chain: InChain,
3693 Ptr: FprPtr, PtrInfo: MachinePointerInfo(SV), MemVT: MVT::i8);
3694 InChain = FprIndex.getValue(R: 1);
3695
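  // Field offsets within the 32-bit SVR4 va_list (see the struct layout in
  // LowerVASTART): gpr index at byte 0, fpr index at byte 1, the
  // overflow_arg_area pointer at byte 4 and the reg_save_area pointer at
  // byte 8.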
3696 SDValue RegSaveAreaPtr = DAG.getNode(Opcode: ISD::ADD, DL: dl, VT: PtrVT, N1: VAListPtr,
3697 N2: DAG.getConstant(Val: 8, DL: dl, VT: MVT::i32));
3698
3699 SDValue OverflowAreaPtr = DAG.getNode(Opcode: ISD::ADD, DL: dl, VT: PtrVT, N1: VAListPtr,
3700 N2: DAG.getConstant(Val: 4, DL: dl, VT: MVT::i32));
3701
3702 // areas
3703 SDValue OverflowArea =
3704 DAG.getLoad(VT: MVT::i32, dl, Chain: InChain, Ptr: OverflowAreaPtr, PtrInfo: MachinePointerInfo());
3705 InChain = OverflowArea.getValue(R: 1);
3706
3707 SDValue RegSaveArea =
3708 DAG.getLoad(VT: MVT::i32, dl, Chain: InChain, Ptr: RegSaveAreaPtr, PtrInfo: MachinePointerInfo());
3709 InChain = RegSaveArea.getValue(R: 1);
3710
  // CC is true while the register save area still has room (index < 8);
  // select the overflow area once it is exhausted.
3712 SDValue CC = DAG.getSetCC(DL: dl, VT: MVT::i32, LHS: VT.isInteger() ? GprIndex : FprIndex,
3713 RHS: DAG.getConstant(Val: 8, DL: dl, VT: MVT::i32), Cond: ISD::SETLT);
3714
3715 // adjustment constant gpr_index * 4/8
3716 SDValue RegConstant = DAG.getNode(Opcode: ISD::MUL, DL: dl, VT: MVT::i32,
3717 N1: VT.isInteger() ? GprIndex : FprIndex,
3718 N2: DAG.getConstant(Val: VT.isInteger() ? 4 : 8, DL: dl,
3719 VT: MVT::i32));
3720
3721 // OurReg = RegSaveArea + RegConstant
3722 SDValue OurReg = DAG.getNode(Opcode: ISD::ADD, DL: dl, VT: PtrVT, N1: RegSaveArea,
3723 N2: RegConstant);
3724
3725 // Floating types are 32 bytes into RegSaveArea
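  // (r3-r10 occupy the first 8 x 4 = 32 bytes of the register save area;
  // the f1-f8 slots follow them.)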
3726 if (VT.isFloatingPoint())
3727 OurReg = DAG.getNode(Opcode: ISD::ADD, DL: dl, VT: PtrVT, N1: OurReg,
3728 N2: DAG.getConstant(Val: 32, DL: dl, VT: MVT::i32));
3729
3730 // increase {f,g}pr_index by 1 (or 2 if VT is i64)
3731 SDValue IndexPlus1 = DAG.getNode(Opcode: ISD::ADD, DL: dl, VT: MVT::i32,
3732 N1: VT.isInteger() ? GprIndex : FprIndex,
3733 N2: DAG.getConstant(Val: VT == MVT::i64 ? 2 : 1, DL: dl,
3734 VT: MVT::i32));
3735
3736 InChain = DAG.getTruncStore(Chain: InChain, dl, Val: IndexPlus1,
3737 Ptr: VT.isInteger() ? VAListPtr : FprPtr,
3738 PtrInfo: MachinePointerInfo(SV), SVT: MVT::i8);
3739
3740 // determine if we should load from reg_save_area or overflow_area
3741 SDValue Result = DAG.getNode(Opcode: ISD::SELECT, DL: dl, VT: PtrVT, N1: CC, N2: OurReg, N3: OverflowArea);
3742
  // Advance overflow_area by 4/8 when the register save area is exhausted.
3744 SDValue OverflowAreaPlusN = DAG.getNode(Opcode: ISD::ADD, DL: dl, VT: PtrVT, N1: OverflowArea,
3745 N2: DAG.getConstant(Val: VT.isInteger() ? 4 : 8,
3746 DL: dl, VT: MVT::i32));
3747
3748 OverflowArea = DAG.getNode(Opcode: ISD::SELECT, DL: dl, VT: MVT::i32, N1: CC, N2: OverflowArea,
3749 N3: OverflowAreaPlusN);
3750
3751 InChain = DAG.getTruncStore(Chain: InChain, dl, Val: OverflowArea, Ptr: OverflowAreaPtr,
3752 PtrInfo: MachinePointerInfo(), SVT: MVT::i32);
3753
3754 return DAG.getLoad(VT, dl, Chain: InChain, Ptr: Result, PtrInfo: MachinePointerInfo());
3755}
3756
3757SDValue PPCTargetLowering::LowerVACOPY(SDValue Op, SelectionDAG &DAG) const {
3758 assert(!Subtarget.isPPC64() && "LowerVACOPY is PPC32 only");
3759
3760 // We have to copy the entire va_list struct:
  // 2*sizeof(char) + 2 bytes of padding + 2*sizeof(char*) = 12 bytes
3762 return DAG.getMemcpy(Chain: Op.getOperand(i: 0), dl: Op, Dst: Op.getOperand(i: 1), Src: Op.getOperand(i: 2),
3763 Size: DAG.getConstant(Val: 12, DL: SDLoc(Op), VT: MVT::i32), Alignment: Align(8),
3764 isVol: false, AlwaysInline: true, /*CI=*/nullptr, OverrideTailCall: std::nullopt,
3765 DstPtrInfo: MachinePointerInfo(), SrcPtrInfo: MachinePointerInfo());
3766}
3767
3768SDValue PPCTargetLowering::LowerADJUST_TRAMPOLINE(SDValue Op,
3769 SelectionDAG &DAG) const {
3770 return Op.getOperand(i: 0);
3771}
3772
3773SDValue PPCTargetLowering::LowerINLINEASM(SDValue Op, SelectionDAG &DAG) const {
3774 MachineFunction &MF = DAG.getMachineFunction();
3775 PPCFunctionInfo &MFI = *MF.getInfo<PPCFunctionInfo>();
3776
3777 assert((Op.getOpcode() == ISD::INLINEASM ||
3778 Op.getOpcode() == ISD::INLINEASM_BR) &&
3779 "Expecting Inline ASM node.");
3780
  // If an LR store is already known to be required then there is no point in
  // checking this ASM as well.
3783 if (MFI.isLRStoreRequired())
3784 return Op;
3785
3786 // Inline ASM nodes have an optional last operand that is an incoming Flag of
3787 // type MVT::Glue. We want to ignore this last operand if that is the case.
3788 unsigned NumOps = Op.getNumOperands();
3789 if (Op.getOperand(i: NumOps - 1).getValueType() == MVT::Glue)
3790 --NumOps;
3791
3792 // Check all operands that may contain the LR.
3793 for (unsigned i = InlineAsm::Op_FirstOperand; i != NumOps;) {
3794 const InlineAsm::Flag Flags(Op.getConstantOperandVal(i));
3795 unsigned NumVals = Flags.getNumOperandRegisters();
3796 ++i; // Skip the ID value.
3797
3798 switch (Flags.getKind()) {
3799 default:
3800 llvm_unreachable("Bad flags!");
3801 case InlineAsm::Kind::RegUse:
3802 case InlineAsm::Kind::Imm:
3803 case InlineAsm::Kind::Mem:
3804 i += NumVals;
3805 break;
3806 case InlineAsm::Kind::Clobber:
3807 case InlineAsm::Kind::RegDef:
3808 case InlineAsm::Kind::RegDefEarlyClobber: {
3809 for (; NumVals; --NumVals, ++i) {
3810 Register Reg = cast<RegisterSDNode>(Val: Op.getOperand(i))->getReg();
3811 if (Reg != PPC::LR && Reg != PPC::LR8)
3812 continue;
3813 MFI.setLRStoreRequired();
3814 return Op;
3815 }
3816 break;
3817 }
3818 }
3819 }
3820
3821 return Op;
3822}
3823
3824SDValue PPCTargetLowering::LowerINIT_TRAMPOLINE(SDValue Op,
3825 SelectionDAG &DAG) const {
3826 SDValue Chain = Op.getOperand(i: 0);
3827 SDValue Trmp = Op.getOperand(i: 1); // trampoline
3828 SDValue FPtr = Op.getOperand(i: 2); // nested function
3829 SDValue Nest = Op.getOperand(i: 3); // 'nest' parameter value
3830 SDLoc dl(Op);
3831
3832 EVT PtrVT = getPointerTy(DL: DAG.getDataLayout());
3833
3834 if (Subtarget.isAIXABI()) {
3835 // On AIX we create a trampoline descriptor by combining the
3836 // entry point and TOC from the global descriptor (FPtr) with the
3837 // nest argument as the environment pointer.
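    // A function descriptor holds the entry point at offset 0, the TOC
    // pointer at one pointer size, and the environment pointer at two pointer
    // sizes; the trampoline buffer below is filled in with that same layout.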
3838 uint64_t PointerSize = Subtarget.isPPC64() ? 8 : 4;
3839 MaybeAlign PointerAlign(PointerSize);
3840 auto MMOFlags = Subtarget.hasInvariantFunctionDescriptors()
3841 ? (MachineMemOperand::MODereferenceable |
3842 MachineMemOperand::MOInvariant)
3843 : MachineMemOperand::MONone;
3844
3845 uint64_t TOCPointerOffset = 1 * PointerSize;
3846 uint64_t EnvPointerOffset = 2 * PointerSize;
3847 SDValue SDTOCPtrOffset = DAG.getConstant(Val: TOCPointerOffset, DL: dl, VT: PtrVT);
3848 SDValue SDEnvPtrOffset = DAG.getConstant(Val: EnvPointerOffset, DL: dl, VT: PtrVT);
3849
3850 const Value *TrampolineAddr =
3851 cast<SrcValueSDNode>(Val: Op.getOperand(i: 4))->getValue();
3852 const Function *Func =
3853 cast<Function>(Val: cast<SrcValueSDNode>(Val: Op.getOperand(i: 5))->getValue());
3854
3855 SDValue OutChains[3];
3856
3857 // Copy the entry point address from the global descriptor to the
3858 // trampoline buffer.
3859 SDValue LoadEntryPoint =
3860 DAG.getLoad(VT: PtrVT, dl, Chain, Ptr: FPtr, PtrInfo: MachinePointerInfo(Func, 0),
3861 Alignment: PointerAlign, MMOFlags);
3862 SDValue EPLoadChain = LoadEntryPoint.getValue(R: 1);
3863 OutChains[0] = DAG.getStore(Chain: EPLoadChain, dl, Val: LoadEntryPoint, Ptr: Trmp,
3864 PtrInfo: MachinePointerInfo(TrampolineAddr, 0));
3865
3866 // Copy the TOC pointer from the global descriptor to the trampoline
3867 // buffer.
3868 SDValue TOCFromDescriptorPtr =
3869 DAG.getNode(Opcode: ISD::ADD, DL: dl, VT: PtrVT, N1: FPtr, N2: SDTOCPtrOffset);
3870 SDValue TOCReg = DAG.getLoad(VT: PtrVT, dl, Chain, Ptr: TOCFromDescriptorPtr,
3871 PtrInfo: MachinePointerInfo(Func, TOCPointerOffset),
3872 Alignment: PointerAlign, MMOFlags);
3873 SDValue TrampolineTOCPointer =
3874 DAG.getNode(Opcode: ISD::ADD, DL: dl, VT: PtrVT, N1: Trmp, N2: SDTOCPtrOffset);
3875 SDValue TOCLoadChain = TOCReg.getValue(R: 1);
3876 OutChains[1] =
3877 DAG.getStore(Chain: TOCLoadChain, dl, Val: TOCReg, Ptr: TrampolineTOCPointer,
3878 PtrInfo: MachinePointerInfo(TrampolineAddr, TOCPointerOffset));
3879
3880 // Store the nest argument into the environment pointer in the trampoline
3881 // buffer.
3882 SDValue EnvPointer = DAG.getNode(Opcode: ISD::ADD, DL: dl, VT: PtrVT, N1: Trmp, N2: SDEnvPtrOffset);
3883 OutChains[2] =
3884 DAG.getStore(Chain, dl, Val: Nest, Ptr: EnvPointer,
3885 PtrInfo: MachinePointerInfo(TrampolineAddr, EnvPointerOffset));
3886
3887 SDValue TokenFactor =
3888 DAG.getNode(Opcode: ISD::TokenFactor, DL: dl, VT: MVT::Other, Ops: OutChains);
3889 return TokenFactor;
3890 }
3891
3892 bool isPPC64 = (PtrVT == MVT::i64);
3893 Type *IntPtrTy = DAG.getDataLayout().getIntPtrType(C&: *DAG.getContext());
3894
3895 TargetLowering::ArgListTy Args;
3896 Args.emplace_back(args&: Trmp, args&: IntPtrTy);
3897 // TrampSize == (isPPC64 ? 48 : 40);
3898 Args.emplace_back(
3899 args: DAG.getConstant(Val: isPPC64 ? 48 : 40, DL: dl, VT: Subtarget.getScalarIntVT()),
3900 args&: IntPtrTy);
3901 Args.emplace_back(args&: FPtr, args&: IntPtrTy);
3902 Args.emplace_back(args&: Nest, args&: IntPtrTy);
3903
3904 // Lower to a call to __trampoline_setup(Trmp, TrampSize, FPtr, ctx_reg)
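  // (__trampoline_setup is a runtime helper, provided for example by
  // compiler-rt's builtins, that initializes the trampoline buffer.)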
3905 TargetLowering::CallLoweringInfo CLI(DAG);
3906 CLI.setDebugLoc(dl).setChain(Chain).setLibCallee(
3907 CC: CallingConv::C, ResultType: Type::getVoidTy(C&: *DAG.getContext()),
3908 Target: DAG.getExternalSymbol(Sym: "__trampoline_setup", VT: PtrVT), ArgsList: std::move(Args));
3909
3910 std::pair<SDValue, SDValue> CallResult = LowerCallTo(CLI);
3911 return CallResult.second;
3912}
3913
3914SDValue PPCTargetLowering::LowerVASTART(SDValue Op, SelectionDAG &DAG) const {
3915 MachineFunction &MF = DAG.getMachineFunction();
3916 PPCFunctionInfo *FuncInfo = MF.getInfo<PPCFunctionInfo>();
3917 EVT PtrVT = getPointerTy(DL: MF.getDataLayout());
3918
3919 SDLoc dl(Op);
3920
3921 if (Subtarget.isPPC64() || Subtarget.isAIXABI()) {
3922 // vastart just stores the address of the VarArgsFrameIndex slot into the
3923 // memory location argument.
3924 SDValue FR = DAG.getFrameIndex(FI: FuncInfo->getVarArgsFrameIndex(), VT: PtrVT);
3925 const Value *SV = cast<SrcValueSDNode>(Val: Op.getOperand(i: 2))->getValue();
3926 return DAG.getStore(Chain: Op.getOperand(i: 0), dl, Val: FR, Ptr: Op.getOperand(i: 1),
3927 PtrInfo: MachinePointerInfo(SV));
3928 }
3929
3930 // For the 32-bit SVR4 ABI we follow the layout of the va_list struct.
  // We assume the given va_list has already been allocated.
3932 //
3933 // typedef struct {
3934 // char gpr; /* index into the array of 8 GPRs
3935 // * stored in the register save area
3936 // * gpr=0 corresponds to r3,
3937 // * gpr=1 to r4, etc.
3938 // */
3939 // char fpr; /* index into the array of 8 FPRs
3940 // * stored in the register save area
3941 // * fpr=0 corresponds to f1,
3942 // * fpr=1 to f2, etc.
3943 // */
3944 // char *overflow_arg_area;
3945 // /* location on stack that holds
3946 // * the next overflow argument
3947 // */
3948 // char *reg_save_area;
3949 // /* where r3:r10 and f1:f8 (if saved)
3950 // * are stored
3951 // */
3952 // } va_list[1];
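  // The stores below populate the struct in order: byte 0 = gpr index,
  // byte 1 = fpr index, bytes 4-7 = overflow_arg_area, bytes 8-11 =
  // reg_save_area.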
3953
3954 SDValue ArgGPR = DAG.getConstant(Val: FuncInfo->getVarArgsNumGPR(), DL: dl, VT: MVT::i32);
3955 SDValue ArgFPR = DAG.getConstant(Val: FuncInfo->getVarArgsNumFPR(), DL: dl, VT: MVT::i32);
3956 SDValue StackOffsetFI = DAG.getFrameIndex(FI: FuncInfo->getVarArgsStackOffset(),
3957 VT: PtrVT);
3958 SDValue FR = DAG.getFrameIndex(FI: FuncInfo->getVarArgsFrameIndex(),
3959 VT: PtrVT);
3960
3961 uint64_t FrameOffset = PtrVT.getSizeInBits()/8;
3962 SDValue ConstFrameOffset = DAG.getConstant(Val: FrameOffset, DL: dl, VT: PtrVT);
3963
3964 uint64_t StackOffset = PtrVT.getSizeInBits()/8 - 1;
3965 SDValue ConstStackOffset = DAG.getConstant(Val: StackOffset, DL: dl, VT: PtrVT);
3966
3967 uint64_t FPROffset = 1;
3968 SDValue ConstFPROffset = DAG.getConstant(Val: FPROffset, DL: dl, VT: PtrVT);
3969
3970 const Value *SV = cast<SrcValueSDNode>(Val: Op.getOperand(i: 2))->getValue();
3971
3972 // Store first byte : number of int regs
3973 SDValue firstStore =
3974 DAG.getTruncStore(Chain: Op.getOperand(i: 0), dl, Val: ArgGPR, Ptr: Op.getOperand(i: 1),
3975 PtrInfo: MachinePointerInfo(SV), SVT: MVT::i8);
3976 uint64_t nextOffset = FPROffset;
3977 SDValue nextPtr = DAG.getNode(Opcode: ISD::ADD, DL: dl, VT: PtrVT, N1: Op.getOperand(i: 1),
3978 N2: ConstFPROffset);
3979
3980 // Store second byte : number of float regs
3981 SDValue secondStore =
3982 DAG.getTruncStore(Chain: firstStore, dl, Val: ArgFPR, Ptr: nextPtr,
3983 PtrInfo: MachinePointerInfo(SV, nextOffset), SVT: MVT::i8);
3984 nextOffset += StackOffset;
3985 nextPtr = DAG.getNode(Opcode: ISD::ADD, DL: dl, VT: PtrVT, N1: nextPtr, N2: ConstStackOffset);
3986
3987 // Store second word : arguments given on stack
3988 SDValue thirdStore = DAG.getStore(Chain: secondStore, dl, Val: StackOffsetFI, Ptr: nextPtr,
3989 PtrInfo: MachinePointerInfo(SV, nextOffset));
3990 nextOffset += FrameOffset;
3991 nextPtr = DAG.getNode(Opcode: ISD::ADD, DL: dl, VT: PtrVT, N1: nextPtr, N2: ConstFrameOffset);
3992
3993 // Store third word : arguments given in registers
3994 return DAG.getStore(Chain: thirdStore, dl, Val: FR, Ptr: nextPtr,
3995 PtrInfo: MachinePointerInfo(SV, nextOffset));
3996}
3997
3998/// FPR - The set of FP registers that should be allocated for arguments
3999/// on Darwin and AIX.
4000static const MCPhysReg FPR[] = {PPC::F1, PPC::F2, PPC::F3, PPC::F4, PPC::F5,
4001 PPC::F6, PPC::F7, PPC::F8, PPC::F9, PPC::F10,
4002 PPC::F11, PPC::F12, PPC::F13};
4003
4004/// CalculateStackSlotSize - Calculates the size reserved for this argument on
4005/// the stack.
4006static unsigned CalculateStackSlotSize(EVT ArgVT, ISD::ArgFlagsTy Flags,
4007 unsigned PtrByteSize) {
4008 unsigned ArgSize = ArgVT.getStoreSize();
4009 if (Flags.isByVal())
4010 ArgSize = Flags.getByValSize();
4011
4012 // Round up to multiples of the pointer size, except for array members,
4013 // which are always packed.
4014 if (!Flags.isInConsecutiveRegs())
4015 ArgSize = ((ArgSize + PtrByteSize - 1)/PtrByteSize) * PtrByteSize;
4016
4017 return ArgSize;
4018}
4019
4020/// CalculateStackSlotAlignment - Calculates the alignment of this argument
4021/// on the stack.
4022static Align CalculateStackSlotAlignment(EVT ArgVT, EVT OrigVT,
4023 ISD::ArgFlagsTy Flags,
4024 unsigned PtrByteSize) {
4025 Align Alignment(PtrByteSize);
4026
4027 // Altivec parameters are padded to a 16 byte boundary.
4028 if (ArgVT == MVT::v4f32 || ArgVT == MVT::v4i32 ||
4029 ArgVT == MVT::v8i16 || ArgVT == MVT::v16i8 ||
4030 ArgVT == MVT::v2f64 || ArgVT == MVT::v2i64 ||
4031 ArgVT == MVT::v1i128 || ArgVT == MVT::f128)
4032 Alignment = Align(16);
4033
4034 // ByVal parameters are aligned as requested.
4035 if (Flags.isByVal()) {
4036 auto BVAlign = Flags.getNonZeroByValAlign();
4037 if (BVAlign > PtrByteSize) {
4038 if (BVAlign.value() % PtrByteSize != 0)
4039 llvm_unreachable(
4040 "ByVal alignment is not a multiple of the pointer size");
4041
4042 Alignment = BVAlign;
4043 }
4044 }
4045
4046 // Array members are always packed to their original alignment.
4047 if (Flags.isInConsecutiveRegs()) {
4048 // If the array member was split into multiple registers, the first
4049 // needs to be aligned to the size of the full type. (Except for
4050 // ppcf128, which is only aligned as its f64 components.)
4051 if (Flags.isSplit() && OrigVT != MVT::ppcf128)
4052 Alignment = Align(OrigVT.getStoreSize());
4053 else
4054 Alignment = Align(ArgVT.getStoreSize());
4055 }
4056
4057 return Alignment;
4058}
4059
4060/// CalculateStackSlotUsed - Return whether this argument will use its
4061/// stack slot (instead of being passed in registers). ArgOffset,
4062/// AvailableFPRs, and AvailableVRs must hold the current argument
4063/// position, and will be updated to account for this argument.
4064static bool CalculateStackSlotUsed(EVT ArgVT, EVT OrigVT, ISD::ArgFlagsTy Flags,
4065 unsigned PtrByteSize, unsigned LinkageSize,
4066 unsigned ParamAreaSize, unsigned &ArgOffset,
4067 unsigned &AvailableFPRs,
4068 unsigned &AvailableVRs) {
4069 bool UseMemory = false;
4070
4071 // Respect alignment of argument on the stack.
4072 Align Alignment =
4073 CalculateStackSlotAlignment(ArgVT, OrigVT, Flags, PtrByteSize);
4074 ArgOffset = alignTo(Size: ArgOffset, A: Alignment);
4075 // If there's no space left in the argument save area, we must
4076 // use memory (this check also catches zero-sized arguments).
4077 if (ArgOffset >= LinkageSize + ParamAreaSize)
4078 UseMemory = true;
4079
4080 // Allocate argument on the stack.
4081 ArgOffset += CalculateStackSlotSize(ArgVT, Flags, PtrByteSize);
4082 if (Flags.isInConsecutiveRegsLast())
4083 ArgOffset = ((ArgOffset + PtrByteSize - 1)/PtrByteSize) * PtrByteSize;
4084 // If we overran the argument save area, we must use memory
4085 // (this check catches arguments passed partially in memory)
4086 if (ArgOffset > LinkageSize + ParamAreaSize)
4087 UseMemory = true;
4088
4089 // However, if the argument is actually passed in an FPR or a VR,
4090 // we don't use memory after all.
4091 if (!Flags.isByVal()) {
4092 if (ArgVT == MVT::f32 || ArgVT == MVT::f64)
4093 if (AvailableFPRs > 0) {
4094 --AvailableFPRs;
4095 return false;
4096 }
4097 if (ArgVT == MVT::v4f32 || ArgVT == MVT::v4i32 ||
4098 ArgVT == MVT::v8i16 || ArgVT == MVT::v16i8 ||
4099 ArgVT == MVT::v2f64 || ArgVT == MVT::v2i64 ||
4100 ArgVT == MVT::v1i128 || ArgVT == MVT::f128)
4101 if (AvailableVRs > 0) {
4102 --AvailableVRs;
4103 return false;
4104 }
4105 }
4106
4107 return UseMemory;
4108}
4109
4110/// EnsureStackAlignment - Round stack frame size up from NumBytes to
4111/// ensure minimum alignment required for target.
4112static unsigned EnsureStackAlignment(const PPCFrameLowering *Lowering,
4113 unsigned NumBytes) {
4114 return alignTo(Size: NumBytes, A: Lowering->getStackAlign());
4115}
4116
4117SDValue PPCTargetLowering::LowerFormalArguments(
4118 SDValue Chain, CallingConv::ID CallConv, bool isVarArg,
4119 const SmallVectorImpl<ISD::InputArg> &Ins, const SDLoc &dl,
4120 SelectionDAG &DAG, SmallVectorImpl<SDValue> &InVals) const {
4121 if (Subtarget.isAIXABI())
4122 return LowerFormalArguments_AIX(Chain, CallConv, isVarArg, Ins, dl, DAG,
4123 InVals);
4124 if (Subtarget.is64BitELFABI())
4125 return LowerFormalArguments_64SVR4(Chain, CallConv, isVarArg, Ins, dl, DAG,
4126 InVals);
4127 assert(Subtarget.is32BitELFABI());
4128 return LowerFormalArguments_32SVR4(Chain, CallConv, isVarArg, Ins, dl, DAG,
4129 InVals);
4130}
4131
4132SDValue PPCTargetLowering::LowerFormalArguments_32SVR4(
4133 SDValue Chain, CallingConv::ID CallConv, bool isVarArg,
4134 const SmallVectorImpl<ISD::InputArg> &Ins, const SDLoc &dl,
4135 SelectionDAG &DAG, SmallVectorImpl<SDValue> &InVals) const {
4136
4137 // 32-bit SVR4 ABI Stack Frame Layout:
4138 // +-----------------------------------+
4139 // +--> | Back chain |
4140 // | +-----------------------------------+
4141 // | | Floating-point register save area |
4142 // | +-----------------------------------+
4143 // | | General register save area |
4144 // | +-----------------------------------+
4145 // | | CR save word |
4146 // | +-----------------------------------+
4147 // | | VRSAVE save word |
4148 // | +-----------------------------------+
4149 // | | Alignment padding |
4150 // | +-----------------------------------+
4151 // | | Vector register save area |
4152 // | +-----------------------------------+
4153 // | | Local variable space |
4154 // | +-----------------------------------+
4155 // | | Parameter list area |
4156 // | +-----------------------------------+
4157 // | | LR save word |
4158 // | +-----------------------------------+
4159 // SP--> +--- | Back chain |
4160 // +-----------------------------------+
4161 //
4162 // Specifications:
4163 // System V Application Binary Interface PowerPC Processor Supplement
4164 // AltiVec Technology Programming Interface Manual
4165
4166 MachineFunction &MF = DAG.getMachineFunction();
4167 MachineFrameInfo &MFI = MF.getFrameInfo();
4168 PPCFunctionInfo *FuncInfo = MF.getInfo<PPCFunctionInfo>();
4169
4170 EVT PtrVT = getPointerTy(DL: MF.getDataLayout());
4171 // Potential tail calls could cause overwriting of argument stack slots.
4172 bool isImmutable = !(getTargetMachine().Options.GuaranteedTailCallOpt &&
4173 (CallConv == CallingConv::Fast));
4174 const Align PtrAlign(4);
4175
4176 // Assign locations to all of the incoming arguments.
4177 SmallVector<CCValAssign, 16> ArgLocs;
4178 CCState CCInfo(CallConv, isVarArg, DAG.getMachineFunction(), ArgLocs,
4179 *DAG.getContext());
4180
4181 // Reserve space for the linkage area on the stack.
4182 unsigned LinkageSize = Subtarget.getFrameLowering()->getLinkageSize();
4183 CCInfo.AllocateStack(Size: LinkageSize, Alignment: PtrAlign);
4184 CCInfo.AnalyzeFormalArguments(Ins, Fn: CC_PPC32_SVR4);
4185
4186 for (unsigned i = 0, e = ArgLocs.size(); i != e; ++i) {
4187 CCValAssign &VA = ArgLocs[i];
4188
4189 // Arguments stored in registers.
4190 if (VA.isRegLoc()) {
4191 const TargetRegisterClass *RC;
4192 EVT ValVT = VA.getValVT();
4193
4194 switch (ValVT.getSimpleVT().SimpleTy) {
4195 default:
4196 llvm_unreachable("ValVT not supported by formal arguments Lowering");
4197 case MVT::i1:
4198 case MVT::i32:
4199 RC = &PPC::GPRCRegClass;
4200 break;
4201 case MVT::f32:
4202 if (Subtarget.hasP8Vector())
4203 RC = &PPC::VSSRCRegClass;
4204 else if (Subtarget.hasSPE())
4205 RC = &PPC::GPRCRegClass;
4206 else
4207 RC = &PPC::F4RCRegClass;
4208 break;
4209 case MVT::f64:
4210 if (Subtarget.hasVSX())
4211 RC = &PPC::VSFRCRegClass;
4212 else if (Subtarget.hasSPE())
4213 // SPE passes doubles in GPR pairs.
4214 RC = &PPC::GPRCRegClass;
4215 else
4216 RC = &PPC::F8RCRegClass;
4217 break;
4218 case MVT::v16i8:
4219 case MVT::v8i16:
4220 case MVT::v4i32:
4221 RC = &PPC::VRRCRegClass;
4222 break;
4223 case MVT::v4f32:
4224 RC = &PPC::VRRCRegClass;
4225 break;
4226 case MVT::v2f64:
4227 case MVT::v2i64:
4228 RC = &PPC::VRRCRegClass;
4229 break;
4230 }
4231
4232 SDValue ArgValue;
4233 // Transform the arguments stored in physical registers into
4234 // virtual ones.
4235 if (VA.getLocVT() == MVT::f64 && Subtarget.hasSPE()) {
4236 assert(i + 1 < e && "No second half of double precision argument");
4237 Register RegLo = MF.addLiveIn(PReg: VA.getLocReg(), RC);
4238 Register RegHi = MF.addLiveIn(PReg: ArgLocs[++i].getLocReg(), RC);
4239 SDValue ArgValueLo = DAG.getCopyFromReg(Chain, dl, Reg: RegLo, VT: MVT::i32);
4240 SDValue ArgValueHi = DAG.getCopyFromReg(Chain, dl, Reg: RegHi, VT: MVT::i32);
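        // The first GPR of the pair holds the low word on little-endian and
        // the high word on big-endian targets, so swap the copies on BE to
        // keep ArgValueLo/ArgValueHi consistent with their names.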
4241 if (!Subtarget.isLittleEndian())
4242 std::swap (a&: ArgValueLo, b&: ArgValueHi);
4243 ArgValue = DAG.getNode(Opcode: PPCISD::BUILD_SPE64, DL: dl, VT: MVT::f64, N1: ArgValueLo,
4244 N2: ArgValueHi);
4245 } else {
4246 Register Reg = MF.addLiveIn(PReg: VA.getLocReg(), RC);
4247 ArgValue = DAG.getCopyFromReg(Chain, dl, Reg,
4248 VT: ValVT == MVT::i1 ? MVT::i32 : ValVT);
4249 if (ValVT == MVT::i1)
4250 ArgValue = DAG.getNode(Opcode: ISD::TRUNCATE, DL: dl, VT: MVT::i1, Operand: ArgValue);
4251 }
4252
4253 InVals.push_back(Elt: ArgValue);
4254 } else {
4255 // Argument stored in memory.
4256 assert(VA.isMemLoc());
4257
4258 // Get the extended size of the argument type in stack
4259 unsigned ArgSize = VA.getLocVT().getStoreSize();
4260 // Get the actual size of the argument type
4261 unsigned ObjSize = VA.getValVT().getStoreSize();
4262 unsigned ArgOffset = VA.getLocMemOffset();
4263 // Stack objects in PPC32 are right justified.
4264 ArgOffset += ArgSize - ObjSize;
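      // (A sub-word argument occupies the tail of its slot, so offsetting by
      // ArgSize - ObjSize makes the fixed object point directly at the value.)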
4265 int FI = MFI.CreateFixedObject(Size: ArgSize, SPOffset: ArgOffset, IsImmutable: isImmutable);
4266
4267 // Create load nodes to retrieve arguments from the stack.
4268 SDValue FIN = DAG.getFrameIndex(FI, VT: PtrVT);
4269 InVals.push_back(
4270 Elt: DAG.getLoad(VT: VA.getValVT(), dl, Chain, Ptr: FIN, PtrInfo: MachinePointerInfo()));
4271 }
4272 }
4273
4274 // Assign locations to all of the incoming aggregate by value arguments.
4275 // Aggregates passed by value are stored in the local variable space of the
4276 // caller's stack frame, right above the parameter list area.
4277 SmallVector<CCValAssign, 16> ByValArgLocs;
4278 CCState CCByValInfo(CallConv, isVarArg, DAG.getMachineFunction(),
4279 ByValArgLocs, *DAG.getContext());
4280
4281 // Reserve stack space for the allocations in CCInfo.
4282 CCByValInfo.AllocateStack(Size: CCInfo.getStackSize(), Alignment: PtrAlign);
4283
4284 CCByValInfo.AnalyzeFormalArguments(Ins, Fn: CC_PPC32_SVR4_ByVal);
4285
4286 // Area that is at least reserved in the caller of this function.
4287 unsigned MinReservedArea = CCByValInfo.getStackSize();
4288 MinReservedArea = std::max(a: MinReservedArea, b: LinkageSize);
4289
4290 // Set the size that is at least reserved in caller of this function. Tail
4291 // call optimized function's reserved stack space needs to be aligned so that
4292 // taking the difference between two stack areas will result in an aligned
4293 // stack.
4294 MinReservedArea =
4295 EnsureStackAlignment(Lowering: Subtarget.getFrameLowering(), NumBytes: MinReservedArea);
4296 FuncInfo->setMinReservedArea(MinReservedArea);
4297
4298 SmallVector<SDValue, 8> MemOps;
4299
4300 // If the function takes variable number of arguments, make a frame index for
4301 // the start of the first vararg value... for expansion of llvm.va_start.
4302 if (isVarArg) {
4303 static const MCPhysReg GPArgRegs[] = {
4304 PPC::R3, PPC::R4, PPC::R5, PPC::R6,
4305 PPC::R7, PPC::R8, PPC::R9, PPC::R10,
4306 };
4307 const unsigned NumGPArgRegs = std::size(GPArgRegs);
4308
4309 static const MCPhysReg FPArgRegs[] = {
4310 PPC::F1, PPC::F2, PPC::F3, PPC::F4, PPC::F5, PPC::F6, PPC::F7,
4311 PPC::F8
4312 };
4313 unsigned NumFPArgRegs = std::size(FPArgRegs);
4314
4315 if (useSoftFloat() || hasSPE())
4316 NumFPArgRegs = 0;
4317
4318 FuncInfo->setVarArgsNumGPR(CCInfo.getFirstUnallocated(Regs: GPArgRegs));
4319 FuncInfo->setVarArgsNumFPR(CCInfo.getFirstUnallocated(Regs: FPArgRegs));
4320
4321 // Make room for NumGPArgRegs and NumFPArgRegs.
4322 int Depth = NumGPArgRegs * PtrVT.getSizeInBits()/8 +
4323 NumFPArgRegs * MVT(MVT::f64).getSizeInBits()/8;
4324
4325 FuncInfo->setVarArgsStackOffset(MFI.CreateFixedObject(
4326 Size: PtrVT.getSizeInBits() / 8, SPOffset: CCInfo.getStackSize(), IsImmutable: true));
4327
4328 FuncInfo->setVarArgsFrameIndex(
4329 MFI.CreateStackObject(Size: Depth, Alignment: Align(8), isSpillSlot: false));
4330 SDValue FIN = DAG.getFrameIndex(FI: FuncInfo->getVarArgsFrameIndex(), VT: PtrVT);
4331
4332 // The fixed integer arguments of a variadic function are stored to the
4333 // VarArgsFrameIndex on the stack so that they may be loaded by
4334 // dereferencing the result of va_next.
4335 for (MCPhysReg GPArgReg : GPArgRegs) {
4336 // Get an existing live-in vreg, or add a new one.
4337 Register VReg = MF.getRegInfo().getLiveInVirtReg(PReg: GPArgReg);
4338 if (!VReg)
4339 VReg = MF.addLiveIn(PReg: GPArgReg, RC: &PPC::GPRCRegClass);
4340
4341 SDValue Val = DAG.getCopyFromReg(Chain, dl, Reg: VReg, VT: PtrVT);
4342 SDValue Store =
4343 DAG.getStore(Chain: Val.getValue(R: 1), dl, Val, Ptr: FIN, PtrInfo: MachinePointerInfo());
4344 MemOps.push_back(Elt: Store);
4345 // Increment the address by four for the next argument to store
4346 SDValue PtrOff = DAG.getConstant(Val: PtrVT.getSizeInBits()/8, DL: dl, VT: PtrVT);
4347 FIN = DAG.getNode(Opcode: ISD::ADD, DL: dl, VT: PtrOff.getValueType(), N1: FIN, N2: PtrOff);
4348 }
4349
4350 // FIXME 32-bit SVR4: We only need to save FP argument registers if CR bit 6
4351 // is set.
4352 // The double arguments are stored to the VarArgsFrameIndex
4353 // on the stack.
4354 for (unsigned FPRIndex = 0; FPRIndex != NumFPArgRegs; ++FPRIndex) {
4355 // Get an existing live-in vreg, or add a new one.
4356 Register VReg = MF.getRegInfo().getLiveInVirtReg(PReg: FPArgRegs[FPRIndex]);
4357 if (!VReg)
4358 VReg = MF.addLiveIn(PReg: FPArgRegs[FPRIndex], RC: &PPC::F8RCRegClass);
4359
4360 SDValue Val = DAG.getCopyFromReg(Chain, dl, Reg: VReg, VT: MVT::f64);
4361 SDValue Store =
4362 DAG.getStore(Chain: Val.getValue(R: 1), dl, Val, Ptr: FIN, PtrInfo: MachinePointerInfo());
4363 MemOps.push_back(Elt: Store);
4364 // Increment the address by eight for the next argument to store
4365 SDValue PtrOff = DAG.getConstant(Val: MVT(MVT::f64).getSizeInBits()/8, DL: dl,
4366 VT: PtrVT);
4367 FIN = DAG.getNode(Opcode: ISD::ADD, DL: dl, VT: PtrOff.getValueType(), N1: FIN, N2: PtrOff);
4368 }
4369 }
4370
4371 if (!MemOps.empty())
4372 Chain = DAG.getNode(Opcode: ISD::TokenFactor, DL: dl, VT: MVT::Other, Ops: MemOps);
4373
4374 return Chain;
4375}
4376
4377// PPC64 passes i8, i16, and i32 values in i64 registers. Promote
4378// value to MVT::i64 and then truncate to the correct register size.
4379SDValue PPCTargetLowering::extendArgForPPC64(ISD::ArgFlagsTy Flags,
4380 EVT ObjectVT, SelectionDAG &DAG,
4381 SDValue ArgVal,
4382 const SDLoc &dl) const {
4383 if (Flags.isSExt())
4384 ArgVal = DAG.getNode(Opcode: ISD::AssertSext, DL: dl, VT: MVT::i64, N1: ArgVal,
4385 N2: DAG.getValueType(ObjectVT));
4386 else if (Flags.isZExt())
4387 ArgVal = DAG.getNode(Opcode: ISD::AssertZext, DL: dl, VT: MVT::i64, N1: ArgVal,
4388 N2: DAG.getValueType(ObjectVT));
4389
4390 return DAG.getNode(Opcode: ISD::TRUNCATE, DL: dl, VT: ObjectVT, Operand: ArgVal);
4391}
4392
4393SDValue PPCTargetLowering::LowerFormalArguments_64SVR4(
4394 SDValue Chain, CallingConv::ID CallConv, bool isVarArg,
4395 const SmallVectorImpl<ISD::InputArg> &Ins, const SDLoc &dl,
4396 SelectionDAG &DAG, SmallVectorImpl<SDValue> &InVals) const {
4397 // TODO: add description of PPC stack frame format, or at least some docs.
4398 //
4399 bool isELFv2ABI = Subtarget.isELFv2ABI();
4400 bool isLittleEndian = Subtarget.isLittleEndian();
4401 MachineFunction &MF = DAG.getMachineFunction();
4402 MachineFrameInfo &MFI = MF.getFrameInfo();
4403 PPCFunctionInfo *FuncInfo = MF.getInfo<PPCFunctionInfo>();
4404
4405 assert(!(CallConv == CallingConv::Fast && isVarArg) &&
4406 "fastcc not supported on varargs functions");
4407
4408 EVT PtrVT = getPointerTy(DL: MF.getDataLayout());
4409 // Potential tail calls could cause overwriting of argument stack slots.
4410 bool isImmutable = !(getTargetMachine().Options.GuaranteedTailCallOpt &&
4411 (CallConv == CallingConv::Fast));
4412 unsigned PtrByteSize = 8;
4413 unsigned LinkageSize = Subtarget.getFrameLowering()->getLinkageSize();
4414
4415 static const MCPhysReg GPR[] = {
4416 PPC::X3, PPC::X4, PPC::X5, PPC::X6,
4417 PPC::X7, PPC::X8, PPC::X9, PPC::X10,
4418 };
4419 static const MCPhysReg VR[] = {
4420 PPC::V2, PPC::V3, PPC::V4, PPC::V5, PPC::V6, PPC::V7, PPC::V8,
4421 PPC::V9, PPC::V10, PPC::V11, PPC::V12, PPC::V13
4422 };
4423
4424 const unsigned Num_GPR_Regs = std::size(GPR);
4425 const unsigned Num_FPR_Regs = useSoftFloat() ? 0 : 13;
4426 const unsigned Num_VR_Regs = std::size(VR);
4427
4428 // Do a first pass over the arguments to determine whether the ABI
4429 // guarantees that our caller has allocated the parameter save area
4430 // on its stack frame. In the ELFv1 ABI, this is always the case;
4431 // in the ELFv2 ABI, it is true if this is a vararg function or if
4432 // any parameter is located in a stack slot.
4433
4434 bool HasParameterArea = !isELFv2ABI || isVarArg;
4435 unsigned ParamAreaSize = Num_GPR_Regs * PtrByteSize;
4436 unsigned NumBytes = LinkageSize;
4437 unsigned AvailableFPRs = Num_FPR_Regs;
4438 unsigned AvailableVRs = Num_VR_Regs;
4439 for (const ISD::InputArg &In : Ins) {
4440 if (In.Flags.isNest())
4441 continue;
4442
4443 if (CalculateStackSlotUsed(ArgVT: In.VT, OrigVT: In.ArgVT, Flags: In.Flags, PtrByteSize,
4444 LinkageSize, ParamAreaSize, ArgOffset&: NumBytes,
4445 AvailableFPRs, AvailableVRs))
4446 HasParameterArea = true;
4447 }
4448
4449 // Add DAG nodes to load the arguments or copy them out of registers. On
4450 // entry to a function on PPC, the arguments start after the linkage area,
4451 // although the first ones are often in registers.
4452
4453 unsigned ArgOffset = LinkageSize;
4454 unsigned GPR_idx = 0, FPR_idx = 0, VR_idx = 0;
4455 SmallVector<SDValue, 8> MemOps;
4456 Function::const_arg_iterator FuncArg = MF.getFunction().arg_begin();
4457 unsigned CurArgIdx = 0;
4458 for (unsigned ArgNo = 0, e = Ins.size(); ArgNo != e; ++ArgNo) {
4459 SDValue ArgVal;
4460 bool needsLoad = false;
4461 EVT ObjectVT = Ins[ArgNo].VT;
4462 EVT OrigVT = Ins[ArgNo].ArgVT;
4463 unsigned ObjSize = ObjectVT.getStoreSize();
4464 unsigned ArgSize = ObjSize;
4465 ISD::ArgFlagsTy Flags = Ins[ArgNo].Flags;
4466 if (Ins[ArgNo].isOrigArg()) {
4467 std::advance(i&: FuncArg, n: Ins[ArgNo].getOrigArgIndex() - CurArgIdx);
4468 CurArgIdx = Ins[ArgNo].getOrigArgIndex();
4469 }
    // We re-align the argument offset for each argument, except under the fast
    // calling convention, where the re-alignment is deferred until we know the
    // argument will actually use a stack slot.
4473 unsigned CurArgOffset;
4474 Align Alignment;
4475 auto ComputeArgOffset = [&]() {
4476 /* Respect alignment of argument on the stack. */
4477 Alignment =
4478 CalculateStackSlotAlignment(ArgVT: ObjectVT, OrigVT, Flags, PtrByteSize);
4479 ArgOffset = alignTo(Size: ArgOffset, A: Alignment);
4480 CurArgOffset = ArgOffset;
4481 };
4482
4483 if (CallConv != CallingConv::Fast) {
4484 ComputeArgOffset();
4485
4486 /* Compute GPR index associated with argument offset. */
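      // Each 8-byte doubleword of the parameter save area corresponds to one
      // of the eight parameter GPRs, so the GPR index follows directly from
      // the argument offset (clamped to the number of GPRs).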
4487 GPR_idx = (ArgOffset - LinkageSize) / PtrByteSize;
4488 GPR_idx = std::min(a: GPR_idx, b: Num_GPR_Regs);
4489 }
4490
4491 // FIXME the codegen can be much improved in some cases.
4492 // We do not have to keep everything in memory.
4493 if (Flags.isByVal()) {
4494 assert(Ins[ArgNo].isOrigArg() && "Byval arguments cannot be implicit");
4495
4496 if (CallConv == CallingConv::Fast)
4497 ComputeArgOffset();
4498
      // ObjSize is the true size; ArgSize is ObjSize rounded up to a multiple
      // of the register size.
4500 ObjSize = Flags.getByValSize();
4501 ArgSize = ((ObjSize + PtrByteSize - 1)/PtrByteSize) * PtrByteSize;
4502 // Empty aggregate parameters do not take up registers. Examples:
4503 // struct { } a;
4504 // union { } b;
4505 // int c[0];
4506 // etc. However, we have to provide a place-holder in InVals, so
4507 // pretend we have an 8-byte item at the current address for that
4508 // purpose.
4509 if (!ObjSize) {
4510 int FI = MFI.CreateFixedObject(Size: PtrByteSize, SPOffset: ArgOffset, IsImmutable: true);
4511 SDValue FIN = DAG.getFrameIndex(FI, VT: PtrVT);
4512 InVals.push_back(Elt: FIN);
4513 continue;
4514 }
4515
4516 // Create a stack object covering all stack doublewords occupied
4517 // by the argument. If the argument is (fully or partially) on
4518 // the stack, or if the argument is fully in registers but the
4519 // caller has allocated the parameter save anyway, we can refer
4520 // directly to the caller's stack frame. Otherwise, create a
4521 // local copy in our own frame.
4522 int FI;
4523 if (HasParameterArea ||
4524 ArgSize + ArgOffset > LinkageSize + Num_GPR_Regs * PtrByteSize)
4525 FI = MFI.CreateFixedObject(Size: ArgSize, SPOffset: ArgOffset, IsImmutable: false, isAliased: true);
4526 else
4527 FI = MFI.CreateStackObject(Size: ArgSize, Alignment, isSpillSlot: false);
4528 SDValue FIN = DAG.getFrameIndex(FI, VT: PtrVT);
4529
4530 // Handle aggregates smaller than 8 bytes.
4531 if (ObjSize < PtrByteSize) {
4532 // The value of the object is its address, which differs from the
4533 // address of the enclosing doubleword on big-endian systems.
4534 SDValue Arg = FIN;
4535 if (!isLittleEndian) {
4536 SDValue ArgOff = DAG.getConstant(Val: PtrByteSize - ObjSize, DL: dl, VT: PtrVT);
4537 Arg = DAG.getNode(Opcode: ISD::ADD, DL: dl, VT: ArgOff.getValueType(), N1: Arg, N2: ArgOff);
4538 }
4539 InVals.push_back(Elt: Arg);
4540
4541 if (GPR_idx != Num_GPR_Regs) {
4542 Register VReg = MF.addLiveIn(PReg: GPR[GPR_idx++], RC: &PPC::G8RCRegClass);
4543 FuncInfo->addLiveInAttr(VReg, Flags);
4544 SDValue Val = DAG.getCopyFromReg(Chain, dl, Reg: VReg, VT: PtrVT);
4545 EVT ObjType = EVT::getIntegerVT(Context&: *DAG.getContext(), BitWidth: ObjSize * 8);
4546 SDValue Store =
4547 DAG.getTruncStore(Chain: Val.getValue(R: 1), dl, Val, Ptr: Arg,
4548 PtrInfo: MachinePointerInfo(&*FuncArg), SVT: ObjType);
4549 MemOps.push_back(Elt: Store);
4550 }
4551 // Whether we copied from a register or not, advance the offset
4552 // into the parameter save area by a full doubleword.
4553 ArgOffset += PtrByteSize;
4554 continue;
4555 }
4556
4557 // The value of the object is its address, which is the address of
4558 // its first stack doubleword.
4559 InVals.push_back(Elt: FIN);
4560
4561 // Store whatever pieces of the object are in registers to memory.
4562 for (unsigned j = 0; j < ArgSize; j += PtrByteSize) {
4563 if (GPR_idx == Num_GPR_Regs)
4564 break;
4565
4566 Register VReg = MF.addLiveIn(PReg: GPR[GPR_idx], RC: &PPC::G8RCRegClass);
4567 FuncInfo->addLiveInAttr(VReg, Flags);
4568 SDValue Val = DAG.getCopyFromReg(Chain, dl, Reg: VReg, VT: PtrVT);
4569 SDValue Addr = FIN;
4570 if (j) {
4571 SDValue Off = DAG.getConstant(Val: j, DL: dl, VT: PtrVT);
4572 Addr = DAG.getNode(Opcode: ISD::ADD, DL: dl, VT: Off.getValueType(), N1: Addr, N2: Off);
4573 }
4574 unsigned StoreSizeInBits = std::min(a: PtrByteSize, b: (ObjSize - j)) * 8;
4575 EVT ObjType = EVT::getIntegerVT(Context&: *DAG.getContext(), BitWidth: StoreSizeInBits);
4576 SDValue Store =
4577 DAG.getTruncStore(Chain: Val.getValue(R: 1), dl, Val, Ptr: Addr,
4578 PtrInfo: MachinePointerInfo(&*FuncArg, j), SVT: ObjType);
4579 MemOps.push_back(Elt: Store);
4580 ++GPR_idx;
4581 }
4582 ArgOffset += ArgSize;
4583 continue;
4584 }
4585
4586 switch (ObjectVT.getSimpleVT().SimpleTy) {
4587 default: llvm_unreachable("Unhandled argument type!");
4588 case MVT::i1:
4589 case MVT::i32:
4590 case MVT::i64:
4591 if (Flags.isNest()) {
4592 // The 'nest' parameter, if any, is passed in R11.
4593 Register VReg = MF.addLiveIn(PReg: PPC::X11, RC: &PPC::G8RCRegClass);
4594 ArgVal = DAG.getCopyFromReg(Chain, dl, Reg: VReg, VT: MVT::i64);
4595
4596 if (ObjectVT == MVT::i32 || ObjectVT == MVT::i1)
4597 ArgVal = extendArgForPPC64(Flags, ObjectVT, DAG, ArgVal, dl);
4598
4599 break;
4600 }
4601
4602 // These can be scalar arguments or elements of an integer array type
4603 // passed directly. Clang may use those instead of "byval" aggregate
4604 // types to avoid forcing arguments to memory unnecessarily.
4605 if (GPR_idx != Num_GPR_Regs) {
4606 Register VReg = MF.addLiveIn(PReg: GPR[GPR_idx++], RC: &PPC::G8RCRegClass);
4607 FuncInfo->addLiveInAttr(VReg, Flags);
4608 ArgVal = DAG.getCopyFromReg(Chain, dl, Reg: VReg, VT: MVT::i64);
4609
4610 if (ObjectVT == MVT::i32 || ObjectVT == MVT::i1)
4611 // PPC64 passes i8, i16, and i32 values in i64 registers. Promote
4612 // value to MVT::i64 and then truncate to the correct register size.
4613 ArgVal = extendArgForPPC64(Flags, ObjectVT, DAG, ArgVal, dl);
4614 } else {
4615 if (CallConv == CallingConv::Fast)
4616 ComputeArgOffset();
4617
4618 needsLoad = true;
4619 ArgSize = PtrByteSize;
4620 }
4621 if (CallConv != CallingConv::Fast || needsLoad)
4622 ArgOffset += 8;
4623 break;
4624
4625 case MVT::f32:
4626 case MVT::f64:
4627 // These can be scalar arguments or elements of a float array type
      // passed directly. The latter are used to implement ELFv2 homogeneous
4629 // float aggregates.
4630 if (FPR_idx != Num_FPR_Regs) {
4631 unsigned VReg;
4632
4633 if (ObjectVT == MVT::f32)
4634 VReg = MF.addLiveIn(PReg: FPR[FPR_idx],
4635 RC: Subtarget.hasP8Vector()
4636 ? &PPC::VSSRCRegClass
4637 : &PPC::F4RCRegClass);
4638 else
4639 VReg = MF.addLiveIn(PReg: FPR[FPR_idx], RC: Subtarget.hasVSX()
4640 ? &PPC::VSFRCRegClass
4641 : &PPC::F8RCRegClass);
4642
4643 ArgVal = DAG.getCopyFromReg(Chain, dl, Reg: VReg, VT: ObjectVT);
4644 ++FPR_idx;
4645 } else if (GPR_idx != Num_GPR_Regs && CallConv != CallingConv::Fast) {
4646 // FIXME: We may want to re-enable this for CallingConv::Fast on the P8
4647 // once we support fp <-> gpr moves.
4648
4649 // This can only ever happen in the presence of f32 array types,
4650 // since otherwise we never run out of FPRs before running out
4651 // of GPRs.
4652 Register VReg = MF.addLiveIn(PReg: GPR[GPR_idx++], RC: &PPC::G8RCRegClass);
4653 FuncInfo->addLiveInAttr(VReg, Flags);
4654 ArgVal = DAG.getCopyFromReg(Chain, dl, Reg: VReg, VT: MVT::i64);
4655
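        // The f32 occupies one 32-bit half of the doubleword GPR; depending
        // on the offset within the doubleword and the endianness it may sit in
        // the upper half, in which case it is shifted down before being
        // truncated and bitcast.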
4656 if (ObjectVT == MVT::f32) {
4657 if ((ArgOffset % PtrByteSize) == (isLittleEndian ? 4 : 0))
4658 ArgVal = DAG.getNode(Opcode: ISD::SRL, DL: dl, VT: MVT::i64, N1: ArgVal,
4659 N2: DAG.getConstant(Val: 32, DL: dl, VT: MVT::i32));
4660 ArgVal = DAG.getNode(Opcode: ISD::TRUNCATE, DL: dl, VT: MVT::i32, Operand: ArgVal);
4661 }
4662
4663 ArgVal = DAG.getNode(Opcode: ISD::BITCAST, DL: dl, VT: ObjectVT, Operand: ArgVal);
4664 } else {
4665 if (CallConv == CallingConv::Fast)
4666 ComputeArgOffset();
4667
4668 needsLoad = true;
4669 }
4670
4671 // When passing an array of floats, the array occupies consecutive
4672 // space in the argument area; only round up to the next doubleword
4673 // at the end of the array. Otherwise, each float takes 8 bytes.
4674 if (CallConv != CallingConv::Fast || needsLoad) {
4675 ArgSize = Flags.isInConsecutiveRegs() ? ObjSize : PtrByteSize;
4676 ArgOffset += ArgSize;
4677 if (Flags.isInConsecutiveRegsLast())
4678 ArgOffset = ((ArgOffset + PtrByteSize - 1)/PtrByteSize) * PtrByteSize;
4679 }
4680 break;
4681 case MVT::v4f32:
4682 case MVT::v4i32:
4683 case MVT::v8i16:
4684 case MVT::v16i8:
4685 case MVT::v2f64:
4686 case MVT::v2i64:
4687 case MVT::v1i128:
4688 case MVT::f128:
4689 // These can be scalar arguments or elements of a vector array type
      // passed directly. The latter are used to implement ELFv2 homogeneous
4691 // vector aggregates.
4692 if (VR_idx != Num_VR_Regs) {
4693 Register VReg = MF.addLiveIn(PReg: VR[VR_idx], RC: &PPC::VRRCRegClass);
4694 ArgVal = DAG.getCopyFromReg(Chain, dl, Reg: VReg, VT: ObjectVT);
4695 ++VR_idx;
4696 } else {
4697 if (CallConv == CallingConv::Fast)
4698 ComputeArgOffset();
4699 needsLoad = true;
4700 }
4701 if (CallConv != CallingConv::Fast || needsLoad)
4702 ArgOffset += 16;
4703 break;
4704 }
4705
4706 // We need to load the argument to a virtual register if we determined
4707 // above that we ran out of physical registers of the appropriate type.
4708 if (needsLoad) {
4709 if (ObjSize < ArgSize && !isLittleEndian)
4710 CurArgOffset += ArgSize - ObjSize;
4711 int FI = MFI.CreateFixedObject(Size: ObjSize, SPOffset: CurArgOffset, IsImmutable: isImmutable);
4712 SDValue FIN = DAG.getFrameIndex(FI, VT: PtrVT);
4713 ArgVal = DAG.getLoad(VT: ObjectVT, dl, Chain, Ptr: FIN, PtrInfo: MachinePointerInfo());
4714 }
4715
4716 InVals.push_back(Elt: ArgVal);
4717 }
4718
4719 // Area that is at least reserved in the caller of this function.
4720 unsigned MinReservedArea;
4721 if (HasParameterArea)
4722 MinReservedArea = std::max(a: ArgOffset, b: LinkageSize + 8 * PtrByteSize);
4723 else
4724 MinReservedArea = LinkageSize;
4725
4726 // Set the size that is at least reserved in caller of this function. Tail
4727 // call optimized functions' reserved stack space needs to be aligned so that
4728 // taking the difference between two stack areas will result in an aligned
4729 // stack.
4730 MinReservedArea =
4731 EnsureStackAlignment(Lowering: Subtarget.getFrameLowering(), NumBytes: MinReservedArea);
4732 FuncInfo->setMinReservedArea(MinReservedArea);
4733
4734 // If the function takes variable number of arguments, make a frame index for
4735 // the start of the first vararg value... for expansion of llvm.va_start.
  // The ELFv2 ABI spec states:
4737 // C programs that are intended to be *portable* across different compilers
4738 // and architectures must use the header file <stdarg.h> to deal with variable
4739 // argument lists.
4740 if (isVarArg && MFI.hasVAStart()) {
4741 int Depth = ArgOffset;
4742
4743 FuncInfo->setVarArgsFrameIndex(
4744 MFI.CreateFixedObject(Size: PtrByteSize, SPOffset: Depth, IsImmutable: true));
4745 SDValue FIN = DAG.getFrameIndex(FI: FuncInfo->getVarArgsFrameIndex(), VT: PtrVT);
4746
4747 // If this function is vararg, store any remaining integer argument regs
4748 // to their spots on the stack so that they may be loaded by dereferencing
4749 // the result of va_next.
4750 for (GPR_idx = (ArgOffset - LinkageSize) / PtrByteSize;
4751 GPR_idx < Num_GPR_Regs; ++GPR_idx) {
4752 Register VReg = MF.addLiveIn(PReg: GPR[GPR_idx], RC: &PPC::G8RCRegClass);
4753 SDValue Val = DAG.getCopyFromReg(Chain, dl, Reg: VReg, VT: PtrVT);
4754 SDValue Store =
4755 DAG.getStore(Chain: Val.getValue(R: 1), dl, Val, Ptr: FIN, PtrInfo: MachinePointerInfo());
4756 MemOps.push_back(Elt: Store);
4757 // Increment the address by four for the next argument to store
4758 SDValue PtrOff = DAG.getConstant(Val: PtrByteSize, DL: dl, VT: PtrVT);
4759 FIN = DAG.getNode(Opcode: ISD::ADD, DL: dl, VT: PtrOff.getValueType(), N1: FIN, N2: PtrOff);
4760 }
4761 }
4762
4763 if (!MemOps.empty())
4764 Chain = DAG.getNode(Opcode: ISD::TokenFactor, DL: dl, VT: MVT::Other, Ops: MemOps);
4765
4766 return Chain;
4767}
4768
4769/// CalculateTailCallSPDiff - Get the amount the stack pointer has to be
4770/// adjusted to accommodate the arguments for the tailcall.
4771static int CalculateTailCallSPDiff(SelectionDAG& DAG, bool isTailCall,
4772 unsigned ParamSize) {
4773
4774 if (!isTailCall) return 0;
4775
4776 PPCFunctionInfo *FI = DAG.getMachineFunction().getInfo<PPCFunctionInfo>();
4777 unsigned CallerMinReservedArea = FI->getMinReservedArea();
4778 int SPDiff = (int)CallerMinReservedArea - (int)ParamSize;
4779 // Remember only if the new adjustment is bigger.
4780 if (SPDiff < FI->getTailCallSPDelta())
4781 FI->setTailCallSPDelta(SPDiff);
4782
4783 return SPDiff;
4784}
4785
4786static bool isFunctionGlobalAddress(const GlobalValue *CalleeGV);
4787
4788static bool callsShareTOCBase(const Function *Caller,
4789 const GlobalValue *CalleeGV,
4790 const TargetMachine &TM) {
4791 // It does not make sense to call callsShareTOCBase() with a caller that
4792 // is PC Relative since PC Relative callers do not have a TOC.
4793#ifndef NDEBUG
4794 const PPCSubtarget *STICaller = &TM.getSubtarget<PPCSubtarget>(*Caller);
4795 assert(!STICaller->isUsingPCRelativeCalls() &&
4796 "PC Relative callers do not have a TOC and cannot share a TOC Base");
4797#endif
4798
4799 // Callee is either a GlobalAddress or an ExternalSymbol. ExternalSymbols
4800 // don't have enough information to determine if the caller and callee share
4801 // the same TOC base, so we have to pessimistically assume they don't for
4802 // correctness.
4803 if (!CalleeGV)
4804 return false;
4805
4806 // If the callee is preemptable, then the static linker will use a plt-stub
4807 // which saves the toc to the stack, and needs a nop after the call
4808 // instruction to convert to a toc-restore.
4809 if (!TM.shouldAssumeDSOLocal(GV: CalleeGV))
4810 return false;
4811
4812 // Functions with PC Relative enabled may clobber the TOC in the same DSO.
4813 // We may need a TOC restore in the situation where the caller requires a
4814 // valid TOC but the callee is PC Relative and does not.
4815 const Function *F = dyn_cast<Function>(Val: CalleeGV);
4816 const GlobalAlias *Alias = dyn_cast<GlobalAlias>(Val: CalleeGV);
4817
4818 // If we have an Alias we can try to get the function from there.
4819 if (Alias) {
4820 const GlobalObject *GlobalObj = Alias->getAliaseeObject();
4821 F = dyn_cast<Function>(Val: GlobalObj);
4822 }
4823
4824 // If we still have no valid function pointer we do not have enough
4825 // information to determine if the callee uses PC Relative calls so we must
4826 // assume that it does.
4827 if (!F)
4828 return false;
4829
4830 // If the callee uses PC Relative we cannot guarantee that the callee won't
4831 // clobber the TOC of the caller and so we must assume that the two
4832 // functions do not share a TOC base.
4833 const PPCSubtarget *STICallee = &TM.getSubtarget<PPCSubtarget>(F: *F);
4834 if (STICallee->isUsingPCRelativeCalls())
4835 return false;
4836
4837 // If the GV is not a strong definition then we need to assume it can be
4838 // replaced by another function at link time. The function that replaces
4839 // it may not share the same TOC as the caller since the callee may be
4840 // replaced by a PC Relative version of the same function.
4841 if (!CalleeGV->isStrongDefinitionForLinker())
4842 return false;
4843
  // The medium and large code models are expected to provide a TOC large
  // enough to satisfy all data addressing needs of a module with a single
  // TOC.
4847 if (CodeModel::Medium == TM.getCodeModel() ||
4848 CodeModel::Large == TM.getCodeModel())
4849 return true;
4850
4851 // Any explicitly-specified sections and section prefixes must also match.
4852 // Also, if we're using -ffunction-sections, then each function is always in
4853 // a different section (the same is true for COMDAT functions).
4854 if (TM.getFunctionSections() || CalleeGV->hasComdat() ||
4855 Caller->hasComdat() || CalleeGV->getSection() != Caller->getSection())
4856 return false;
4857 if (const auto *F = dyn_cast<Function>(Val: CalleeGV)) {
4858 if (F->getSectionPrefix() != Caller->getSectionPrefix())
4859 return false;
4860 }
4861
4862 return true;
4863}
4864
4865static bool
4866needStackSlotPassParameters(const PPCSubtarget &Subtarget,
4867 const SmallVectorImpl<ISD::OutputArg> &Outs) {
4868 assert(Subtarget.is64BitELFABI());
4869
4870 const unsigned PtrByteSize = 8;
4871 const unsigned LinkageSize = Subtarget.getFrameLowering()->getLinkageSize();
4872
4873 static const MCPhysReg GPR[] = {
4874 PPC::X3, PPC::X4, PPC::X5, PPC::X6,
4875 PPC::X7, PPC::X8, PPC::X9, PPC::X10,
4876 };
4877 static const MCPhysReg VR[] = {
4878 PPC::V2, PPC::V3, PPC::V4, PPC::V5, PPC::V6, PPC::V7, PPC::V8,
4879 PPC::V9, PPC::V10, PPC::V11, PPC::V12, PPC::V13
4880 };
4881
4882 const unsigned NumGPRs = std::size(GPR);
4883 const unsigned NumFPRs = 13;
4884 const unsigned NumVRs = std::size(VR);
4885 const unsigned ParamAreaSize = NumGPRs * PtrByteSize;
4886
4887 unsigned NumBytes = LinkageSize;
4888 unsigned AvailableFPRs = NumFPRs;
4889 unsigned AvailableVRs = NumVRs;
4890
4891 for (const ISD::OutputArg& Param : Outs) {
4892 if (Param.Flags.isNest()) continue;
4893
4894 if (CalculateStackSlotUsed(ArgVT: Param.VT, OrigVT: Param.ArgVT, Flags: Param.Flags, PtrByteSize,
4895 LinkageSize, ParamAreaSize, ArgOffset&: NumBytes,
4896 AvailableFPRs, AvailableVRs))
4897 return true;
4898 }
4899 return false;
4900}
4901
4902static bool hasSameArgumentList(const Function *CallerFn, const CallBase &CB) {
4903 if (CB.arg_size() != CallerFn->arg_size())
4904 return false;
4905
4906 auto CalleeArgIter = CB.arg_begin();
4907 auto CalleeArgEnd = CB.arg_end();
4908 Function::const_arg_iterator CallerArgIter = CallerFn->arg_begin();
4909
4910 for (; CalleeArgIter != CalleeArgEnd; ++CalleeArgIter, ++CallerArgIter) {
4911 const Value* CalleeArg = *CalleeArgIter;
4912 const Value* CallerArg = &(*CallerArgIter);
4913 if (CalleeArg == CallerArg)
4914 continue;
4915
4916 // e.g. @caller([4 x i64] %a, [4 x i64] %b) {
4917 // tail call @callee([4 x i64] undef, [4 x i64] %b)
4918 // }
4919 // 1st argument of callee is undef and has the same type as caller.
4920 if (CalleeArg->getType() == CallerArg->getType() &&
4921 isa<UndefValue>(Val: CalleeArg))
4922 continue;
4923
4924 return false;
4925 }
4926
4927 return true;
4928}
4929
// Returns true if TCO is possible between the caller's and callee's
// calling conventions.
4932static bool
4933areCallingConvEligibleForTCO_64SVR4(CallingConv::ID CallerCC,
4934 CallingConv::ID CalleeCC) {
4935 // Tail calls are possible with fastcc and ccc.
4936 auto isTailCallableCC = [] (CallingConv::ID CC){
4937 return CC == CallingConv::C || CC == CallingConv::Fast;
4938 };
4939 if (!isTailCallableCC(CallerCC) || !isTailCallableCC(CalleeCC))
4940 return false;
4941
  // We can safely tail call both fastcc and ccc callees from a C calling
4943 // convention caller. If the caller is fastcc, we may have less stack space
4944 // than a non-fastcc caller with the same signature so disable tail-calls in
4945 // that case.
4946 return CallerCC == CallingConv::C || CallerCC == CalleeCC;
4947}
4948
4949bool PPCTargetLowering::IsEligibleForTailCallOptimization_64SVR4(
4950 const GlobalValue *CalleeGV, CallingConv::ID CalleeCC,
4951 CallingConv::ID CallerCC, const CallBase *CB, bool isVarArg,
4952 const SmallVectorImpl<ISD::OutputArg> &Outs,
4953 const SmallVectorImpl<ISD::InputArg> &Ins, const Function *CallerFunc,
4954 bool isCalleeExternalSymbol) const {
4955 bool TailCallOpt = getTargetMachine().Options.GuaranteedTailCallOpt;
4956
4957 if (DisableSCO && !TailCallOpt) return false;
4958
4959 // Variadic argument functions are not supported.
4960 if (isVarArg) return false;
4961
4962 // Check that the calling conventions are compatible for tco.
4963 if (!areCallingConvEligibleForTCO_64SVR4(CallerCC, CalleeCC))
4964 return false;
4965
  // A caller with any byval parameter is not supported.
4967 if (any_of(Range: Ins, P: [](const ISD::InputArg &IA) { return IA.Flags.isByVal(); }))
4968 return false;
4969
  // A callee with any byval parameter is not supported either.
  // Note: This is a quick workaround, because in some cases, e.g.
4972 // caller's stack size > callee's stack size, we are still able to apply
4973 // sibling call optimization. For example, gcc is able to do SCO for caller1
4974 // in the following example, but not for caller2.
4975 // struct test {
4976 // long int a;
4977 // char ary[56];
4978 // } gTest;
4979 // __attribute__((noinline)) int callee(struct test v, struct test *b) {
4980 // b->a = v.a;
4981 // return 0;
4982 // }
4983 // void caller1(struct test a, struct test c, struct test *b) {
4984 // callee(gTest, b); }
4985 // void caller2(struct test *b) { callee(gTest, b); }
4986 if (any_of(Range: Outs, P: [](const ISD::OutputArg& OA) { return OA.Flags.isByVal(); }))
4987 return false;
4988
4989 // If callee and caller use different calling conventions, we cannot pass
4990 // parameters on stack since offsets for the parameter area may be different.
4991 if (CallerCC != CalleeCC && needStackSlotPassParameters(Subtarget, Outs))
4992 return false;
4993
4994 // All variants of 64-bit ELF ABIs without PC-Relative addressing require that
4995 // the caller and callee share the same TOC for TCO/SCO. If the caller and
4996 // callee potentially have different TOC bases then we cannot tail call since
4997 // we need to restore the TOC pointer after the call.
4998 // ref: https://bugzilla.mozilla.org/show_bug.cgi?id=973977
4999 // We cannot guarantee this for indirect calls or calls to external functions.
5000 // When PC-Relative addressing is used, the concept of the TOC is no longer
5001 // applicable so this check is not required.
5002 // Check first for indirect calls.
5003 if (!Subtarget.isUsingPCRelativeCalls() &&
5004 !isFunctionGlobalAddress(CalleeGV) && !isCalleeExternalSymbol)
5005 return false;
5006
5007 // Check if we share the TOC base.
5008 if (!Subtarget.isUsingPCRelativeCalls() &&
5009 !callsShareTOCBase(Caller: CallerFunc, CalleeGV, TM: getTargetMachine()))
5010 return false;
5011
5012 // TCO allows altering callee ABI, so we don't have to check further.
5013 if (CalleeCC == CallingConv::Fast && TailCallOpt)
5014 return true;
5015
5016 if (DisableSCO) return false;
5017
  // If the callee uses the same argument list as the caller, we can apply SCO
  // in this case. Otherwise, we need to check whether the callee needs stack
  // slots for passing arguments.
5021 // PC Relative tail calls may not have a CallBase.
5022 // If there is no CallBase we cannot verify if we have the same argument
5023 // list so assume that we don't have the same argument list.
5024 if (CB && !hasSameArgumentList(CallerFn: CallerFunc, CB: *CB) &&
5025 needStackSlotPassParameters(Subtarget, Outs))
5026 return false;
5027 else if (!CB && needStackSlotPassParameters(Subtarget, Outs))
5028 return false;
5029
5030 return true;
5031}
5032
5033/// IsEligibleForTailCallOptimization - Check whether the call is eligible
5034/// for tail call optimization. Targets which want to do tail call
5035/// optimization should implement this function.
5036bool PPCTargetLowering::IsEligibleForTailCallOptimization(
5037 const GlobalValue *CalleeGV, CallingConv::ID CalleeCC,
5038 CallingConv::ID CallerCC, bool isVarArg,
5039 const SmallVectorImpl<ISD::InputArg> &Ins) const {
5040 if (!getTargetMachine().Options.GuaranteedTailCallOpt)
5041 return false;
5042
5043 // Variable argument functions are not supported.
5044 if (isVarArg)
5045 return false;
5046
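  // Under GuaranteedTailCallOpt, tail calls are only formed when both the
  // caller and the callee use the fastcc calling convention.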
5047 if (CalleeCC == CallingConv::Fast && CallerCC == CalleeCC) {
5048 // Functions containing by val parameters are not supported.
5049 if (any_of(Range: Ins, P: [](const ISD::InputArg &IA) { return IA.Flags.isByVal(); }))
5050 return false;
5051
5052 // Non-PIC/GOT tail calls are supported.
5053 if (getTargetMachine().getRelocationModel() != Reloc::PIC_)
5054 return true;
5055
5056 // At the moment we can only do local tail calls (in same module, hidden
5057 // or protected) if we are generating PIC.
5058 if (CalleeGV)
5059 return CalleeGV->hasHiddenVisibility() ||
5060 CalleeGV->hasProtectedVisibility();
5061 }
5062
5063 return false;
5064}
5065
/// isBLACompatibleAddress - Return the immediate to use if the specified
/// 32-bit value is representable in the immediate field of a BxA instruction.
5068static SDNode *isBLACompatibleAddress(SDValue Op, SelectionDAG &DAG) {
5069 ConstantSDNode *C = dyn_cast<ConstantSDNode>(Val&: Op);
5070 if (!C) return nullptr;
5071
5072 int Addr = C->getZExtValue();
5073 if ((Addr & 3) != 0 || // Low 2 bits are implicitly zero.
5074 SignExtend32<26>(X: Addr) != Addr)
5075 return nullptr; // Top 6 bits have to be sext of immediate.
5076
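  // The BLA immediate encodes the word-aligned absolute address shifted right
  // by two. For example, a (hypothetical) callee address of 0x100 passes both
  // checks above and is returned as the immediate 0x40.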
5077 return DAG
5078 .getSignedConstant(
5079 Val: (int)C->getZExtValue() >> 2, DL: SDLoc(Op),
5080 VT: DAG.getTargetLoweringInfo().getPointerTy(DL: DAG.getDataLayout()))
5081 .getNode();
5082}
5083
5084namespace {
5085
5086struct TailCallArgumentInfo {
5087 SDValue Arg;
5088 SDValue FrameIdxOp;
5089 int FrameIdx = 0;
5090
5091 TailCallArgumentInfo() = default;
5092};
5093
5094} // end anonymous namespace
5095
5096/// StoreTailCallArgumentsToStackSlot - Stores arguments to their stack slot.
5097static void StoreTailCallArgumentsToStackSlot(
5098 SelectionDAG &DAG, SDValue Chain,
5099 const SmallVectorImpl<TailCallArgumentInfo> &TailCallArgs,
5100 SmallVectorImpl<SDValue> &MemOpChains, const SDLoc &dl) {
5101 for (unsigned i = 0, e = TailCallArgs.size(); i != e; ++i) {
5102 SDValue Arg = TailCallArgs[i].Arg;
5103 SDValue FIN = TailCallArgs[i].FrameIdxOp;
5104 int FI = TailCallArgs[i].FrameIdx;
    // Store relative to the frame pointer.
5106 MemOpChains.push_back(Elt: DAG.getStore(
5107 Chain, dl, Val: Arg, Ptr: FIN,
5108 PtrInfo: MachinePointerInfo::getFixedStack(MF&: DAG.getMachineFunction(), FI)));
5109 }
5110}
5111
5112/// EmitTailCallStoreFPAndRetAddr - Move the frame pointer and return address to
5113/// the appropriate stack slot for the tail call optimized function call.
5114static SDValue EmitTailCallStoreFPAndRetAddr(SelectionDAG &DAG, SDValue Chain,
5115 SDValue OldRetAddr, SDValue OldFP,
5116 int SPDiff, const SDLoc &dl) {
5117 if (SPDiff) {
5118 // Calculate the new stack slot for the return address.
5119 MachineFunction &MF = DAG.getMachineFunction();
5120 const PPCSubtarget &Subtarget = MF.getSubtarget<PPCSubtarget>();
5121 const PPCFrameLowering *FL = Subtarget.getFrameLowering();
5122 int SlotSize = Subtarget.isPPC64() ? 8 : 4;
5123 int NewRetAddrLoc = SPDiff + FL->getReturnSaveOffset();
5124 int NewRetAddr = MF.getFrameInfo().CreateFixedObject(Size: SlotSize,
5125 SPOffset: NewRetAddrLoc, IsImmutable: true);
5126 SDValue NewRetAddrFrIdx =
5127 DAG.getFrameIndex(FI: NewRetAddr, VT: Subtarget.getScalarIntVT());
5128 Chain = DAG.getStore(Chain, dl, Val: OldRetAddr, Ptr: NewRetAddrFrIdx,
5129 PtrInfo: MachinePointerInfo::getFixedStack(MF, FI: NewRetAddr));
5130 }
5131 return Chain;
5132}
5133
/// CalculateTailCallArgDest - Remember the argument for later processing.
/// Calculate the position of the argument.
5136static void CalculateTailCallArgDest(
5137 SelectionDAG &DAG, MachineFunction &MF, bool IsPPC64, SDValue Arg,
5138 int SPDiff, unsigned ArgOffset,
5139 SmallVectorImpl<TailCallArgumentInfo> &TailCallArguments) {
5140 int Offset = ArgOffset + SPDiff;
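  // Round the argument size up to a whole number of bytes.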
5141 uint32_t OpSize = (Arg.getValueSizeInBits() + 7) / 8;
5142 int FI = MF.getFrameInfo().CreateFixedObject(Size: OpSize, SPOffset: Offset, IsImmutable: true);
5143 EVT VT = IsPPC64 ? MVT::i64 : MVT::i32;
5144 SDValue FIN = DAG.getFrameIndex(FI, VT);
5145 TailCallArgumentInfo Info;
5146 Info.Arg = Arg;
5147 Info.FrameIdxOp = FIN;
5148 Info.FrameIdx = FI;
5149 TailCallArguments.push_back(Elt: Info);
5150}
5151
/// EmitTailCallLoadFPAndRetAddr - Emit loads from the frame pointer and return
/// address stack slots. Returns the chain as result and the loaded values in
/// LROpOut/FPOpOut. Used when tail calling.
5155SDValue PPCTargetLowering::EmitTailCallLoadFPAndRetAddr(
5156 SelectionDAG &DAG, int SPDiff, SDValue Chain, SDValue &LROpOut,
5157 SDValue &FPOpOut, const SDLoc &dl) const {
5158 if (SPDiff) {
5159 // Load the LR and FP stack slot for later adjusting.
5160 LROpOut = getReturnAddrFrameIndex(DAG);
5161 LROpOut = DAG.getLoad(VT: Subtarget.getScalarIntVT(), dl, Chain, Ptr: LROpOut,
5162 PtrInfo: MachinePointerInfo());
5163 Chain = SDValue(LROpOut.getNode(), 1);
5164 }
5165 return Chain;
5166}
5167
5168/// CreateCopyOfByValArgument - Make a copy of an aggregate at address specified
5169/// by "Src" to address "Dst" of size "Size". Alignment information is
5170/// specified by the specific parameter attribute. The copy will be passed as
5171/// a byval function parameter.
5172/// Sometimes what we are copying is the end of a larger object, the part that
5173/// does not fit in registers.
5174static SDValue CreateCopyOfByValArgument(SDValue Src, SDValue Dst,
5175 SDValue Chain, ISD::ArgFlagsTy Flags,
5176 SelectionDAG &DAG, const SDLoc &dl) {
5177 SDValue SizeNode = DAG.getConstant(Val: Flags.getByValSize(), DL: dl, VT: MVT::i32);
5178 return DAG.getMemcpy(
5179 Chain, dl, Dst, Src, Size: SizeNode, Alignment: Flags.getNonZeroByValAlign(), isVol: false, AlwaysInline: false,
5180 /*CI=*/nullptr, OverrideTailCall: std::nullopt, DstPtrInfo: MachinePointerInfo(), SrcPtrInfo: MachinePointerInfo());
5181}
5182
5183/// LowerMemOpCallTo - Store the argument to the stack or remember it in case of
5184/// tail calls.
5185static void LowerMemOpCallTo(
5186 SelectionDAG &DAG, MachineFunction &MF, SDValue Chain, SDValue Arg,
5187 SDValue PtrOff, int SPDiff, unsigned ArgOffset, bool isPPC64,
5188 bool isTailCall, bool isVector, SmallVectorImpl<SDValue> &MemOpChains,
5189 SmallVectorImpl<TailCallArgumentInfo> &TailCallArguments, const SDLoc &dl) {
5190 EVT PtrVT = DAG.getTargetLoweringInfo().getPointerTy(DL: DAG.getDataLayout());
5191 if (!isTailCall) {
5192 if (isVector) {
5193 SDValue StackPtr;
5194 if (isPPC64)
5195 StackPtr = DAG.getRegister(Reg: PPC::X1, VT: MVT::i64);
5196 else
5197 StackPtr = DAG.getRegister(Reg: PPC::R1, VT: MVT::i32);
5198 PtrOff = DAG.getNode(Opcode: ISD::ADD, DL: dl, VT: PtrVT, N1: StackPtr,
5199 N2: DAG.getConstant(Val: ArgOffset, DL: dl, VT: PtrVT));
5200 }
5201 MemOpChains.push_back(
5202 Elt: DAG.getStore(Chain, dl, Val: Arg, Ptr: PtrOff, PtrInfo: MachinePointerInfo()));
5203 // Calculate and remember argument location.
5204 } else
5205 CalculateTailCallArgDest(DAG, MF, IsPPC64: isPPC64, Arg, SPDiff, ArgOffset,
5206 TailCallArguments);
5207}
5208
5209static void
5210PrepareTailCall(SelectionDAG &DAG, SDValue &InGlue, SDValue &Chain,
5211 const SDLoc &dl, int SPDiff, unsigned NumBytes, SDValue LROp,
5212 SDValue FPOp,
5213 SmallVectorImpl<TailCallArgumentInfo> &TailCallArguments) {
5214 // Emit a sequence of copyto/copyfrom virtual registers for arguments that
5215 // might overwrite each other in case of tail call optimization.
5216 SmallVector<SDValue, 8> MemOpChains2;
5217 // Do not flag preceding copytoreg stuff together with the following stuff.
5218 InGlue = SDValue();
5219 StoreTailCallArgumentsToStackSlot(DAG, Chain, TailCallArgs: TailCallArguments,
5220 MemOpChains&: MemOpChains2, dl);
5221 if (!MemOpChains2.empty())
5222 Chain = DAG.getNode(Opcode: ISD::TokenFactor, DL: dl, VT: MVT::Other, Ops: MemOpChains2);
5223
5224 // Store the return address to the appropriate stack slot.
5225 Chain = EmitTailCallStoreFPAndRetAddr(DAG, Chain, OldRetAddr: LROp, OldFP: FPOp, SPDiff, dl);
5226
5227 // Emit callseq_end just before tailcall node.
5228 Chain = DAG.getCALLSEQ_END(Chain, Size1: NumBytes, Size2: 0, Glue: InGlue, DL: dl);
5229 InGlue = Chain.getValue(R: 1);
5230}
5231
5232// Is this global address that of a function that can be called by name? (as
5233// opposed to something that must hold a descriptor for an indirect call).
5234static bool isFunctionGlobalAddress(const GlobalValue *GV) {
5235 if (GV) {
5236 if (GV->isThreadLocal())
5237 return false;
5238
5239 return GV->getValueType()->isFunctionTy();
5240 }
5241
5242 return false;
5243}
5244
5245SDValue PPCTargetLowering::LowerCallResult(
5246 SDValue Chain, SDValue InGlue, CallingConv::ID CallConv, bool isVarArg,
5247 const SmallVectorImpl<ISD::InputArg> &Ins, const SDLoc &dl,
5248 SelectionDAG &DAG, SmallVectorImpl<SDValue> &InVals) const {
5249 SmallVector<CCValAssign, 16> RVLocs;
5250 CCState CCRetInfo(CallConv, isVarArg, DAG.getMachineFunction(), RVLocs,
5251 *DAG.getContext());
5252
5253 CCRetInfo.AnalyzeCallResult(
5254 Ins, Fn: (Subtarget.isSVR4ABI() && CallConv == CallingConv::Cold)
5255 ? RetCC_PPC_Cold
5256 : RetCC_PPC);
5257
5258 // Copy all of the result registers out of their specified physreg.
5259 for (unsigned i = 0, e = RVLocs.size(); i != e; ++i) {
5260 CCValAssign &VA = RVLocs[i];
5261 assert(VA.isRegLoc() && "Can only return in registers!");
5262
5263 SDValue Val;
5264
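    // SPE returns f64 values in a pair of i32 GPRs. Copy out both halves,
    // swap them on big-endian targets, and reassemble the f64 with
    // BUILD_SPE64.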
5265 if (Subtarget.hasSPE() && VA.getLocVT() == MVT::f64) {
5266 SDValue Lo = DAG.getCopyFromReg(Chain, dl, Reg: VA.getLocReg(), VT: MVT::i32,
5267 Glue: InGlue);
5268 Chain = Lo.getValue(R: 1);
5269 InGlue = Lo.getValue(R: 2);
5270 VA = RVLocs[++i]; // skip ahead to next loc
5271 SDValue Hi = DAG.getCopyFromReg(Chain, dl, Reg: VA.getLocReg(), VT: MVT::i32,
5272 Glue: InGlue);
5273 Chain = Hi.getValue(R: 1);
5274 InGlue = Hi.getValue(R: 2);
5275 if (!Subtarget.isLittleEndian())
5276 std::swap (a&: Lo, b&: Hi);
5277 Val = DAG.getNode(Opcode: PPCISD::BUILD_SPE64, DL: dl, VT: MVT::f64, N1: Lo, N2: Hi);
5278 } else {
5279 Val = DAG.getCopyFromReg(Chain, dl,
5280 Reg: VA.getLocReg(), VT: VA.getLocVT(), Glue: InGlue);
5281 Chain = Val.getValue(R: 1);
5282 InGlue = Val.getValue(R: 2);
5283 }
5284
5285 switch (VA.getLocInfo()) {
5286 default: llvm_unreachable("Unknown loc info!");
5287 case CCValAssign::Full: break;
5288 case CCValAssign::AExt:
5289 Val = DAG.getNode(Opcode: ISD::TRUNCATE, DL: dl, VT: VA.getValVT(), Operand: Val);
5290 break;
5291 case CCValAssign::ZExt:
5292 Val = DAG.getNode(Opcode: ISD::AssertZext, DL: dl, VT: VA.getLocVT(), N1: Val,
5293 N2: DAG.getValueType(VA.getValVT()));
5294 Val = DAG.getNode(Opcode: ISD::TRUNCATE, DL: dl, VT: VA.getValVT(), Operand: Val);
5295 break;
5296 case CCValAssign::SExt:
5297 Val = DAG.getNode(Opcode: ISD::AssertSext, DL: dl, VT: VA.getLocVT(), N1: Val,
5298 N2: DAG.getValueType(VA.getValVT()));
5299 Val = DAG.getNode(Opcode: ISD::TRUNCATE, DL: dl, VT: VA.getValVT(), Operand: Val);
5300 break;
5301 }
5302
5303 InVals.push_back(Elt: Val);
5304 }
5305
5306 return Chain;
5307}
5308
5309static bool isIndirectCall(const SDValue &Callee, SelectionDAG &DAG,
5310 const PPCSubtarget &Subtarget, bool isPatchPoint) {
5311 auto *G = dyn_cast<GlobalAddressSDNode>(Val: Callee);
5312 const GlobalValue *GV = G ? G->getGlobal() : nullptr;
5313
5314 // PatchPoint calls are not indirect.
5315 if (isPatchPoint)
5316 return false;
5317
5318 if (isFunctionGlobalAddress(GV) || isa<ExternalSymbolSDNode>(Val: Callee))
5319 return false;
5320
  // Darwin and 32-bit ELF can use a BLA. The descriptor-based ABIs cannot,
  // because the immediate function pointer points to a descriptor instead of
  // a function entry point. The ELFv2 ABI cannot use a BLA because the
  // function pointer immediate points to the global entry point, while the
  // BLA would need to jump to the local entry point (see rL211174).
5326 if (!Subtarget.usesFunctionDescriptors() && !Subtarget.isELFv2ABI() &&
5327 isBLACompatibleAddress(Op: Callee, DAG))
5328 return false;
5329
5330 return true;
5331}
5332
5333// AIX and 64-bit ELF ABIs w/o PCRel require a TOC save/restore around calls.
5334static inline bool isTOCSaveRestoreRequired(const PPCSubtarget &Subtarget) {
5335 return Subtarget.isAIXABI() ||
5336 (Subtarget.is64BitELFABI() && !Subtarget.isUsingPCRelativeCalls());
5337}
5338
5339static unsigned getCallOpcode(PPCTargetLowering::CallFlags CFlags,
5340 const Function &Caller, const SDValue &Callee,
5341 const PPCSubtarget &Subtarget,
5342 const TargetMachine &TM,
5343 bool IsStrictFPCall = false) {
5344 if (CFlags.IsTailCall)
5345 return PPCISD::TC_RETURN;
5346
5347 unsigned RetOpc = 0;
5348 // This is a call through a function pointer.
5349 if (CFlags.IsIndirect) {
    // AIX and the 64-bit ELF ABIs need to maintain the TOC pointer across
    // indirect calls. The save of the caller's TOC pointer to the stack will
    // be inserted into the DAG as part of call lowering. The restore of the
    // TOC pointer is modeled by using a pseudo instruction for the call opcode
    // that represents the two-instruction sequence of an indirect branch and
    // link, immediately followed by a load of the TOC pointer from the stack
    // save slot into gpr2. For the 64-bit ELFv2 ABI with PCRel, do not restore
    // the TOC as it is not saved or used.
5358 RetOpc = isTOCSaveRestoreRequired(Subtarget) ? PPCISD::BCTRL_LOAD_TOC
5359 : PPCISD::BCTRL;
5360 } else if (Subtarget.isUsingPCRelativeCalls()) {
5361 assert(Subtarget.is64BitELFABI() && "PC Relative is only on ELF ABI.");
5362 RetOpc = PPCISD::CALL_NOTOC;
5363 } else if (Subtarget.isAIXABI() || Subtarget.is64BitELFABI()) {
    // The ABIs that maintain a TOC pointer across calls need a nop immediately
    // following the call instruction if the caller and callee may have
    // different TOC bases. If the linker determines at link time that the
    // calls may not share a TOC base, the call is redirected to a trampoline
    // inserted by the linker. The trampoline will (among other things) save
    // the caller's TOC pointer at an ABI-designated offset in the linkage
    // area, and the linker will rewrite the nop to be a load of the TOC
    // pointer from the linkage area into gpr2.
5372 auto *G = dyn_cast<GlobalAddressSDNode>(Val: Callee);
5373 const GlobalValue *GV = G ? G->getGlobal() : nullptr;
5374 RetOpc =
5375 callsShareTOCBase(Caller: &Caller, CalleeGV: GV, TM) ? PPCISD::CALL : PPCISD::CALL_NOP;
5376 } else
5377 RetOpc = PPCISD::CALL;
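  // Strict FP calls are lowered to the *_RM variants of the call opcodes,
  // which additionally model a dependency on the floating-point rounding mode.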
5378 if (IsStrictFPCall) {
5379 switch (RetOpc) {
5380 default:
5381 llvm_unreachable("Unknown call opcode");
5382 case PPCISD::BCTRL_LOAD_TOC:
5383 RetOpc = PPCISD::BCTRL_LOAD_TOC_RM;
5384 break;
5385 case PPCISD::BCTRL:
5386 RetOpc = PPCISD::BCTRL_RM;
5387 break;
5388 case PPCISD::CALL_NOTOC:
5389 RetOpc = PPCISD::CALL_NOTOC_RM;
5390 break;
5391 case PPCISD::CALL:
5392 RetOpc = PPCISD::CALL_RM;
5393 break;
5394 case PPCISD::CALL_NOP:
5395 RetOpc = PPCISD::CALL_NOP_RM;
5396 break;
5397 }
5398 }
5399 return RetOpc;
5400}
5401
5402static SDValue transformCallee(const SDValue &Callee, SelectionDAG &DAG,
5403 const SDLoc &dl, const PPCSubtarget &Subtarget) {
5404 if (!Subtarget.usesFunctionDescriptors() && !Subtarget.isELFv2ABI())
5405 if (SDNode *Dest = isBLACompatibleAddress(Op: Callee, DAG))
5406 return SDValue(Dest, 0);
5407
5408 // Returns true if the callee is local, and false otherwise.
5409 auto isLocalCallee = [&]() {
5410 const GlobalAddressSDNode *G = dyn_cast<GlobalAddressSDNode>(Val: Callee);
5411 const GlobalValue *GV = G ? G->getGlobal() : nullptr;
5412
5413 return DAG.getTarget().shouldAssumeDSOLocal(GV) &&
5414 !isa_and_nonnull<GlobalIFunc>(Val: GV);
5415 };
5416
5417 // The PLT is only used in 32-bit ELF PIC mode. Attempting to use the PLT in
5418 // a static relocation model causes some versions of GNU LD (2.17.50, at
5419 // least) to force BSS-PLT, instead of secure-PLT, even if all objects are
5420 // built with secure-PLT.
5421 bool UsePlt =
5422 Subtarget.is32BitELFABI() && !isLocalCallee() &&
5423 Subtarget.getTargetMachine().getRelocationModel() == Reloc::PIC_;
5424
5425 const auto getAIXFuncEntryPointSymbolSDNode = [&](const GlobalValue *GV) {
5426 const TargetMachine &TM = Subtarget.getTargetMachine();
5427 const TargetLoweringObjectFile *TLOF = TM.getObjFileLowering();
5428 auto *S =
5429 static_cast<MCSymbolXCOFF *>(TLOF->getFunctionEntryPointSymbol(Func: GV, TM));
5430
5431 MVT PtrVT = DAG.getTargetLoweringInfo().getPointerTy(DL: DAG.getDataLayout());
5432 return DAG.getMCSymbol(Sym: S, VT: PtrVT);
5433 };
5434
5435 auto *G = dyn_cast<GlobalAddressSDNode>(Val: Callee);
5436 const GlobalValue *GV = G ? G->getGlobal() : nullptr;
5437 if (isFunctionGlobalAddress(GV)) {
5438 const GlobalValue *GV = cast<GlobalAddressSDNode>(Val: Callee)->getGlobal();
5439
5440 if (Subtarget.isAIXABI()) {
5441 assert(!isa<GlobalIFunc>(GV) && "IFunc is not supported on AIX.");
5442 return getAIXFuncEntryPointSymbolSDNode(GV);
5443 }
5444 return DAG.getTargetGlobalAddress(GV, DL: dl, VT: Callee.getValueType(), offset: 0,
5445 TargetFlags: UsePlt ? PPCII::MO_PLT : 0);
5446 }
5447
5448 if (ExternalSymbolSDNode *S = dyn_cast<ExternalSymbolSDNode>(Val: Callee)) {
5449 const char *SymName = S->getSymbol();
5450 if (Subtarget.isAIXABI()) {
5451 // If there exists a user-declared function whose name is the same as the
5452 // ExternalSymbol's, then we pick up the user-declared version.
5453 const Module *Mod = DAG.getMachineFunction().getFunction().getParent();
5454 if (const Function *F =
5455 dyn_cast_or_null<Function>(Val: Mod->getNamedValue(Name: SymName)))
5456 return getAIXFuncEntryPointSymbolSDNode(F);
5457
5458 // On AIX, direct function calls reference the symbol for the function's
5459 // entry point, which is named by prepending a "." before the function's
5460 // C-linkage name. A Qualname is returned here because an external
5461 // function entry point is a csect with XTY_ER property.
5462 const auto getExternalFunctionEntryPointSymbol = [&](StringRef SymName) {
5463 auto &Context = DAG.getMachineFunction().getContext();
5464 MCSectionXCOFF *Sec = Context.getXCOFFSection(
5465 Section: (Twine(".") + Twine(SymName)).str(), K: SectionKind::getMetadata(),
5466 CsectProp: XCOFF::CsectProperties(XCOFF::XMC_PR, XCOFF::XTY_ER));
5467 return Sec->getQualNameSymbol();
5468 };
5469
5470 SymName = getExternalFunctionEntryPointSymbol(SymName)->getName().data();
5471 }
5472 return DAG.getTargetExternalSymbol(Sym: SymName, VT: Callee.getValueType(),
5473 TargetFlags: UsePlt ? PPCII::MO_PLT : 0);
5474 }
5475
5476 // No transformation needed.
5477 assert(Callee.getNode() && "What no callee?");
5478 return Callee;
5479}
5480
5481static SDValue getOutputChainFromCallSeq(SDValue CallSeqStart) {
5482 assert(CallSeqStart.getOpcode() == ISD::CALLSEQ_START &&
5483 "Expected a CALLSEQ_STARTSDNode.");
5484
5485 // The last operand is the chain, except when the node has glue. If the node
5486 // has glue, then the last operand is the glue, and the chain is the second
5487 // last operand.
5488 SDValue LastValue = CallSeqStart.getValue(R: CallSeqStart->getNumValues() - 1);
5489 if (LastValue.getValueType() != MVT::Glue)
5490 return LastValue;
5491
5492 return CallSeqStart.getValue(R: CallSeqStart->getNumValues() - 2);
5493}
5494
// Creates the node that moves a function's address into the count register
// to prepare for an indirect call instruction.
5497static void prepareIndirectCall(SelectionDAG &DAG, SDValue &Callee,
5498 SDValue &Glue, SDValue &Chain,
5499 const SDLoc &dl) {
5500 SDValue MTCTROps[] = {Chain, Callee, Glue};
5501 EVT ReturnTypes[] = {MVT::Other, MVT::Glue};
5502 Chain = DAG.getNode(Opcode: PPCISD::MTCTR, DL: dl, ResultTys: ReturnTypes,
5503 Ops: ArrayRef(MTCTROps, Glue.getNode() ? 3 : 2));
5504 // The glue is the second value produced.
5505 Glue = Chain.getValue(R: 1);
5506}
5507
5508static void prepareDescriptorIndirectCall(SelectionDAG &DAG, SDValue &Callee,
5509 SDValue &Glue, SDValue &Chain,
5510 SDValue CallSeqStart,
5511 const CallBase *CB, const SDLoc &dl,
5512 bool hasNest,
5513 const PPCSubtarget &Subtarget) {
5514 // Function pointers in the 64-bit SVR4 ABI do not point to the function
5515 // entry point, but to the function descriptor (the function entry point
5516 // address is part of the function descriptor though).
5517 // The function descriptor is a three doubleword structure with the
5518 // following fields: function entry point, TOC base address and
5519 // environment pointer.
5520 // Thus for a call through a function pointer, the following actions need
5521 // to be performed:
5522 // 1. Save the TOC of the caller in the TOC save area of its stack
5523 // frame (this is done in LowerCall_Darwin() or LowerCall_64SVR4()).
5524 // 2. Load the address of the function entry point from the function
5525 // descriptor.
5526 // 3. Load the TOC of the callee from the function descriptor into r2.
5527 // 4. Load the environment pointer from the function descriptor into
5528 // r11.
5529 // 5. Branch to the function entry point address.
5530 // 6. On return of the callee, the TOC of the caller needs to be
5531 // restored (this is done in FinishCall()).
5532 //
5533 // The loads are scheduled at the beginning of the call sequence, and the
5534 // register copies are flagged together to ensure that no other
5535 // operations can be scheduled in between. E.g. without flagging the
5536 // copies together, a TOC access in the caller could be scheduled between
5537 // the assignment of the callee TOC and the branch to the callee, which leads
5538 // to incorrect code.
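  // For the 64-bit descriptor layout this corresponds to loads at offsets 0
  // (entry point), 8 (TOC anchor) and 16 (environment pointer); the exact
  // offsets are queried from the subtarget below.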
5539
5540 // Start by loading the function address from the descriptor.
5541 SDValue LDChain = getOutputChainFromCallSeq(CallSeqStart);
5542 auto MMOFlags = Subtarget.hasInvariantFunctionDescriptors()
5543 ? (MachineMemOperand::MODereferenceable |
5544 MachineMemOperand::MOInvariant)
5545 : MachineMemOperand::MONone;
5546
5547 MachinePointerInfo MPI(CB ? CB->getCalledOperand() : nullptr);
5548
5549 // Registers used in building the DAG.
5550 const MCRegister EnvPtrReg = Subtarget.getEnvironmentPointerRegister();
5551 const MCRegister TOCReg = Subtarget.getTOCPointerRegister();
5552
5553 // Offsets of descriptor members.
5554 const unsigned TOCAnchorOffset = Subtarget.descriptorTOCAnchorOffset();
5555 const unsigned EnvPtrOffset = Subtarget.descriptorEnvironmentPointerOffset();
5556
5557 const MVT RegVT = Subtarget.getScalarIntVT();
5558 const Align Alignment = Subtarget.isPPC64() ? Align(8) : Align(4);
5559
5560 // One load for the functions entry point address.
5561 SDValue LoadFuncPtr = DAG.getLoad(VT: RegVT, dl, Chain: LDChain, Ptr: Callee, PtrInfo: MPI,
5562 Alignment, MMOFlags);
5563
5564 // One for loading the TOC anchor for the module that contains the called
5565 // function.
5566 SDValue TOCOff = DAG.getIntPtrConstant(Val: TOCAnchorOffset, DL: dl);
5567 SDValue AddTOC = DAG.getNode(Opcode: ISD::ADD, DL: dl, VT: RegVT, N1: Callee, N2: TOCOff);
5568 SDValue TOCPtr =
5569 DAG.getLoad(VT: RegVT, dl, Chain: LDChain, Ptr: AddTOC,
5570 PtrInfo: MPI.getWithOffset(O: TOCAnchorOffset), Alignment, MMOFlags);
5571
5572 // One for loading the environment pointer.
5573 SDValue PtrOff = DAG.getIntPtrConstant(Val: EnvPtrOffset, DL: dl);
5574 SDValue AddPtr = DAG.getNode(Opcode: ISD::ADD, DL: dl, VT: RegVT, N1: Callee, N2: PtrOff);
5575 SDValue LoadEnvPtr =
5576 DAG.getLoad(VT: RegVT, dl, Chain: LDChain, Ptr: AddPtr,
5577 PtrInfo: MPI.getWithOffset(O: EnvPtrOffset), Alignment, MMOFlags);
5578
5580 // Then copy the newly loaded TOC anchor to the TOC pointer.
5581 SDValue TOCVal = DAG.getCopyToReg(Chain, dl, Reg: TOCReg, N: TOCPtr, Glue);
5582 Chain = TOCVal.getValue(R: 0);
5583 Glue = TOCVal.getValue(R: 1);
5584
5585 // If the function call has an explicit 'nest' parameter, it takes the
5586 // place of the environment pointer.
5587 assert((!hasNest || !Subtarget.isAIXABI()) &&
5588 "Nest parameter is not supported on AIX.");
5589 if (!hasNest) {
5590 SDValue EnvVal = DAG.getCopyToReg(Chain, dl, Reg: EnvPtrReg, N: LoadEnvPtr, Glue);
5591 Chain = EnvVal.getValue(R: 0);
5592 Glue = EnvVal.getValue(R: 1);
5593 }
5594
5595 // The rest of the indirect call sequence is the same as the non-descriptor
5596 // DAG.
5597 prepareIndirectCall(DAG, Callee&: LoadFuncPtr, Glue, Chain, dl);
5598}
5599
5600static void
5601buildCallOperands(SmallVectorImpl<SDValue> &Ops,
5602 PPCTargetLowering::CallFlags CFlags, const SDLoc &dl,
5603 SelectionDAG &DAG,
5604 SmallVector<std::pair<unsigned, SDValue>, 8> &RegsToPass,
5605 SDValue Glue, SDValue Chain, SDValue &Callee, int SPDiff,
5606 const PPCSubtarget &Subtarget) {
5607 const bool IsPPC64 = Subtarget.isPPC64();
5608 // MVT for a general purpose register.
5609 const MVT RegVT = Subtarget.getScalarIntVT();
5610
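  // Operands are appended in the following order: the chain; the callee (or,
  // for indirect calls on TOC-based ABIs, the TOC-restore address plus the
  // optional environment-pointer and CTR registers); the SPDiff for tail
  // calls; the argument registers; the TOC pointer and CR1EQ uses where
  // required; the register mask; and finally the glue, if any.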
5611 // First operand is always the chain.
5612 Ops.push_back(Elt: Chain);
5613
  // If it's a direct call, pass the callee as the second operand.
5615 if (!CFlags.IsIndirect)
5616 Ops.push_back(Elt: Callee);
5617 else {
5618 assert(!CFlags.IsPatchPoint && "Patch point calls are not indirect.");
5619
5620 // For the TOC based ABIs, we have saved the TOC pointer to the linkage area
5621 // on the stack (this would have been done in `LowerCall_64SVR4` or
5622 // `LowerCall_AIX`). The call instruction is a pseudo instruction that
5623 // represents both the indirect branch and a load that restores the TOC
5624 // pointer from the linkage area. The operand for the TOC restore is an add
5625 // of the TOC save offset to the stack pointer. This must be the second
5626 // operand: after the chain input but before any other variadic arguments.
5627 // For 64-bit ELFv2 ABI with PCRel, do not restore the TOC as it is not
5628 // saved or used.
5629 if (isTOCSaveRestoreRequired(Subtarget)) {
5630 const MCRegister StackPtrReg = Subtarget.getStackPointerRegister();
5631
5632 SDValue StackPtr = DAG.getRegister(Reg: StackPtrReg, VT: RegVT);
5633 unsigned TOCSaveOffset = Subtarget.getFrameLowering()->getTOCSaveOffset();
5634 SDValue TOCOff = DAG.getIntPtrConstant(Val: TOCSaveOffset, DL: dl);
5635 SDValue AddTOC = DAG.getNode(Opcode: ISD::ADD, DL: dl, VT: RegVT, N1: StackPtr, N2: TOCOff);
5636 Ops.push_back(Elt: AddTOC);
5637 }
5638
5639 // Add the register used for the environment pointer.
5640 if (Subtarget.usesFunctionDescriptors() && !CFlags.HasNest)
5641 Ops.push_back(Elt: DAG.getRegister(Reg: Subtarget.getEnvironmentPointerRegister(),
5642 VT: RegVT));
5643
5645 // Add CTR register as callee so a bctr can be emitted later.
5646 if (CFlags.IsTailCall)
5647 Ops.push_back(Elt: DAG.getRegister(Reg: IsPPC64 ? PPC::CTR8 : PPC::CTR, VT: RegVT));
5648 }
5649
5650 // If this is a tail call add stack pointer delta.
5651 if (CFlags.IsTailCall)
5652 Ops.push_back(Elt: DAG.getConstant(Val: SPDiff, DL: dl, VT: MVT::i32));
5653
5654 // Add argument registers to the end of the list so that they are known live
5655 // into the call.
5656 for (const auto &[Reg, N] : RegsToPass)
5657 Ops.push_back(Elt: DAG.getRegister(Reg, VT: N.getValueType()));
5658
5659 // We cannot add R2/X2 as an operand here for PATCHPOINT, because there is
5660 // no way to mark dependencies as implicit here.
5661 // We will add the R2/X2 dependency in EmitInstrWithCustomInserter.
5662 if ((Subtarget.is64BitELFABI() || Subtarget.isAIXABI()) &&
5663 !CFlags.IsPatchPoint && !Subtarget.isUsingPCRelativeCalls())
5664 Ops.push_back(Elt: DAG.getRegister(Reg: Subtarget.getTOCPointerRegister(), VT: RegVT));
5665
5666 // Add implicit use of CR bit 6 for 32-bit SVR4 vararg calls
5667 if (CFlags.IsVarArg && Subtarget.is32BitELFABI())
5668 Ops.push_back(Elt: DAG.getRegister(Reg: PPC::CR1EQ, VT: MVT::i32));
5669
5670 // Add a register mask operand representing the call-preserved registers.
5671 const TargetRegisterInfo *TRI = Subtarget.getRegisterInfo();
5672 const uint32_t *Mask =
5673 TRI->getCallPreservedMask(MF: DAG.getMachineFunction(), CFlags.CallConv);
5674 assert(Mask && "Missing call preserved mask for calling convention");
5675 Ops.push_back(Elt: DAG.getRegisterMask(RegMask: Mask));
5676
5677 // If the glue is valid, it is the last operand.
5678 if (Glue.getNode())
5679 Ops.push_back(Elt: Glue);
5680}
5681
5682SDValue PPCTargetLowering::FinishCall(
5683 CallFlags CFlags, const SDLoc &dl, SelectionDAG &DAG,
5684 SmallVector<std::pair<unsigned, SDValue>, 8> &RegsToPass, SDValue Glue,
5685 SDValue Chain, SDValue CallSeqStart, SDValue &Callee, int SPDiff,
5686 unsigned NumBytes, const SmallVectorImpl<ISD::InputArg> &Ins,
5687 SmallVectorImpl<SDValue> &InVals, const CallBase *CB) const {
5688
5689 if ((Subtarget.is64BitELFABI() && !Subtarget.isUsingPCRelativeCalls()) ||
5690 Subtarget.isAIXABI())
5691 setUsesTOCBasePtr(DAG);
5692
5693 unsigned CallOpc =
5694 getCallOpcode(CFlags, Caller: DAG.getMachineFunction().getFunction(), Callee,
5695 Subtarget, TM: DAG.getTarget(), IsStrictFPCall: CB ? CB->isStrictFP() : false);
5696
5697 if (!CFlags.IsIndirect)
5698 Callee = transformCallee(Callee, DAG, dl, Subtarget);
5699 else if (Subtarget.usesFunctionDescriptors())
5700 prepareDescriptorIndirectCall(DAG, Callee, Glue, Chain, CallSeqStart, CB,
5701 dl, hasNest: CFlags.HasNest, Subtarget);
5702 else
5703 prepareIndirectCall(DAG, Callee, Glue, Chain, dl);
5704
5705 // Build the operand list for the call instruction.
5706 SmallVector<SDValue, 8> Ops;
5707 buildCallOperands(Ops, CFlags, dl, DAG, RegsToPass, Glue, Chain, Callee,
5708 SPDiff, Subtarget);
5709
5710 // Emit tail call.
5711 if (CFlags.IsTailCall) {
    // Indirect tail calls when using PC-Relative calls do not have the same
    // constraints.
5714 assert(((Callee.getOpcode() == ISD::Register &&
5715 cast<RegisterSDNode>(Callee)->getReg() == PPC::CTR) ||
5716 Callee.getOpcode() == ISD::TargetExternalSymbol ||
5717 Callee.getOpcode() == ISD::TargetGlobalAddress ||
5718 isa<ConstantSDNode>(Callee) ||
5719 (CFlags.IsIndirect && Subtarget.isUsingPCRelativeCalls())) &&
5720 "Expecting a global address, external symbol, absolute value, "
5721 "register or an indirect tail call when PC Relative calls are "
5722 "used.");
5723 // PC Relative calls also use TC_RETURN as the way to mark tail calls.
5724 assert(CallOpc == PPCISD::TC_RETURN &&
5725 "Unexpected call opcode for a tail call.");
5726 DAG.getMachineFunction().getFrameInfo().setHasTailCall();
5727 SDValue Ret = DAG.getNode(Opcode: CallOpc, DL: dl, VT: MVT::Other, Ops);
5728 DAG.addNoMergeSiteInfo(Node: Ret.getNode(), NoMerge: CFlags.NoMerge);
5729 return Ret;
5730 }
5731
  std::array<EVT, 2> ReturnTypes = {{MVT::Other, MVT::Glue}};
5733 Chain = DAG.getNode(Opcode: CallOpc, DL: dl, ResultTys: ReturnTypes, Ops);
5734 DAG.addNoMergeSiteInfo(Node: Chain.getNode(), NoMerge: CFlags.NoMerge);
5735 Glue = Chain.getValue(R: 1);
5736
5737 // When performing tail call optimization the callee pops its arguments off
5738 // the stack. Account for this here so these bytes can be pushed back on in
5739 // PPCFrameLowering::eliminateCallFramePseudoInstr.
5740 int BytesCalleePops = (CFlags.CallConv == CallingConv::Fast &&
5741 getTargetMachine().Options.GuaranteedTailCallOpt)
5742 ? NumBytes
5743 : 0;
5744
5745 Chain = DAG.getCALLSEQ_END(Chain, Size1: NumBytes, Size2: BytesCalleePops, Glue, DL: dl);
5746 Glue = Chain.getValue(R: 1);
5747
5748 return LowerCallResult(Chain, InGlue: Glue, CallConv: CFlags.CallConv, isVarArg: CFlags.IsVarArg, Ins, dl,
5749 DAG, InVals);
5750}
5751
5752bool PPCTargetLowering::supportsTailCallFor(const CallBase *CB) const {
5753 CallingConv::ID CalleeCC = CB->getCallingConv();
5754 const Function *CallerFunc = CB->getCaller();
5755 CallingConv::ID CallerCC = CallerFunc->getCallingConv();
5756 const Function *CalleeFunc = CB->getCalledFunction();
5757 if (!CalleeFunc)
5758 return false;
5759 const GlobalValue *CalleeGV = dyn_cast<GlobalValue>(Val: CalleeFunc);
5760
5761 SmallVector<ISD::OutputArg, 2> Outs;
5762 SmallVector<ISD::InputArg, 2> Ins;
5763
5764 GetReturnInfo(CC: CalleeCC, ReturnType: CalleeFunc->getReturnType(),
5765 attr: CalleeFunc->getAttributes(), Outs, TLI: *this,
5766 DL: CalleeFunc->getDataLayout());
5767
5768 return isEligibleForTCO(CalleeGV, CalleeCC, CallerCC, CB,
5769 isVarArg: CalleeFunc->isVarArg(), Outs, Ins, CallerFunc,
5770 isCalleeExternalSymbol: false /*isCalleeExternalSymbol*/);
5771}
5772
5773bool PPCTargetLowering::isEligibleForTCO(
5774 const GlobalValue *CalleeGV, CallingConv::ID CalleeCC,
5775 CallingConv::ID CallerCC, const CallBase *CB, bool isVarArg,
5776 const SmallVectorImpl<ISD::OutputArg> &Outs,
5777 const SmallVectorImpl<ISD::InputArg> &Ins, const Function *CallerFunc,
5778 bool isCalleeExternalSymbol) const {
5779 if (Subtarget.useLongCalls() && !(CB && CB->isMustTailCall()))
5780 return false;
5781
5782 if (Subtarget.isSVR4ABI() && Subtarget.isPPC64())
5783 return IsEligibleForTailCallOptimization_64SVR4(
5784 CalleeGV, CalleeCC, CallerCC, CB, isVarArg, Outs, Ins, CallerFunc,
5785 isCalleeExternalSymbol);
5786 else
5787 return IsEligibleForTailCallOptimization(CalleeGV, CalleeCC, CallerCC,
5788 isVarArg, Ins);
5789}
5790
5791SDValue
5792PPCTargetLowering::LowerCall(TargetLowering::CallLoweringInfo &CLI,
5793 SmallVectorImpl<SDValue> &InVals) const {
5794 SelectionDAG &DAG = CLI.DAG;
5795 SDLoc &dl = CLI.DL;
5796 SmallVectorImpl<ISD::OutputArg> &Outs = CLI.Outs;
5797 SmallVectorImpl<SDValue> &OutVals = CLI.OutVals;
5798 SmallVectorImpl<ISD::InputArg> &Ins = CLI.Ins;
5799 SDValue Chain = CLI.Chain;
5800 SDValue Callee = CLI.Callee;
5801 bool &isTailCall = CLI.IsTailCall;
5802 CallingConv::ID CallConv = CLI.CallConv;
5803 bool isVarArg = CLI.IsVarArg;
5804 bool isPatchPoint = CLI.IsPatchPoint;
5805 const CallBase *CB = CLI.CB;
5806
5807 if (isTailCall) {
5808 MachineFunction &MF = DAG.getMachineFunction();
5809 CallingConv::ID CallerCC = MF.getFunction().getCallingConv();
5810 auto *G = dyn_cast<GlobalAddressSDNode>(Val&: Callee);
5811 const GlobalValue *GV = G ? G->getGlobal() : nullptr;
5812 bool IsCalleeExternalSymbol = isa<ExternalSymbolSDNode>(Val: Callee);
5813
5814 isTailCall =
5815 isEligibleForTCO(CalleeGV: GV, CalleeCC: CallConv, CallerCC, CB, isVarArg, Outs, Ins,
5816 CallerFunc: &(MF.getFunction()), isCalleeExternalSymbol: IsCalleeExternalSymbol);
5817 if (isTailCall) {
5818 ++NumTailCalls;
5819 if (!getTargetMachine().Options.GuaranteedTailCallOpt)
5820 ++NumSiblingCalls;
5821
      // PC Relative calls no longer guarantee that the callee is a Global
      // Address Node. The call could be an indirect tail call, in which
      // case the SDValue for the callee could be a load (to load the address
      // of a function pointer) or it may be a register copy (to move the
      // address of the callee from a function parameter into a virtual
      // register). It may also be an ExternalSymbolSDNode (e.g. memcpy).
5828 assert((Subtarget.isUsingPCRelativeCalls() ||
5829 isa<GlobalAddressSDNode>(Callee)) &&
5830 "Callee should be an llvm::Function object.");
5831
5832 LLVM_DEBUG(dbgs() << "TCO caller: " << DAG.getMachineFunction().getName()
5833 << "\nTCO callee: ");
5834 LLVM_DEBUG(Callee.dump());
5835 }
5836 }
5837
5838 if (!isTailCall && CB && CB->isMustTailCall())
5839 report_fatal_error(reason: "failed to perform tail call elimination on a call "
5840 "site marked musttail");
5841
5842 // When long calls (i.e. indirect calls) are always used, calls are always
5843 // made via function pointer. If we have a function name, first translate it
5844 // into a pointer.
5845 if (Subtarget.useLongCalls() && isa<GlobalAddressSDNode>(Val: Callee) &&
5846 !isTailCall)
5847 Callee = LowerGlobalAddress(Op: Callee, DAG);
5848
5849 CallFlags CFlags(
5850 CallConv, isTailCall, isVarArg, isPatchPoint,
5851 isIndirectCall(Callee, DAG, Subtarget, isPatchPoint),
5852 // hasNest
5853 Subtarget.is64BitELFABI() &&
5854 any_of(Range&: Outs, P: [](ISD::OutputArg Arg) { return Arg.Flags.isNest(); }),
5855 CLI.NoMerge);
5856
5857 if (Subtarget.isAIXABI())
5858 return LowerCall_AIX(Chain, Callee, CFlags, Outs, OutVals, Ins, dl, DAG,
5859 InVals, CB);
5860
5861 assert(Subtarget.isSVR4ABI());
5862 if (Subtarget.isPPC64())
5863 return LowerCall_64SVR4(Chain, Callee, CFlags, Outs, OutVals, Ins, dl, DAG,
5864 InVals, CB);
5865 return LowerCall_32SVR4(Chain, Callee, CFlags, Outs, OutVals, Ins, dl, DAG,
5866 InVals, CB);
5867}
5868
5869SDValue PPCTargetLowering::LowerCall_32SVR4(
5870 SDValue Chain, SDValue Callee, CallFlags CFlags,
5871 const SmallVectorImpl<ISD::OutputArg> &Outs,
5872 const SmallVectorImpl<SDValue> &OutVals,
5873 const SmallVectorImpl<ISD::InputArg> &Ins, const SDLoc &dl,
5874 SelectionDAG &DAG, SmallVectorImpl<SDValue> &InVals,
5875 const CallBase *CB) const {
5876 // See PPCTargetLowering::LowerFormalArguments_32SVR4() for a description
5877 // of the 32-bit SVR4 ABI stack frame layout.
5878
5879 const CallingConv::ID CallConv = CFlags.CallConv;
5880 const bool IsVarArg = CFlags.IsVarArg;
5881 const bool IsTailCall = CFlags.IsTailCall;
5882
5883 assert((CallConv == CallingConv::C ||
5884 CallConv == CallingConv::Cold ||
5885 CallConv == CallingConv::Fast) && "Unknown calling convention!");
5886
5887 const Align PtrAlign(4);
5888
5889 MachineFunction &MF = DAG.getMachineFunction();
5890
  // Mark this function as potentially containing a function that contains a
  // tail call. As a consequence, the frame pointer will be used for dynamic
  // stack allocation and for restoring the caller's stack pointer in this
  // function's epilog. This is done because, by tail calling, the called
  // function might overwrite the value in this function's (MF) stack pointer
  // stack slot 0(SP).
5896 if (getTargetMachine().Options.GuaranteedTailCallOpt &&
5897 CallConv == CallingConv::Fast)
5898 MF.getInfo<PPCFunctionInfo>()->setHasFastCall();
5899
5900 // Count how many bytes are to be pushed on the stack, including the linkage
5901 // area, parameter list area and the part of the local variable space which
5902 // contains copies of aggregates which are passed by value.
5903
5904 // Assign locations to all of the outgoing arguments.
5905 SmallVector<CCValAssign, 16> ArgLocs;
5906 CCState CCInfo(CallConv, IsVarArg, MF, ArgLocs, *DAG.getContext());
5907
5908 // Reserve space for the linkage area on the stack.
5909 CCInfo.AllocateStack(Size: Subtarget.getFrameLowering()->getLinkageSize(),
5910 Alignment: PtrAlign);
5911
5912 if (IsVarArg) {
5913 // Handle fixed and variable vector arguments differently.
5914 // Fixed vector arguments go into registers as long as registers are
5915 // available. Variable vector arguments always go into memory.
5916 unsigned NumArgs = Outs.size();
5917
5918 for (unsigned i = 0; i != NumArgs; ++i) {
5919 MVT ArgVT = Outs[i].VT;
5920 ISD::ArgFlagsTy ArgFlags = Outs[i].Flags;
5921 bool Result;
5922
5923 if (!ArgFlags.isVarArg()) {
5924 Result = CC_PPC32_SVR4(ValNo: i, ValVT: ArgVT, LocVT: ArgVT, LocInfo: CCValAssign::Full, ArgFlags,
5925 OrigTy: Outs[i].OrigTy, State&: CCInfo);
5926 } else {
5927 Result = CC_PPC32_SVR4_VarArg(ValNo: i, ValVT: ArgVT, LocVT: ArgVT, LocInfo: CCValAssign::Full,
5928 ArgFlags, OrigTy: Outs[i].OrigTy, State&: CCInfo);
5929 }
5930
5931 if (Result) {
5932#ifndef NDEBUG
5933 errs() << "Call operand #" << i << " has unhandled type "
5934 << ArgVT << "\n";
5935#endif
5936 llvm_unreachable(nullptr);
5937 }
5938 }
5939 } else {
5940 // All arguments are treated the same.
5941 CCInfo.AnalyzeCallOperands(Outs, Fn: CC_PPC32_SVR4);
5942 }
5943
5944 // Assign locations to all of the outgoing aggregate by value arguments.
5945 SmallVector<CCValAssign, 16> ByValArgLocs;
5946 CCState CCByValInfo(CallConv, IsVarArg, MF, ByValArgLocs, *DAG.getContext());
5947
5948 // Reserve stack space for the allocations in CCInfo.
5949 CCByValInfo.AllocateStack(Size: CCInfo.getStackSize(), Alignment: PtrAlign);
5950
5951 CCByValInfo.AnalyzeCallOperands(Outs, Fn: CC_PPC32_SVR4_ByVal);
5952
  // Size of the linkage area, parameter list area and the part of the local
  // variable space where copies of aggregates which are passed by value are
  // stored.
5956 unsigned NumBytes = CCByValInfo.getStackSize();
5957
5958 // Calculate by how many bytes the stack has to be adjusted in case of tail
5959 // call optimization.
5960 int SPDiff = CalculateTailCallSPDiff(DAG, isTailCall: IsTailCall, ParamSize: NumBytes);
5961
5962 // Adjust the stack pointer for the new arguments...
5963 // These operations are automatically eliminated by the prolog/epilog pass
5964 Chain = DAG.getCALLSEQ_START(Chain, InSize: NumBytes, OutSize: 0, DL: dl);
5965 SDValue CallSeqStart = Chain;
5966
  // Load the return address and frame pointer so they can be moved somewhere
  // else later.
5969 SDValue LROp, FPOp;
5970 Chain = EmitTailCallLoadFPAndRetAddr(DAG, SPDiff, Chain, LROpOut&: LROp, FPOpOut&: FPOp, dl);
5971
5972 // Set up a copy of the stack pointer for use loading and storing any
5973 // arguments that may not fit in the registers available for argument
5974 // passing.
5975 SDValue StackPtr = DAG.getRegister(Reg: PPC::R1, VT: MVT::i32);
5976
5977 SmallVector<std::pair<unsigned, SDValue>, 8> RegsToPass;
5978 SmallVector<TailCallArgumentInfo, 8> TailCallArguments;
5979 SmallVector<SDValue, 8> MemOpChains;
5980
5981 bool seenFloatArg = false;
5982 // Walk the register/memloc assignments, inserting copies/loads.
5983 // i - Tracks the index into the list of registers allocated for the call
5984 // RealArgIdx - Tracks the index into the list of actual function arguments
5985 // j - Tracks the index into the list of byval arguments
5986 for (unsigned i = 0, RealArgIdx = 0, j = 0, e = ArgLocs.size();
5987 i != e;
5988 ++i, ++RealArgIdx) {
5989 CCValAssign &VA = ArgLocs[i];
5990 SDValue Arg = OutVals[RealArgIdx];
5991 ISD::ArgFlagsTy Flags = Outs[RealArgIdx].Flags;
5992
5993 if (Flags.isByVal()) {
5994 // Argument is an aggregate which is passed by value, thus we need to
5995 // create a copy of it in the local variable space of the current stack
5996 // frame (which is the stack frame of the caller) and pass the address of
5997 // this copy to the callee.
5998 assert((j < ByValArgLocs.size()) && "Index out of bounds!");
5999 CCValAssign &ByValVA = ByValArgLocs[j++];
6000 assert((VA.getValNo() == ByValVA.getValNo()) && "ValNo mismatch!");
6001
      // Memory reserved in the local variable space of the caller's stack
      // frame.
6003 unsigned LocMemOffset = ByValVA.getLocMemOffset();
6004
6005 SDValue PtrOff = DAG.getIntPtrConstant(Val: LocMemOffset, DL: dl);
6006 PtrOff = DAG.getNode(Opcode: ISD::ADD, DL: dl, VT: getPointerTy(DL: MF.getDataLayout()),
6007 N1: StackPtr, N2: PtrOff);
6008
6009 // Create a copy of the argument in the local area of the current
6010 // stack frame.
6011 SDValue MemcpyCall =
6012 CreateCopyOfByValArgument(Src: Arg, Dst: PtrOff,
6013 Chain: CallSeqStart.getNode()->getOperand(Num: 0),
6014 Flags, DAG, dl);
6015
6016 // This must go outside the CALLSEQ_START..END.
6017 SDValue NewCallSeqStart = DAG.getCALLSEQ_START(Chain: MemcpyCall, InSize: NumBytes, OutSize: 0,
6018 DL: SDLoc(MemcpyCall));
6019 DAG.ReplaceAllUsesWith(From: CallSeqStart.getNode(),
6020 To: NewCallSeqStart.getNode());
6021 Chain = CallSeqStart = NewCallSeqStart;
6022
6023 // Pass the address of the aggregate copy on the stack either in a
6024 // physical register or in the parameter list area of the current stack
6025 // frame to the callee.
6026 Arg = PtrOff;
6027 }
6028
    // When useCRBits() is true, there can be i1 arguments.
    // This is because getRegisterType(MVT::i1) => MVT::i1, while for other
    // integer types getRegisterType() => MVT::i32.
    // Extend i1 values and ensure the callee will get i32.
6033 if (Arg.getValueType() == MVT::i1)
6034 Arg = DAG.getNode(Opcode: Flags.isSExt() ? ISD::SIGN_EXTEND : ISD::ZERO_EXTEND,
6035 DL: dl, VT: MVT::i32, Operand: Arg);
6036
6037 if (VA.isRegLoc()) {
6038 seenFloatArg |= VA.getLocVT().isFloatingPoint();
6039 // Put argument in a physical register.
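      // SPE passes f64 in a pair of i32 GPRs: extract both halves and place
      // them in the two registers assigned to this argument.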
6040 if (Subtarget.hasSPE() && Arg.getValueType() == MVT::f64) {
6041 bool IsLE = Subtarget.isLittleEndian();
6042 SDValue SVal = DAG.getNode(Opcode: PPCISD::EXTRACT_SPE, DL: dl, VT: MVT::i32, N1: Arg,
6043 N2: DAG.getIntPtrConstant(Val: IsLE ? 0 : 1, DL: dl));
6044 RegsToPass.push_back(Elt: std::make_pair(x: VA.getLocReg(), y: SVal.getValue(R: 0)));
6045 SVal = DAG.getNode(Opcode: PPCISD::EXTRACT_SPE, DL: dl, VT: MVT::i32, N1: Arg,
6046 N2: DAG.getIntPtrConstant(Val: IsLE ? 1 : 0, DL: dl));
6047 RegsToPass.push_back(Elt: std::make_pair(x: ArgLocs[++i].getLocReg(),
6048 y: SVal.getValue(R: 0)));
6049 } else
6050 RegsToPass.push_back(Elt: std::make_pair(x: VA.getLocReg(), y&: Arg));
6051 } else {
6052 // Put argument in the parameter list area of the current stack frame.
6053 assert(VA.isMemLoc());
6054 unsigned LocMemOffset = VA.getLocMemOffset();
6055
6056 if (!IsTailCall) {
6057 SDValue PtrOff = DAG.getIntPtrConstant(Val: LocMemOffset, DL: dl);
6058 PtrOff = DAG.getNode(Opcode: ISD::ADD, DL: dl, VT: getPointerTy(DL: MF.getDataLayout()),
6059 N1: StackPtr, N2: PtrOff);
6060
6061 MemOpChains.push_back(
6062 Elt: DAG.getStore(Chain, dl, Val: Arg, Ptr: PtrOff, PtrInfo: MachinePointerInfo()));
6063 } else {
6064 // Calculate and remember argument location.
6065 CalculateTailCallArgDest(DAG, MF, IsPPC64: false, Arg, SPDiff, ArgOffset: LocMemOffset,
6066 TailCallArguments);
6067 }
6068 }
6069 }
6070
6071 if (!MemOpChains.empty())
6072 Chain = DAG.getNode(Opcode: ISD::TokenFactor, DL: dl, VT: MVT::Other, Ops: MemOpChains);
6073
6074 // Build a sequence of copy-to-reg nodes chained together with token chain
6075 // and flag operands which copy the outgoing args into the appropriate regs.
6076 SDValue InGlue;
6077 for (const auto &[Reg, N] : RegsToPass) {
6078 Chain = DAG.getCopyToReg(Chain, dl, Reg, N, Glue: InGlue);
6079 InGlue = Chain.getValue(R: 1);
6080 }
6081
6082 // Set CR bit 6 to true if this is a vararg call with floating args passed in
6083 // registers.
6084 if (IsVarArg) {
6085 SDVTList VTs = DAG.getVTList(VT1: MVT::Other, VT2: MVT::Glue);
6086 SDValue Ops[] = { Chain, InGlue };
6087
6088 Chain = DAG.getNode(Opcode: seenFloatArg ? PPCISD::CR6SET : PPCISD::CR6UNSET, DL: dl,
6089 VTList: VTs, Ops: ArrayRef(Ops, InGlue.getNode() ? 2 : 1));
6090
6091 InGlue = Chain.getValue(R: 1);
6092 }
6093
6094 if (IsTailCall)
6095 PrepareTailCall(DAG, InGlue, Chain, dl, SPDiff, NumBytes, LROp, FPOp,
6096 TailCallArguments);
6097
6098 return FinishCall(CFlags, dl, DAG, RegsToPass, Glue: InGlue, Chain, CallSeqStart,
6099 Callee, SPDiff, NumBytes, Ins, InVals, CB);
6100}
6101
6102// Copy an argument into memory, being careful to do this outside the
6103// call sequence for the call to which the argument belongs.
6104SDValue PPCTargetLowering::createMemcpyOutsideCallSeq(
6105 SDValue Arg, SDValue PtrOff, SDValue CallSeqStart, ISD::ArgFlagsTy Flags,
6106 SelectionDAG &DAG, const SDLoc &dl) const {
6107 SDValue MemcpyCall = CreateCopyOfByValArgument(Src: Arg, Dst: PtrOff,
6108 Chain: CallSeqStart.getNode()->getOperand(Num: 0),
6109 Flags, DAG, dl);
6110 // The MEMCPY must go outside the CALLSEQ_START..END.
6111 int64_t FrameSize = CallSeqStart.getConstantOperandVal(i: 1);
6112 SDValue NewCallSeqStart = DAG.getCALLSEQ_START(Chain: MemcpyCall, InSize: FrameSize, OutSize: 0,
6113 DL: SDLoc(MemcpyCall));
6114 DAG.ReplaceAllUsesWith(From: CallSeqStart.getNode(),
6115 To: NewCallSeqStart.getNode());
6116 return NewCallSeqStart;
6117}
6118
6119SDValue PPCTargetLowering::LowerCall_64SVR4(
6120 SDValue Chain, SDValue Callee, CallFlags CFlags,
6121 const SmallVectorImpl<ISD::OutputArg> &Outs,
6122 const SmallVectorImpl<SDValue> &OutVals,
6123 const SmallVectorImpl<ISD::InputArg> &Ins, const SDLoc &dl,
6124 SelectionDAG &DAG, SmallVectorImpl<SDValue> &InVals,
6125 const CallBase *CB) const {
6126 bool isELFv2ABI = Subtarget.isELFv2ABI();
6127 bool isLittleEndian = Subtarget.isLittleEndian();
6128 unsigned NumOps = Outs.size();
6129 bool IsSibCall = false;
6130 bool IsFastCall = CFlags.CallConv == CallingConv::Fast;
6131
6132 EVT PtrVT = getPointerTy(DL: DAG.getDataLayout());
6133 unsigned PtrByteSize = 8;
6134
6135 MachineFunction &MF = DAG.getMachineFunction();
6136
6137 if (CFlags.IsTailCall && !getTargetMachine().Options.GuaranteedTailCallOpt)
6138 IsSibCall = true;
6139
  // Mark this function as potentially containing a function that contains a
  // tail call. As a consequence, the frame pointer will be used for dynamic
  // stack allocation and for restoring the caller's stack pointer in this
  // function's epilog. This is done because, by tail calling, the called
  // function might overwrite the value in this function's (MF) stack pointer
  // stack slot 0(SP).
6145 if (getTargetMachine().Options.GuaranteedTailCallOpt && IsFastCall)
6146 MF.getInfo<PPCFunctionInfo>()->setHasFastCall();
6147
6148 assert(!(IsFastCall && CFlags.IsVarArg) &&
6149 "fastcc not supported on varargs functions");
6150
6151 // Count how many bytes are to be pushed on the stack, including the linkage
6152 // area, and parameter passing area. On ELFv1, the linkage area is 48 bytes
6153 // reserved space for [SP][CR][LR][2 x unused][TOC]; on ELFv2, the linkage
6154 // area is 32 bytes reserved space for [SP][CR][LR][TOC].
6155 unsigned LinkageSize = Subtarget.getFrameLowering()->getLinkageSize();
6156 unsigned NumBytes = LinkageSize;
6157 unsigned GPR_idx = 0, FPR_idx = 0, VR_idx = 0;
6158
6159 static const MCPhysReg GPR[] = {
6160 PPC::X3, PPC::X4, PPC::X5, PPC::X6,
6161 PPC::X7, PPC::X8, PPC::X9, PPC::X10,
6162 };
6163 static const MCPhysReg VR[] = {
6164 PPC::V2, PPC::V3, PPC::V4, PPC::V5, PPC::V6, PPC::V7, PPC::V8,
6165 PPC::V9, PPC::V10, PPC::V11, PPC::V12, PPC::V13
6166 };
6167
6168 const unsigned NumGPRs = std::size(GPR);
6169 const unsigned NumFPRs = useSoftFloat() ? 0 : 13;
6170 const unsigned NumVRs = std::size(VR);
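  // The 64-bit SVR4 ABI passes the first eight integer arguments in X3-X10,
  // up to thirteen floating-point arguments in FPRs, and up to twelve vector
  // arguments in V2-V13.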
6171
6172 // On ELFv2, we can avoid allocating the parameter area if all the arguments
6173 // can be passed to the callee in registers.
6174 // For the fast calling convention, there is another check below.
6175 // Note: We should keep consistent with LowerFormalArguments_64SVR4()
6176 bool HasParameterArea = !isELFv2ABI || CFlags.IsVarArg || IsFastCall;
6177 if (!HasParameterArea) {
6178 unsigned ParamAreaSize = NumGPRs * PtrByteSize;
6179 unsigned AvailableFPRs = NumFPRs;
6180 unsigned AvailableVRs = NumVRs;
6181 unsigned NumBytesTmp = NumBytes;
6182 for (unsigned i = 0; i != NumOps; ++i) {
6183 if (Outs[i].Flags.isNest()) continue;
6184 if (CalculateStackSlotUsed(ArgVT: Outs[i].VT, OrigVT: Outs[i].ArgVT, Flags: Outs[i].Flags,
6185 PtrByteSize, LinkageSize, ParamAreaSize,
6186 ArgOffset&: NumBytesTmp, AvailableFPRs, AvailableVRs))
6187 HasParameterArea = true;
6188 }
6189 }
6190
6191 // When using the fast calling convention, we don't provide backing for
6192 // arguments that will be in registers.
6193 unsigned NumGPRsUsed = 0, NumFPRsUsed = 0, NumVRsUsed = 0;
6194
6195 // Avoid allocating parameter area for fastcc functions if all the arguments
6196 // can be passed in the registers.
6197 if (IsFastCall)
6198 HasParameterArea = false;
6199
6200 // Add up all the space actually used.
6201 for (unsigned i = 0; i != NumOps; ++i) {
6202 ISD::ArgFlagsTy Flags = Outs[i].Flags;
6203 EVT ArgVT = Outs[i].VT;
6204 EVT OrigVT = Outs[i].ArgVT;
6205
6206 if (Flags.isNest())
6207 continue;
6208
6209 if (IsFastCall) {
6210 if (Flags.isByVal()) {
6211 NumGPRsUsed += (Flags.getByValSize()+7)/8;
6212 if (NumGPRsUsed > NumGPRs)
6213 HasParameterArea = true;
6214 } else {
6215 switch (ArgVT.getSimpleVT().SimpleTy) {
6216 default: llvm_unreachable("Unexpected ValueType for argument!");
6217 case MVT::i1:
6218 case MVT::i32:
6219 case MVT::i64:
6220 if (++NumGPRsUsed <= NumGPRs)
6221 continue;
6222 break;
6223 case MVT::v4i32:
6224 case MVT::v8i16:
6225 case MVT::v16i8:
6226 case MVT::v2f64:
6227 case MVT::v2i64:
6228 case MVT::v1i128:
6229 case MVT::f128:
6230 if (++NumVRsUsed <= NumVRs)
6231 continue;
6232 break;
6233 case MVT::v4f32:
6234 if (++NumVRsUsed <= NumVRs)
6235 continue;
6236 break;
6237 case MVT::f32:
6238 case MVT::f64:
6239 if (++NumFPRsUsed <= NumFPRs)
6240 continue;
6241 break;
6242 }
6243 HasParameterArea = true;
6244 }
6245 }
6246
    /* Respect alignment of argument on the stack. */
    auto Alignment =
        CalculateStackSlotAlignment(ArgVT, OrigVT, Flags, PtrByteSize);
    NumBytes = alignTo(NumBytes, Alignment);
6251
6252 NumBytes += CalculateStackSlotSize(ArgVT, Flags, PtrByteSize);
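    // After the last piece of an argument passed in consecutive registers,
    // round the running byte count up to a multiple of the pointer size.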
6253 if (Flags.isInConsecutiveRegsLast())
6254 NumBytes = ((NumBytes + PtrByteSize - 1)/PtrByteSize) * PtrByteSize;
6255 }
6256
6257 unsigned NumBytesActuallyUsed = NumBytes;
6258
  // In the old ELFv1 ABI, the prolog code of the callee may store up to 8 GPR
  // argument registers to the stack, allowing va_start to index over them in
  // memory if the callee is varargs. Because we cannot tell if this is needed
  // on the caller side, we have to conservatively assume that it is needed.
  // As such, make sure we have at least enough stack space for the caller to
  // store the 8 GPRs.
  // In the ELFv2 ABI, we allocate the parameter area iff a callee really
  // requires memory operands, e.g. a vararg function.
6267 if (HasParameterArea)
6268 NumBytes = std::max(a: NumBytes, b: LinkageSize + 8 * PtrByteSize);
6269 else
6270 NumBytes = LinkageSize;
6271
6272 // Tail call needs the stack to be aligned.
6273 if (getTargetMachine().Options.GuaranteedTailCallOpt && IsFastCall)
6274 NumBytes = EnsureStackAlignment(Lowering: Subtarget.getFrameLowering(), NumBytes);
6275
6276 int SPDiff = 0;
6277
6278 // Calculate by how many bytes the stack has to be adjusted in case of tail
6279 // call optimization.
6280 if (!IsSibCall)
6281 SPDiff = CalculateTailCallSPDiff(DAG, isTailCall: CFlags.IsTailCall, ParamSize: NumBytes);
6282
6283 // To protect arguments on the stack from being clobbered in a tail call,
6284 // force all the loads to happen before doing any other lowering.
6285 if (CFlags.IsTailCall)
6286 Chain = DAG.getStackArgumentTokenFactor(Chain);
6287
6288 // Adjust the stack pointer for the new arguments...
6289 // These operations are automatically eliminated by the prolog/epilog pass
6290 if (!IsSibCall)
6291 Chain = DAG.getCALLSEQ_START(Chain, InSize: NumBytes, OutSize: 0, DL: dl);
6292 SDValue CallSeqStart = Chain;
6293
  // Load the return address and frame pointer so they can be moved somewhere
  // else later.
6296 SDValue LROp, FPOp;
6297 Chain = EmitTailCallLoadFPAndRetAddr(DAG, SPDiff, Chain, LROpOut&: LROp, FPOpOut&: FPOp, dl);
6298
  // Set up a copy of the stack pointer for use in loading and storing any
6300 // arguments that may not fit in the registers available for argument
6301 // passing.
6302 SDValue StackPtr = DAG.getRegister(Reg: PPC::X1, VT: MVT::i64);
6303
6304 // Figure out which arguments are going to go in registers, and which in
6305 // memory. Also, if this is a vararg function, floating point operations
6306 // must be stored to our stack, and loaded into integer regs as well, if
6307 // any integer regs are available for argument passing.
6308 unsigned ArgOffset = LinkageSize;
6309
6310 SmallVector<std::pair<unsigned, SDValue>, 8> RegsToPass;
6311 SmallVector<TailCallArgumentInfo, 8> TailCallArguments;
6312
6313 SmallVector<SDValue, 8> MemOpChains;
6314 for (unsigned i = 0; i != NumOps; ++i) {
6315 SDValue Arg = OutVals[i];
6316 ISD::ArgFlagsTy Flags = Outs[i].Flags;
6317 EVT ArgVT = Outs[i].VT;
6318 EVT OrigVT = Outs[i].ArgVT;
6319
6320 // PtrOff will be used to store the current argument to the stack if a
6321 // register cannot be found for it.
6322 SDValue PtrOff;
6323
    // We re-align the argument offset for each argument, except when using
    // the fast calling convention, where we need to make sure we do that only
    // when we'll actually use a stack slot.
6327 auto ComputePtrOff = [&]() {
6328 /* Respect alignment of argument on the stack. */
6329 auto Alignment =
6330 CalculateStackSlotAlignment(ArgVT, OrigVT, Flags, PtrByteSize);
6331 ArgOffset = alignTo(Size: ArgOffset, A: Alignment);
6332
6333 PtrOff = DAG.getConstant(Val: ArgOffset, DL: dl, VT: StackPtr.getValueType());
6334
6335 PtrOff = DAG.getNode(Opcode: ISD::ADD, DL: dl, VT: PtrVT, N1: StackPtr, N2: PtrOff);
6336 };
6337
6338 if (!IsFastCall) {
6339 ComputePtrOff();
6340
6341 /* Compute GPR index associated with argument offset. */
6342 GPR_idx = (ArgOffset - LinkageSize) / PtrByteSize;
6343 GPR_idx = std::min(a: GPR_idx, b: NumGPRs);
6344 }
6345
6346 // Promote integers to 64-bit values.
6347 if (Arg.getValueType() == MVT::i32 || Arg.getValueType() == MVT::i1) {
6348 // FIXME: Should this use ANY_EXTEND if neither sext nor zext?
6349 unsigned ExtOp = Flags.isSExt() ? ISD::SIGN_EXTEND : ISD::ZERO_EXTEND;
6350 Arg = DAG.getNode(Opcode: ExtOp, DL: dl, VT: MVT::i64, Operand: Arg);
6351 }
6352
6353 // FIXME memcpy is used way more than necessary. Correctness first.
6354 // Note: "by value" is code for passing a structure by value, not
6355 // basic types.
6356 if (Flags.isByVal()) {
6357 // Note: Size includes alignment padding, so
6358 // struct x { short a; char b; }
6359 // will have Size = 4. With #pragma pack(1), it will have Size = 3.
6360 // These are the proper values we need for right-justifying the
6361 // aggregate in a parameter register.
6362 unsigned Size = Flags.getByValSize();
6363
6364 // An empty aggregate parameter takes up no storage and no
6365 // registers.
6366 if (Size == 0)
6367 continue;
6368
6369 if (IsFastCall)
6370 ComputePtrOff();
6371
6372 // All aggregates smaller than 8 bytes must be passed right-justified.
6373 if (Size==1 || Size==2 || Size==4) {
6374 EVT VT = (Size==1) ? MVT::i8 : ((Size==2) ? MVT::i16 : MVT::i32);
6375 if (GPR_idx != NumGPRs) {
6376 SDValue Load = DAG.getExtLoad(ExtType: ISD::EXTLOAD, dl, VT: PtrVT, Chain, Ptr: Arg,
6377 PtrInfo: MachinePointerInfo(), MemVT: VT);
6378 MemOpChains.push_back(Elt: Load.getValue(R: 1));
6379 RegsToPass.push_back(Elt: std::make_pair(x: GPR[GPR_idx++], y&: Load));
6380
6381 ArgOffset += PtrByteSize;
6382 continue;
6383 }
6384 }
6385
6386 if (GPR_idx == NumGPRs && Size < 8) {
6387 SDValue AddPtr = PtrOff;
6388 if (!isLittleEndian) {
6389 SDValue Const = DAG.getConstant(Val: PtrByteSize - Size, DL: dl,
6390 VT: PtrOff.getValueType());
6391 AddPtr = DAG.getNode(Opcode: ISD::ADD, DL: dl, VT: PtrVT, N1: PtrOff, N2: Const);
6392 }
6393 Chain = CallSeqStart = createMemcpyOutsideCallSeq(Arg, PtrOff: AddPtr,
6394 CallSeqStart,
6395 Flags, DAG, dl);
6396 ArgOffset += PtrByteSize;
6397 continue;
6398 }
      // Copy the object to the parameter save area if it cannot be entirely
      // passed in registers.
6401 // FIXME: we only need to copy the parts which need to be passed in
6402 // parameter save area. For the parts passed by registers, we don't need
6403 // to copy them to the stack although we need to allocate space for them
6404 // in parameter save area.
6405 if ((NumGPRs - GPR_idx) * PtrByteSize < Size)
6406 Chain = CallSeqStart = createMemcpyOutsideCallSeq(Arg, PtrOff,
6407 CallSeqStart,
6408 Flags, DAG, dl);
6409
6410 // When a register is available, pass a small aggregate right-justified.
6411 if (Size < 8 && GPR_idx != NumGPRs) {
6412 // The easiest way to get this right-justified in a register
6413 // is to copy the structure into the rightmost portion of a
6414 // local variable slot, then load the whole slot into the
6415 // register.
6416 // FIXME: The memcpy seems to produce pretty awful code for
6417 // small aggregates, particularly for packed ones.
6418 // FIXME: It would be preferable to use the slot in the
6419 // parameter save area instead of a new local variable.
6420 SDValue AddPtr = PtrOff;
6421 if (!isLittleEndian) {
6422 SDValue Const = DAG.getConstant(Val: 8 - Size, DL: dl, VT: PtrOff.getValueType());
6423 AddPtr = DAG.getNode(Opcode: ISD::ADD, DL: dl, VT: PtrVT, N1: PtrOff, N2: Const);
6424 }
6425 Chain = CallSeqStart = createMemcpyOutsideCallSeq(Arg, PtrOff: AddPtr,
6426 CallSeqStart,
6427 Flags, DAG, dl);
6428
6429 // Load the slot into the register.
6430 SDValue Load =
6431 DAG.getLoad(VT: PtrVT, dl, Chain, Ptr: PtrOff, PtrInfo: MachinePointerInfo());
6432 MemOpChains.push_back(Elt: Load.getValue(R: 1));
6433 RegsToPass.push_back(Elt: std::make_pair(x: GPR[GPR_idx++], y&: Load));
6434
6435 // Done with this argument.
6436 ArgOffset += PtrByteSize;
6437 continue;
6438 }
6439
6440 // For aggregates larger than PtrByteSize, copy the pieces of the
6441 // object that fit into registers from the parameter save area.
6442 for (unsigned j=0; j<Size; j+=PtrByteSize) {
6443 SDValue Const = DAG.getConstant(Val: j, DL: dl, VT: PtrOff.getValueType());
6444 SDValue AddArg = DAG.getNode(Opcode: ISD::ADD, DL: dl, VT: PtrVT, N1: Arg, N2: Const);
6445 if (GPR_idx != NumGPRs) {
6446 unsigned LoadSizeInBits = std::min(a: PtrByteSize, b: (Size - j)) * 8;
6447 EVT ObjType = EVT::getIntegerVT(Context&: *DAG.getContext(), BitWidth: LoadSizeInBits);
6448 SDValue Load = DAG.getExtLoad(ExtType: ISD::EXTLOAD, dl, VT: PtrVT, Chain, Ptr: AddArg,
6449 PtrInfo: MachinePointerInfo(), MemVT: ObjType);
6450
6451 MemOpChains.push_back(Elt: Load.getValue(R: 1));
6452 RegsToPass.push_back(Elt: std::make_pair(x: GPR[GPR_idx++], y&: Load));
6453 ArgOffset += PtrByteSize;
6454 } else {
6455 ArgOffset += ((Size - j + PtrByteSize-1)/PtrByteSize)*PtrByteSize;
6456 break;
6457 }
6458 }
6459 continue;
6460 }
6461
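    // Handle all remaining (non-byval) arguments according to their value
    // type.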
6462 switch (Arg.getSimpleValueType().SimpleTy) {
6463 default: llvm_unreachable("Unexpected ValueType for argument!");
6464 case MVT::i1:
6465 case MVT::i32:
6466 case MVT::i64:
6467 if (Flags.isNest()) {
6468 // The 'nest' parameter, if any, is passed in R11.
6469 RegsToPass.push_back(Elt: std::make_pair(x: PPC::X11, y&: Arg));
6470 break;
6471 }
6472
6473 // These can be scalar arguments or elements of an integer array type
6474 // passed directly. Clang may use those instead of "byval" aggregate
6475 // types to avoid forcing arguments to memory unnecessarily.
6476 if (GPR_idx != NumGPRs) {
6477 RegsToPass.push_back(Elt: std::make_pair(x: GPR[GPR_idx++], y&: Arg));
6478 } else {
6479 if (IsFastCall)
6480 ComputePtrOff();
6481
6482 assert(HasParameterArea &&
6483 "Parameter area must exist to pass an argument in memory.");
6484 LowerMemOpCallTo(DAG, MF, Chain, Arg, PtrOff, SPDiff, ArgOffset,
6485 isPPC64: true, isTailCall: CFlags.IsTailCall, isVector: false, MemOpChains,
6486 TailCallArguments, dl);
6487 if (IsFastCall)
6488 ArgOffset += PtrByteSize;
6489 }
6490 if (!IsFastCall)
6491 ArgOffset += PtrByteSize;
6492 break;
6493 case MVT::f32:
6494 case MVT::f64: {
6495 // These can be scalar arguments or elements of a float array type
      // passed directly. The latter are used to implement ELFv2 homogeneous
6497 // float aggregates.
6498
6499 // Named arguments go into FPRs first, and once they overflow, the
6500 // remaining arguments go into GPRs and then the parameter save area.
6501 // Unnamed arguments for vararg functions always go to GPRs and
6502 // then the parameter save area. For now, put all arguments to vararg
6503 // routines always in both locations (FPR *and* GPR or stack slot).
6504 bool NeedGPROrStack = CFlags.IsVarArg || FPR_idx == NumFPRs;
6505 bool NeededLoad = false;
6506
6507 // First load the argument into the next available FPR.
6508 if (FPR_idx != NumFPRs)
6509 RegsToPass.push_back(Elt: std::make_pair(x: FPR[FPR_idx++], y&: Arg));
6510
6511 // Next, load the argument into GPR or stack slot if needed.
6512 if (!NeedGPROrStack)
6513 ;
6514 else if (GPR_idx != NumGPRs && !IsFastCall) {
6515 // FIXME: We may want to re-enable this for CallingConv::Fast on the P8
6516 // once we support fp <-> gpr moves.
6517
6518 // In the non-vararg case, this can only ever happen in the
6519 // presence of f32 array types, since otherwise we never run
6520 // out of FPRs before running out of GPRs.
6521 SDValue ArgVal;
6522
6523 // Double values are always passed in a single GPR.
6524 if (Arg.getValueType() != MVT::f32) {
6525 ArgVal = DAG.getNode(Opcode: ISD::BITCAST, DL: dl, VT: MVT::i64, Operand: Arg);
6526
6527 // Non-array float values are extended and passed in a GPR.
6528 } else if (!Flags.isInConsecutiveRegs()) {
6529 ArgVal = DAG.getNode(Opcode: ISD::BITCAST, DL: dl, VT: MVT::i32, Operand: Arg);
6530 ArgVal = DAG.getNode(Opcode: ISD::ANY_EXTEND, DL: dl, VT: MVT::i64, Operand: ArgVal);
6531
6532 // If we have an array of floats, we collect every odd element
6533 // together with its predecessor into one GPR.
6534 } else if (ArgOffset % PtrByteSize != 0) {
6535 SDValue Lo, Hi;
6536 Lo = DAG.getNode(Opcode: ISD::BITCAST, DL: dl, VT: MVT::i32, Operand: OutVals[i - 1]);
6537 Hi = DAG.getNode(Opcode: ISD::BITCAST, DL: dl, VT: MVT::i32, Operand: Arg);
6538 if (!isLittleEndian)
6539 std::swap(a&: Lo, b&: Hi);
6540 ArgVal = DAG.getNode(Opcode: ISD::BUILD_PAIR, DL: dl, VT: MVT::i64, N1: Lo, N2: Hi);
6541
6542 // The final element, if even, goes into the first half of a GPR.
6543 } else if (Flags.isInConsecutiveRegsLast()) {
6544 ArgVal = DAG.getNode(Opcode: ISD::BITCAST, DL: dl, VT: MVT::i32, Operand: Arg);
6545 ArgVal = DAG.getNode(Opcode: ISD::ANY_EXTEND, DL: dl, VT: MVT::i64, Operand: ArgVal);
6546 if (!isLittleEndian)
6547 ArgVal = DAG.getNode(Opcode: ISD::SHL, DL: dl, VT: MVT::i64, N1: ArgVal,
6548 N2: DAG.getConstant(Val: 32, DL: dl, VT: MVT::i32));
6549
6550 // Non-final even elements are skipped; they will be handled
        // together with the subsequent argument on the next go-around.
6552 } else
6553 ArgVal = SDValue();
6554
6555 if (ArgVal.getNode())
6556 RegsToPass.push_back(Elt: std::make_pair(x: GPR[GPR_idx++], y&: ArgVal));
6557 } else {
6558 if (IsFastCall)
6559 ComputePtrOff();
6560
6561 // Single-precision floating-point values are mapped to the
6562 // second (rightmost) word of the stack doubleword.
6563 if (Arg.getValueType() == MVT::f32 &&
6564 !isLittleEndian && !Flags.isInConsecutiveRegs()) {
6565 SDValue ConstFour = DAG.getConstant(Val: 4, DL: dl, VT: PtrOff.getValueType());
6566 PtrOff = DAG.getNode(Opcode: ISD::ADD, DL: dl, VT: PtrVT, N1: PtrOff, N2: ConstFour);
6567 }
6568
6569 assert(HasParameterArea &&
6570 "Parameter area must exist to pass an argument in memory.");
6571 LowerMemOpCallTo(DAG, MF, Chain, Arg, PtrOff, SPDiff, ArgOffset,
6572 isPPC64: true, isTailCall: CFlags.IsTailCall, isVector: false, MemOpChains,
6573 TailCallArguments, dl);
6574
6575 NeededLoad = true;
6576 }
6577 // When passing an array of floats, the array occupies consecutive
6578 // space in the argument area; only round up to the next doubleword
6579 // at the end of the array. Otherwise, each float takes 8 bytes.
6580 if (!IsFastCall || NeededLoad) {
6581 ArgOffset += (Arg.getValueType() == MVT::f32 &&
6582 Flags.isInConsecutiveRegs()) ? 4 : 8;
6583 if (Flags.isInConsecutiveRegsLast())
6584 ArgOffset = ((ArgOffset + PtrByteSize - 1)/PtrByteSize) * PtrByteSize;
6585 }
6586 break;
6587 }
6588 case MVT::v4f32:
6589 case MVT::v4i32:
6590 case MVT::v8i16:
6591 case MVT::v16i8:
6592 case MVT::v2f64:
6593 case MVT::v2i64:
6594 case MVT::v1i128:
6595 case MVT::f128:
6596 // These can be scalar arguments or elements of a vector array type
    // passed directly. The latter are used to implement ELFv2 homogeneous
6598 // vector aggregates.
6599
6600 // For a varargs call, named arguments go into VRs or on the stack as
6601 // usual; unnamed arguments always go to the stack or the corresponding
6602 // GPRs when within range. For now, we always put the value in both
6603 // locations (or even all three).
6604 if (CFlags.IsVarArg) {
6605 assert(HasParameterArea &&
6606 "Parameter area must exist if we have a varargs call.");
6607 // We could elide this store in the case where the object fits
6608 // entirely in R registers. Maybe later.
6609 SDValue Store =
6610 DAG.getStore(Chain, dl, Val: Arg, Ptr: PtrOff, PtrInfo: MachinePointerInfo());
6611 MemOpChains.push_back(Elt: Store);
6612 if (VR_idx != NumVRs) {
6613 SDValue Load =
6614 DAG.getLoad(VT: MVT::v4f32, dl, Chain: Store, Ptr: PtrOff, PtrInfo: MachinePointerInfo());
6615 MemOpChains.push_back(Elt: Load.getValue(R: 1));
6616 RegsToPass.push_back(Elt: std::make_pair(x: VR[VR_idx++], y&: Load));
6617 }
6618 ArgOffset += 16;
6619 for (unsigned i=0; i<16; i+=PtrByteSize) {
6620 if (GPR_idx == NumGPRs)
6621 break;
6622 SDValue Ix = DAG.getNode(Opcode: ISD::ADD, DL: dl, VT: PtrVT, N1: PtrOff,
6623 N2: DAG.getConstant(Val: i, DL: dl, VT: PtrVT));
6624 SDValue Load =
6625 DAG.getLoad(VT: PtrVT, dl, Chain: Store, Ptr: Ix, PtrInfo: MachinePointerInfo());
6626 MemOpChains.push_back(Elt: Load.getValue(R: 1));
6627 RegsToPass.push_back(Elt: std::make_pair(x: GPR[GPR_idx++], y&: Load));
6628 }
6629 break;
6630 }
6631
6632 // Non-varargs Altivec params go into VRs or on the stack.
6633 if (VR_idx != NumVRs) {
6634 RegsToPass.push_back(Elt: std::make_pair(x: VR[VR_idx++], y&: Arg));
6635 } else {
6636 if (IsFastCall)
6637 ComputePtrOff();
6638
6639 assert(HasParameterArea &&
6640 "Parameter area must exist to pass an argument in memory.");
6641 LowerMemOpCallTo(DAG, MF, Chain, Arg, PtrOff, SPDiff, ArgOffset,
6642 isPPC64: true, isTailCall: CFlags.IsTailCall, isVector: true, MemOpChains,
6643 TailCallArguments, dl);
6644 if (IsFastCall)
6645 ArgOffset += 16;
6646 }
6647
6648 if (!IsFastCall)
6649 ArgOffset += 16;
6650 break;
6651 }
6652 }
6653
6654 assert((!HasParameterArea || NumBytesActuallyUsed == ArgOffset) &&
6655 "mismatch in size of parameter area");
6656 (void)NumBytesActuallyUsed;
6657
6658 if (!MemOpChains.empty())
6659 Chain = DAG.getNode(Opcode: ISD::TokenFactor, DL: dl, VT: MVT::Other, Ops: MemOpChains);
6660
6661 // Check if this is an indirect call (MTCTR/BCTRL).
6662 // See prepareDescriptorIndirectCall and buildCallOperands for more
6663 // information about calls through function pointers in the 64-bit SVR4 ABI.
6664 if (CFlags.IsIndirect) {
6665 // For 64-bit ELFv2 ABI with PCRel, do not save the TOC of the
6666 // caller in the TOC save area.
6667 if (isTOCSaveRestoreRequired(Subtarget)) {
      assert(!CFlags.IsTailCall && "Indirect tail calls not supported");
6669 // Load r2 into a virtual register and store it to the TOC save area.
6670 setUsesTOCBasePtr(DAG);
6671 SDValue Val = DAG.getCopyFromReg(Chain, dl, Reg: PPC::X2, VT: MVT::i64);
6672 // TOC save area offset.
6673 unsigned TOCSaveOffset = Subtarget.getFrameLowering()->getTOCSaveOffset();
6674 SDValue PtrOff = DAG.getIntPtrConstant(Val: TOCSaveOffset, DL: dl);
6675 SDValue AddPtr = DAG.getNode(Opcode: ISD::ADD, DL: dl, VT: PtrVT, N1: StackPtr, N2: PtrOff);
6676 Chain = DAG.getStore(Chain: Val.getValue(R: 1), dl, Val, Ptr: AddPtr,
6677 PtrInfo: MachinePointerInfo::getStack(
6678 MF&: DAG.getMachineFunction(), Offset: TOCSaveOffset));
6679 }
6680 // In the ELFv2 ABI, R12 must contain the address of an indirect callee.
6681 // This does not mean the MTCTR instruction must use R12; it's easier
6682 // to model this as an extra parameter, so do that.
6683 if (isELFv2ABI && !CFlags.IsPatchPoint)
6684 RegsToPass.push_back(Elt: std::make_pair(x: (unsigned)PPC::X12, y&: Callee));
6685 }
6686
6687 // Build a sequence of copy-to-reg nodes chained together with token chain
6688 // and flag operands which copy the outgoing args into the appropriate regs.
6689 SDValue InGlue;
6690 for (const auto &[Reg, N] : RegsToPass) {
6691 Chain = DAG.getCopyToReg(Chain, dl, Reg, N, Glue: InGlue);
6692 InGlue = Chain.getValue(R: 1);
6693 }
6694
6695 if (CFlags.IsTailCall && !IsSibCall)
6696 PrepareTailCall(DAG, InGlue, Chain, dl, SPDiff, NumBytes, LROp, FPOp,
6697 TailCallArguments);
6698
6699 return FinishCall(CFlags, dl, DAG, RegsToPass, Glue: InGlue, Chain, CallSeqStart,
6700 Callee, SPDiff, NumBytes, Ins, InVals, CB);
6701}
6702
6703// Returns true when the shadow of a general purpose argument register
6704// in the parameter save area is aligned to at least 'RequiredAlign'.
6705static bool isGPRShadowAligned(MCPhysReg Reg, Align RequiredAlign) {
6706 assert(RequiredAlign.value() <= 16 &&
6707 "Required alignment greater than stack alignment.");
6708 switch (Reg) {
6709 default:
6710 report_fatal_error(reason: "called on invalid register.");
6711 case PPC::R5:
6712 case PPC::R9:
6713 case PPC::X3:
6714 case PPC::X5:
6715 case PPC::X7:
6716 case PPC::X9:
    // These registers are 16-byte aligned, which is the strictest alignment
    // we can support.
6719 return true;
6720 case PPC::R3:
6721 case PPC::R7:
6722 case PPC::X4:
6723 case PPC::X6:
6724 case PPC::X8:
6725 case PPC::X10:
6726 // The shadow of these registers in the PSA is 8 byte aligned.
6727 return RequiredAlign <= 8;
6728 case PPC::R4:
6729 case PPC::R6:
6730 case PPC::R8:
6731 case PPC::R10:
6732 return RequiredAlign <= 4;
6733 }
6734}
6735
6736static bool CC_AIX(unsigned ValNo, MVT ValVT, MVT LocVT,
6737 CCValAssign::LocInfo LocInfo, ISD::ArgFlagsTy ArgFlags,
6738 Type *OrigTy, CCState &State) {
6739 const PPCSubtarget &Subtarget = static_cast<const PPCSubtarget &>(
6740 State.getMachineFunction().getSubtarget());
6741 const bool IsPPC64 = Subtarget.isPPC64();
6742 const unsigned PtrSize = IsPPC64 ? 8 : 4;
6743 const Align PtrAlign(PtrSize);
6744 const Align StackAlign(16);
6745 const MVT RegVT = Subtarget.getScalarIntVT();
6746
6747 if (ValVT == MVT::f128)
6748 report_fatal_error(reason: "f128 is unimplemented on AIX.");
6749
6750 static const MCPhysReg GPR_32[] = {// 32-bit registers.
6751 PPC::R3, PPC::R4, PPC::R5, PPC::R6,
6752 PPC::R7, PPC::R8, PPC::R9, PPC::R10};
6753 static const MCPhysReg GPR_64[] = {// 64-bit registers.
6754 PPC::X3, PPC::X4, PPC::X5, PPC::X6,
6755 PPC::X7, PPC::X8, PPC::X9, PPC::X10};
6756
6757 static const MCPhysReg VR[] = {// Vector registers.
6758 PPC::V2, PPC::V3, PPC::V4, PPC::V5,
6759 PPC::V6, PPC::V7, PPC::V8, PPC::V9,
6760 PPC::V10, PPC::V11, PPC::V12, PPC::V13};
6761
6762 const ArrayRef<MCPhysReg> GPRs = IsPPC64 ? GPR_64 : GPR_32;
6763
6764 if (ArgFlags.isNest()) {
6765 MCRegister EnvReg = State.AllocateReg(Reg: IsPPC64 ? PPC::X11 : PPC::R11);
6766 if (!EnvReg)
      report_fatal_error(reason: "More than one nest argument.");
6768 State.addLoc(V: CCValAssign::getReg(ValNo, ValVT, Reg: EnvReg, LocVT: RegVT, HTP: LocInfo));
6769 return false;
6770 }
6771
6772 if (ArgFlags.isByVal()) {
6773 const Align ByValAlign(ArgFlags.getNonZeroByValAlign());
6774 if (ByValAlign > StackAlign)
6775 report_fatal_error(reason: "Pass-by-value arguments with alignment greater than "
6776 "16 are not supported.");
6777
6778 const unsigned ByValSize = ArgFlags.getByValSize();
6779 const Align ObjAlign = ByValAlign > PtrAlign ? ByValAlign : PtrAlign;
6780
6781 // An empty aggregate parameter takes up no storage and no registers,
6782 // but needs a MemLoc for a stack slot for the formal arguments side.
6783 if (ByValSize == 0) {
6784 State.addLoc(V: CCValAssign::getMem(ValNo, ValVT: MVT::INVALID_SIMPLE_VALUE_TYPE,
6785 Offset: State.getStackSize(), LocVT: RegVT, HTP: LocInfo));
6786 return false;
6787 }
6788
6789 // Shadow allocate any registers that are not properly aligned.
6790 unsigned NextReg = State.getFirstUnallocated(Regs: GPRs);
6791 while (NextReg != GPRs.size() &&
6792 !isGPRShadowAligned(Reg: GPRs[NextReg], RequiredAlign: ObjAlign)) {
      // Shadow allocate the next register since its alignment is not strict
      // enough.
6794 MCRegister Reg = State.AllocateReg(Regs: GPRs);
6795 // Allocate the stack space shadowed by said register.
6796 State.AllocateStack(Size: PtrSize, Alignment: PtrAlign);
      assert(Reg && "Allocating register unexpectedly failed.");
6798 (void)Reg;
6799 NextReg = State.getFirstUnallocated(Regs: GPRs);
6800 }
6801
6802 const unsigned StackSize = alignTo(Size: ByValSize, A: ObjAlign);
6803 unsigned Offset = State.AllocateStack(Size: StackSize, Alignment: ObjAlign);
6804 for (const unsigned E = Offset + StackSize; Offset < E; Offset += PtrSize) {
6805 if (MCRegister Reg = State.AllocateReg(Regs: GPRs))
6806 State.addLoc(V: CCValAssign::getReg(ValNo, ValVT, Reg, LocVT: RegVT, HTP: LocInfo));
6807 else {
6808 State.addLoc(V: CCValAssign::getMem(ValNo, ValVT: MVT::INVALID_SIMPLE_VALUE_TYPE,
6809 Offset, LocVT: MVT::INVALID_SIMPLE_VALUE_TYPE,
6810 HTP: LocInfo));
6811 break;
6812 }
6813 }
6814 return false;
6815 }
6816
6817 // Arguments always reserve parameter save area.
6818 switch (ValVT.SimpleTy) {
6819 default:
6820 report_fatal_error(reason: "Unhandled value type for argument.");
6821 case MVT::i64:
6822 // i64 arguments should have been split to i32 for PPC32.
6823 assert(IsPPC64 && "PPC32 should have split i64 values.");
6824 [[fallthrough]];
6825 case MVT::i1:
6826 case MVT::i32: {
6827 const unsigned Offset = State.AllocateStack(Size: PtrSize, Alignment: PtrAlign);
6828 // AIX integer arguments are always passed in register width.
6829 if (ValVT.getFixedSizeInBits() < RegVT.getFixedSizeInBits())
6830 LocInfo = ArgFlags.isSExt() ? CCValAssign::LocInfo::SExt
6831 : CCValAssign::LocInfo::ZExt;
6832 if (MCRegister Reg = State.AllocateReg(Regs: GPRs))
6833 State.addLoc(V: CCValAssign::getReg(ValNo, ValVT, Reg, LocVT: RegVT, HTP: LocInfo));
6834 else
6835 State.addLoc(V: CCValAssign::getMem(ValNo, ValVT, Offset, LocVT: RegVT, HTP: LocInfo));
6836
6837 return false;
6838 }
6839 case MVT::f32:
6840 case MVT::f64: {
    // The parameter save area (PSA) is reserved even if the float is passed
    // in an FPR.
6842 const unsigned StoreSize = LocVT.getStoreSize();
6843 // Floats are always 4-byte aligned in the PSA on AIX.
6844 // This includes f64 in 64-bit mode for ABI compatibility.
6845 const unsigned Offset =
6846 State.AllocateStack(Size: IsPPC64 ? 8 : StoreSize, Alignment: Align(4));
6847 MCRegister FReg = State.AllocateReg(Regs: FPR);
6848 if (FReg)
6849 State.addLoc(V: CCValAssign::getReg(ValNo, ValVT, Reg: FReg, LocVT, HTP: LocInfo));
6850
6851 // Reserve and initialize GPRs or initialize the PSA as required.
6852 for (unsigned I = 0; I < StoreSize; I += PtrSize) {
6853 if (MCRegister Reg = State.AllocateReg(Regs: GPRs)) {
6854 assert(FReg && "An FPR should be available when a GPR is reserved.");
6855 if (State.isVarArg()) {
6856 // Successfully reserved GPRs are only initialized for vararg calls.
6857 // Custom handling is required for:
6858 // f64 in PPC32 needs to be split into 2 GPRs.
          // f32 in PPC64 needs to occupy only the lower 32 bits of a 64-bit
          // GPR.
6860 State.addLoc(
6861 V: CCValAssign::getCustomReg(ValNo, ValVT, Reg, LocVT: RegVT, HTP: LocInfo));
6862 }
6863 } else {
        // If there are insufficient GPRs, the PSA needs to be initialized.
        // For compatibility with the AIX XL compiler, initialization occurs
        // even if an FPR was initialized. The full memory for the argument
        // will be initialized even if a prior word is saved in a GPR.
        // A custom MemLoc is used when the argument is also passed in an FPR
        // so that the callee handling can skip over it easily.
6870 State.addLoc(
6871 V: FReg ? CCValAssign::getCustomMem(ValNo, ValVT, Offset, LocVT,
6872 HTP: LocInfo)
6873 : CCValAssign::getMem(ValNo, ValVT, Offset, LocVT, HTP: LocInfo));
6874 break;
6875 }
6876 }
6877
6878 return false;
6879 }
6880 case MVT::v4f32:
6881 case MVT::v4i32:
6882 case MVT::v8i16:
6883 case MVT::v16i8:
6884 case MVT::v2i64:
6885 case MVT::v2f64:
6886 case MVT::v1i128: {
6887 const unsigned VecSize = 16;
6888 const Align VecAlign(VecSize);
6889
6890 if (!State.isVarArg()) {
6891 // If there are vector registers remaining we don't consume any stack
6892 // space.
6893 if (MCRegister VReg = State.AllocateReg(Regs: VR)) {
6894 State.addLoc(V: CCValAssign::getReg(ValNo, ValVT, Reg: VReg, LocVT, HTP: LocInfo));
6895 return false;
6896 }
6897 // Vectors passed on the stack do not shadow GPRs or FPRs even though they
6898 // might be allocated in the portion of the PSA that is shadowed by the
6899 // GPRs.
6900 const unsigned Offset = State.AllocateStack(Size: VecSize, Alignment: VecAlign);
6901 State.addLoc(V: CCValAssign::getMem(ValNo, ValVT, Offset, LocVT, HTP: LocInfo));
6902 return false;
6903 }
6904
6905 unsigned NextRegIndex = State.getFirstUnallocated(Regs: GPRs);
6906 // Burn any underaligned registers and their shadowed stack space until
6907 // we reach the required alignment.
6908 while (NextRegIndex != GPRs.size() &&
6909 !isGPRShadowAligned(Reg: GPRs[NextRegIndex], RequiredAlign: VecAlign)) {
6910 // Shadow allocate register and its stack shadow.
6911 MCRegister Reg = State.AllocateReg(Regs: GPRs);
6912 State.AllocateStack(Size: PtrSize, Alignment: PtrAlign);
6913 assert(Reg && "Allocating register unexpectedly failed.");
6914 (void)Reg;
6915 NextRegIndex = State.getFirstUnallocated(Regs: GPRs);
6916 }
6917
6918 // Vectors that are passed as fixed arguments are handled differently.
6919 // They are passed in VRs if any are available (unlike arguments passed
    // through ellipses) and shadow GPRs (unlike arguments to non-vararg
    // functions).
6922 if (!ArgFlags.isVarArg()) {
6923 if (MCRegister VReg = State.AllocateReg(Regs: VR)) {
6924 State.addLoc(V: CCValAssign::getReg(ValNo, ValVT, Reg: VReg, LocVT, HTP: LocInfo));
6925 // Shadow allocate GPRs and stack space even though we pass in a VR.
6926 for (unsigned I = 0; I != VecSize; I += PtrSize)
6927 State.AllocateReg(Regs: GPRs);
6928 State.AllocateStack(Size: VecSize, Alignment: VecAlign);
6929 return false;
6930 }
6931 // No vector registers remain so pass on the stack.
6932 const unsigned Offset = State.AllocateStack(Size: VecSize, Alignment: VecAlign);
6933 State.addLoc(V: CCValAssign::getMem(ValNo, ValVT, Offset, LocVT, HTP: LocInfo));
6934 return false;
6935 }
6936
6937 // If all GPRS are consumed then we pass the argument fully on the stack.
6938 if (NextRegIndex == GPRs.size()) {
6939 const unsigned Offset = State.AllocateStack(Size: VecSize, Alignment: VecAlign);
6940 State.addLoc(V: CCValAssign::getMem(ValNo, ValVT, Offset, LocVT, HTP: LocInfo));
6941 return false;
6942 }
6943
6944 // Corner case for 32-bit codegen. We have 2 registers to pass the first
6945 // half of the argument, and then need to pass the remaining half on the
6946 // stack.
6947 if (GPRs[NextRegIndex] == PPC::R9) {
6948 const unsigned Offset = State.AllocateStack(Size: VecSize, Alignment: VecAlign);
6949 State.addLoc(
6950 V: CCValAssign::getCustomMem(ValNo, ValVT, Offset, LocVT, HTP: LocInfo));
6951
6952 const MCRegister FirstReg = State.AllocateReg(Reg: PPC::R9);
6953 const MCRegister SecondReg = State.AllocateReg(Reg: PPC::R10);
6954 assert(FirstReg && SecondReg &&
6955 "Allocating R9 or R10 unexpectedly failed.");
6956 State.addLoc(
6957 V: CCValAssign::getCustomReg(ValNo, ValVT, Reg: FirstReg, LocVT: RegVT, HTP: LocInfo));
6958 State.addLoc(
6959 V: CCValAssign::getCustomReg(ValNo, ValVT, Reg: SecondReg, LocVT: RegVT, HTP: LocInfo));
6960 return false;
6961 }
6962
6963 // We have enough GPRs to fully pass the vector argument, and we have
6964 // already consumed any underaligned registers. Start with the custom
6965 // MemLoc and then the custom RegLocs.
6966 const unsigned Offset = State.AllocateStack(Size: VecSize, Alignment: VecAlign);
6967 State.addLoc(
6968 V: CCValAssign::getCustomMem(ValNo, ValVT, Offset, LocVT, HTP: LocInfo));
6969 for (unsigned I = 0; I != VecSize; I += PtrSize) {
6970 const MCRegister Reg = State.AllocateReg(Regs: GPRs);
      assert(Reg && "Failed to allocate register for vararg vector argument");
6972 State.addLoc(
6973 V: CCValAssign::getCustomReg(ValNo, ValVT, Reg, LocVT: RegVT, HTP: LocInfo));
6974 }
6975 return false;
6976 }
6977 }
6978 return true;
6979}
6980
6981// So far, this function is only used by LowerFormalArguments_AIX()
6982static const TargetRegisterClass *getRegClassForSVT(MVT::SimpleValueType SVT,
6983 bool IsPPC64,
6984 bool HasP8Vector,
6985 bool HasVSX) {
6986 assert((IsPPC64 || SVT != MVT::i64) &&
6987 "i64 should have been split for 32-bit codegen.");
6988
6989 switch (SVT) {
6990 default:
6991 report_fatal_error(reason: "Unexpected value type for formal argument");
6992 case MVT::i1:
6993 case MVT::i32:
6994 case MVT::i64:
6995 return IsPPC64 ? &PPC::G8RCRegClass : &PPC::GPRCRegClass;
6996 case MVT::f32:
6997 return HasP8Vector ? &PPC::VSSRCRegClass : &PPC::F4RCRegClass;
6998 case MVT::f64:
6999 return HasVSX ? &PPC::VSFRCRegClass : &PPC::F8RCRegClass;
7000 case MVT::v4f32:
7001 case MVT::v4i32:
7002 case MVT::v8i16:
7003 case MVT::v16i8:
7004 case MVT::v2i64:
7005 case MVT::v2f64:
7006 case MVT::v1i128:
7007 return &PPC::VRRCRegClass;
7008 }
7009}
7010
7011static SDValue truncateScalarIntegerArg(ISD::ArgFlagsTy Flags, EVT ValVT,
7012 SelectionDAG &DAG, SDValue ArgValue,
7013 MVT LocVT, const SDLoc &dl) {
7014 assert(ValVT.isScalarInteger() && LocVT.isScalarInteger());
7015 assert(ValVT.getFixedSizeInBits() < LocVT.getFixedSizeInBits());
7016
7017 if (Flags.isSExt())
7018 ArgValue = DAG.getNode(Opcode: ISD::AssertSext, DL: dl, VT: LocVT, N1: ArgValue,
7019 N2: DAG.getValueType(ValVT));
7020 else if (Flags.isZExt())
7021 ArgValue = DAG.getNode(Opcode: ISD::AssertZext, DL: dl, VT: LocVT, N1: ArgValue,
7022 N2: DAG.getValueType(ValVT));
7023
7024 return DAG.getNode(Opcode: ISD::TRUNCATE, DL: dl, VT: ValVT, Operand: ArgValue);
7025}
7026
7027static unsigned mapArgRegToOffsetAIX(unsigned Reg, const PPCFrameLowering *FL) {
7028 const unsigned LASize = FL->getLinkageSize();
7029
7030 if (PPC::GPRCRegClass.contains(Reg)) {
7031 assert(Reg >= PPC::R3 && Reg <= PPC::R10 &&
7032 "Reg must be a valid argument register!");
7033 return LASize + 4 * (Reg - PPC::R3);
7034 }
7035
7036 if (PPC::G8RCRegClass.contains(Reg)) {
7037 assert(Reg >= PPC::X3 && Reg <= PPC::X10 &&
7038 "Reg must be a valid argument register!");
7039 return LASize + 8 * (Reg - PPC::X3);
7040 }
7041
7042 llvm_unreachable("Only general purpose registers expected.");
7043}
7044
7045// AIX ABI Stack Frame Layout:
7046//
7047// Low Memory +--------------------------------------------+
7048// SP +---> | Back chain | ---+
7049// | +--------------------------------------------+ |
7050// | | Saved Condition Register | |
7051// | +--------------------------------------------+ |
7052// | | Saved Linkage Register | |
7053// | +--------------------------------------------+ | Linkage Area
7054// | | Reserved for compilers | |
7055// | +--------------------------------------------+ |
7056// | | Reserved for binders | |
7057// | +--------------------------------------------+ |
7058// | | Saved TOC pointer | ---+
7059// | +--------------------------------------------+
7060// | | Parameter save area |
7061// | +--------------------------------------------+
7062// | | Alloca space |
7063// | +--------------------------------------------+
7064// | | Local variable space |
7065// | +--------------------------------------------+
7066// | | Float/int conversion temporary |
7067// | +--------------------------------------------+
7068// | | Save area for AltiVec registers |
7069// | +--------------------------------------------+
7070// | | AltiVec alignment padding |
7071// | +--------------------------------------------+
7072// | | Save area for VRSAVE register |
7073// | +--------------------------------------------+
7074// | | Save area for General Purpose registers |
7075// | +--------------------------------------------+
7076// | | Save area for Floating Point registers |
7077// | +--------------------------------------------+
7078// +---- | Back chain |
7079// High Memory +--------------------------------------------+
7080//
7081// Specifications:
7082// AIX 7.2 Assembler Language Reference
7083// Subroutine linkage convention
7084
7085SDValue PPCTargetLowering::LowerFormalArguments_AIX(
7086 SDValue Chain, CallingConv::ID CallConv, bool isVarArg,
7087 const SmallVectorImpl<ISD::InputArg> &Ins, const SDLoc &dl,
7088 SelectionDAG &DAG, SmallVectorImpl<SDValue> &InVals) const {
7089
7090 assert((CallConv == CallingConv::C || CallConv == CallingConv::Cold ||
7091 CallConv == CallingConv::Fast) &&
7092 "Unexpected calling convention!");
7093
7094 if (getTargetMachine().Options.GuaranteedTailCallOpt)
7095 report_fatal_error(reason: "Tail call support is unimplemented on AIX.");
7096
7097 if (useSoftFloat())
7098 report_fatal_error(reason: "Soft float support is unimplemented on AIX.");
7099
7100 const PPCSubtarget &Subtarget = DAG.getSubtarget<PPCSubtarget>();
7101
7102 const bool IsPPC64 = Subtarget.isPPC64();
7103 const unsigned PtrByteSize = IsPPC64 ? 8 : 4;
7104
7105 // Assign locations to all of the incoming arguments.
7106 SmallVector<CCValAssign, 16> ArgLocs;
7107 MachineFunction &MF = DAG.getMachineFunction();
7108 MachineFrameInfo &MFI = MF.getFrameInfo();
7109 PPCFunctionInfo *FuncInfo = MF.getInfo<PPCFunctionInfo>();
7110 CCState CCInfo(CallConv, isVarArg, MF, ArgLocs, *DAG.getContext());
7111
7112 const EVT PtrVT = getPointerTy(DL: MF.getDataLayout());
7113 // Reserve space for the linkage area on the stack.
7114 const unsigned LinkageSize = Subtarget.getFrameLowering()->getLinkageSize();
7115 CCInfo.AllocateStack(Size: LinkageSize, Alignment: Align(PtrByteSize));
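  // When the "save-reg-params" attribute is present, incoming register
  // arguments are also spilled to the parameter save area; SaveStackPos
  // tracks the next spill offset.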
7116 uint64_t SaveStackPos = CCInfo.getStackSize();
7117 bool SaveParams = MF.getFunction().hasFnAttribute(Kind: "save-reg-params");
7118 CCInfo.AnalyzeFormalArguments(Ins, Fn: CC_AIX);
7119
7120 SmallVector<SDValue, 8> MemOps;
7121
7122 for (size_t I = 0, End = ArgLocs.size(); I != End; /* No increment here */) {
7123 CCValAssign &VA = ArgLocs[I++];
7124 MVT LocVT = VA.getLocVT();
7125 MVT ValVT = VA.getValVT();
7126 ISD::ArgFlagsTy Flags = Ins[VA.getValNo()].Flags;
7127
7128 EVT ArgVT = Ins[VA.getValNo()].ArgVT;
7129 bool ArgSignExt = Ins[VA.getValNo()].Flags.isSExt();
7130 // For compatibility with the AIX XL compiler, the float args in the
7131 // parameter save area are initialized even if the argument is available
7132 // in register. The caller is required to initialize both the register
7133 // and memory, however, the callee can choose to expect it in either.
7134 // The memloc is dismissed here because the argument is retrieved from
7135 // the register.
7136 if (VA.isMemLoc() && VA.needsCustom() && ValVT.isFloatingPoint())
7137 continue;
7138
7139 if (SaveParams && VA.isRegLoc() && !Flags.isByVal() && !VA.needsCustom()) {
7140 const TargetRegisterClass *RegClass = getRegClassForSVT(
7141 SVT: LocVT.SimpleTy, IsPPC64, HasP8Vector: Subtarget.hasP8Vector(), HasVSX: Subtarget.hasVSX());
      // On PPC64, the debugger assumes extended 8-byte values are stored from
      // a GPR.
7143 MVT SaveVT = RegClass == &PPC::G8RCRegClass ? MVT::i64 : LocVT;
7144 const Register VReg = MF.addLiveIn(PReg: VA.getLocReg(), RC: RegClass);
7145 SDValue Parm = DAG.getCopyFromReg(Chain, dl, Reg: VReg, VT: SaveVT);
7146 int FI = MFI.CreateFixedObject(Size: SaveVT.getStoreSize(), SPOffset: SaveStackPos, IsImmutable: true);
7147 SDValue FIN = DAG.getFrameIndex(FI, VT: PtrVT);
7148 SDValue StoreReg = DAG.getStore(Chain, dl, Val: Parm, Ptr: FIN,
7149 PtrInfo: MachinePointerInfo(), Alignment: Align(PtrByteSize));
7150 SaveStackPos = alignTo(Value: SaveStackPos + SaveVT.getStoreSize(), Align: PtrByteSize);
7151 MemOps.push_back(Elt: StoreReg);
7152 }
7153
7154 if (SaveParams && (VA.isMemLoc() || Flags.isByVal()) && !VA.needsCustom()) {
7155 unsigned StoreSize =
7156 Flags.isByVal() ? Flags.getByValSize() : LocVT.getStoreSize();
7157 SaveStackPos = alignTo(Value: SaveStackPos + StoreSize, Align: PtrByteSize);
7158 }
7159
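    // Creates a fixed stack object for an argument passed in memory and loads
    // it, right-justifying values that are smaller than their stack slot.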
7160 auto HandleMemLoc = [&]() {
7161 const unsigned LocSize = LocVT.getStoreSize();
7162 const unsigned ValSize = ValVT.getStoreSize();
7163 assert((ValSize <= LocSize) &&
7164 "Object size is larger than size of MemLoc");
7165 int CurArgOffset = VA.getLocMemOffset();
7166 // Objects are right-justified because AIX is big-endian.
7167 if (LocSize > ValSize)
7168 CurArgOffset += LocSize - ValSize;
7169 // Potential tail calls could cause overwriting of argument stack slots.
7170 const bool IsImmutable =
7171 !(getTargetMachine().Options.GuaranteedTailCallOpt &&
7172 (CallConv == CallingConv::Fast));
7173 int FI = MFI.CreateFixedObject(Size: ValSize, SPOffset: CurArgOffset, IsImmutable);
7174 SDValue FIN = DAG.getFrameIndex(FI, VT: PtrVT);
7175 SDValue ArgValue =
7176 DAG.getLoad(VT: ValVT, dl, Chain, Ptr: FIN, PtrInfo: MachinePointerInfo());
7177
7178 // While the ABI specifies the argument type is (sign or zero) extended
7179 // out to register width, not all code is compliant. We truncate and
7180 // re-extend to be more forgiving of these callers when the argument type
7181 // is smaller than register width.
7182 if (!ArgVT.isVector() && !ValVT.isVector() && ArgVT.isInteger() &&
7183 ValVT.isInteger() &&
7184 ArgVT.getScalarSizeInBits() < ValVT.getScalarSizeInBits()) {
7185 // It is possible to have either real integer values
7186 // or integers that were not originally integers.
        // In the latter case, these could have come from structs,
7188 // and these integers would not have an extend on the parameter.
7189 // Since these types of integers do not have an extend specified
7190 // in the first place, the type of extend that we do should not matter.
7191 EVT TruncatedArgVT = ArgVT.isSimple() && ArgVT.getSimpleVT() == MVT::i1
7192 ? MVT::i8
7193 : ArgVT;
7194 SDValue ArgValueTrunc =
7195 DAG.getNode(Opcode: ISD::TRUNCATE, DL: dl, VT: TruncatedArgVT, Operand: ArgValue);
7196 SDValue ArgValueExt =
7197 ArgSignExt ? DAG.getSExtOrTrunc(Op: ArgValueTrunc, DL: dl, VT: ValVT)
7198 : DAG.getZExtOrTrunc(Op: ArgValueTrunc, DL: dl, VT: ValVT);
7199 InVals.push_back(Elt: ArgValueExt);
7200 } else {
7201 InVals.push_back(Elt: ArgValue);
7202 }
7203 };
7204
7205 // Vector arguments to VaArg functions are passed both on the stack, and
7206 // in any available GPRs. Load the value from the stack and add the GPRs
7207 // as live ins.
7208 if (VA.isMemLoc() && VA.needsCustom()) {
7209 assert(ValVT.isVector() && "Unexpected Custom MemLoc type.");
7210 assert(isVarArg && "Only use custom memloc for vararg.");
7211 // ValNo of the custom MemLoc, so we can compare it to the ValNo of the
7212 // matching custom RegLocs.
7213 const unsigned OriginalValNo = VA.getValNo();
7214 (void)OriginalValNo;
7215
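      // Claims the next custom RegLoc and marks its GPR as a live-in; the
      // argument value itself is loaded from the stack slot by HandleMemLoc.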
7216 auto HandleCustomVecRegLoc = [&]() {
7217 assert(I != End && ArgLocs[I].isRegLoc() && ArgLocs[I].needsCustom() &&
7218 "Missing custom RegLoc.");
7219 VA = ArgLocs[I++];
7220 assert(VA.getValVT().isVector() &&
7221 "Unexpected Val type for custom RegLoc.");
7222 assert(VA.getValNo() == OriginalValNo &&
7223 "ValNo mismatch between custom MemLoc and RegLoc.");
7224 MVT::SimpleValueType SVT = VA.getLocVT().SimpleTy;
7225 MF.addLiveIn(PReg: VA.getLocReg(),
7226 RC: getRegClassForSVT(SVT, IsPPC64, HasP8Vector: Subtarget.hasP8Vector(),
7227 HasVSX: Subtarget.hasVSX()));
7228 };
7229
7230 HandleMemLoc();
7231 // In 64-bit there will be exactly 2 custom RegLocs that follow, and in
      // 32-bit there will be 2 custom RegLocs if we are passing in R9 and
7233 // R10.
7234 HandleCustomVecRegLoc();
7235 HandleCustomVecRegLoc();
7236
7237 // If we are targeting 32-bit, there might be 2 extra custom RegLocs if
7238 // we passed the vector in R5, R6, R7 and R8.
7239 if (I != End && ArgLocs[I].isRegLoc() && ArgLocs[I].needsCustom()) {
7240 assert(!IsPPC64 &&
7241 "Only 2 custom RegLocs expected for 64-bit codegen.");
7242 HandleCustomVecRegLoc();
7243 HandleCustomVecRegLoc();
7244 }
7245
7246 continue;
7247 }
7248
7249 if (VA.isRegLoc()) {
7250 if (VA.getValVT().isScalarInteger())
7251 FuncInfo->appendParameterType(Type: PPCFunctionInfo::FixedType);
7252 else if (VA.getValVT().isFloatingPoint() && !VA.getValVT().isVector()) {
7253 switch (VA.getValVT().SimpleTy) {
7254 default:
7255 report_fatal_error(reason: "Unhandled value type for argument.");
7256 case MVT::f32:
7257 FuncInfo->appendParameterType(Type: PPCFunctionInfo::ShortFloatingPoint);
7258 break;
7259 case MVT::f64:
7260 FuncInfo->appendParameterType(Type: PPCFunctionInfo::LongFloatingPoint);
7261 break;
7262 }
7263 } else if (VA.getValVT().isVector()) {
7264 switch (VA.getValVT().SimpleTy) {
7265 default:
7266 report_fatal_error(reason: "Unhandled value type for argument.");
7267 case MVT::v16i8:
7268 FuncInfo->appendParameterType(Type: PPCFunctionInfo::VectorChar);
7269 break;
7270 case MVT::v8i16:
7271 FuncInfo->appendParameterType(Type: PPCFunctionInfo::VectorShort);
7272 break;
7273 case MVT::v4i32:
7274 case MVT::v2i64:
7275 case MVT::v1i128:
7276 FuncInfo->appendParameterType(Type: PPCFunctionInfo::VectorInt);
7277 break;
7278 case MVT::v4f32:
7279 case MVT::v2f64:
7280 FuncInfo->appendParameterType(Type: PPCFunctionInfo::VectorFloat);
7281 break;
7282 }
7283 }
7284 }
7285
7286 if (Flags.isByVal() && VA.isMemLoc()) {
7287 const unsigned Size =
7288 alignTo(Value: Flags.getByValSize() ? Flags.getByValSize() : PtrByteSize,
7289 Align: PtrByteSize);
7290 const int FI = MF.getFrameInfo().CreateFixedObject(
7291 Size, SPOffset: VA.getLocMemOffset(), /* IsImmutable */ false,
7292 /* IsAliased */ isAliased: true);
7293 SDValue FIN = DAG.getFrameIndex(FI, VT: PtrVT);
7294 InVals.push_back(Elt: FIN);
7295
7296 continue;
7297 }
7298
7299 if (Flags.isByVal()) {
7300 assert(VA.isRegLoc() && "MemLocs should already be handled.");
7301
7302 const MCPhysReg ArgReg = VA.getLocReg();
7303 const PPCFrameLowering *FL = Subtarget.getFrameLowering();
7304
7305 const unsigned StackSize = alignTo(Value: Flags.getByValSize(), Align: PtrByteSize);
7306 const int FI = MF.getFrameInfo().CreateFixedObject(
7307 Size: StackSize, SPOffset: mapArgRegToOffsetAIX(Reg: ArgReg, FL), /* IsImmutable */ false,
7308 /* IsAliased */ isAliased: true);
7309 SDValue FIN = DAG.getFrameIndex(FI, VT: PtrVT);
7310 InVals.push_back(Elt: FIN);
7311
7312 // Add live ins for all the RegLocs for the same ByVal.
7313 const TargetRegisterClass *RegClass =
7314 IsPPC64 ? &PPC::G8RCRegClass : &PPC::GPRCRegClass;
7315
7316 auto HandleRegLoc = [&, RegClass, LocVT](const MCPhysReg PhysReg,
7317 unsigned Offset) {
7318 const Register VReg = MF.addLiveIn(PReg: PhysReg, RC: RegClass);
        // Since the caller's side has left-justified the aggregate in the
7320 // register, we can simply store the entire register into the stack
7321 // slot.
7322 SDValue CopyFrom = DAG.getCopyFromReg(Chain, dl, Reg: VReg, VT: LocVT);
        // The store to the fixedstack object is needed because accessing a
        // field of the ByVal will use a gep and load. Ideally we will optimize
        // to extracting the value from the register directly, and elide the
        // stores when the argument's address is not taken, but that will need
        // to be future work.
7328 SDValue Store = DAG.getStore(
7329 Chain: CopyFrom.getValue(R: 1), dl, Val: CopyFrom,
7330 Ptr: DAG.getObjectPtrOffset(SL: dl, Ptr: FIN, Offset: TypeSize::getFixed(ExactSize: Offset)),
7331 PtrInfo: MachinePointerInfo::getFixedStack(MF, FI, Offset));
7332
7333 MemOps.push_back(Elt: Store);
7334 };
7335
7336 unsigned Offset = 0;
7337 HandleRegLoc(VA.getLocReg(), Offset);
7338 Offset += PtrByteSize;
7339 for (; Offset != StackSize && ArgLocs[I].isRegLoc();
7340 Offset += PtrByteSize) {
7341 assert(ArgLocs[I].getValNo() == VA.getValNo() &&
7342 "RegLocs should be for ByVal argument.");
7343
7344 const CCValAssign RL = ArgLocs[I++];
7345 HandleRegLoc(RL.getLocReg(), Offset);
7346 FuncInfo->appendParameterType(Type: PPCFunctionInfo::FixedType);
7347 }
7348
7349 if (Offset != StackSize) {
7350 assert(ArgLocs[I].getValNo() == VA.getValNo() &&
7351 "Expected MemLoc for remaining bytes.");
7352 assert(ArgLocs[I].isMemLoc() && "Expected MemLoc for remaining bytes.");
        // Consume the MemLoc. The InVal has already been emitted, so nothing
7354 // more needs to be done.
7355 ++I;
7356 }
7357
7358 continue;
7359 }
7360
7361 if (VA.isRegLoc() && !VA.needsCustom()) {
7362 MVT::SimpleValueType SVT = ValVT.SimpleTy;
7363 Register VReg =
7364 MF.addLiveIn(PReg: VA.getLocReg(),
7365 RC: getRegClassForSVT(SVT, IsPPC64, HasP8Vector: Subtarget.hasP8Vector(),
7366 HasVSX: Subtarget.hasVSX()));
7367 SDValue ArgValue = DAG.getCopyFromReg(Chain, dl, Reg: VReg, VT: LocVT);
7368 if (ValVT.isScalarInteger() &&
7369 (ValVT.getFixedSizeInBits() < LocVT.getFixedSizeInBits())) {
7370 ArgValue =
7371 truncateScalarIntegerArg(Flags, ValVT, DAG, ArgValue, LocVT, dl);
7372 }
7373 InVals.push_back(Elt: ArgValue);
7374 continue;
7375 }
7376 if (VA.isMemLoc()) {
7377 HandleMemLoc();
7378 continue;
7379 }
7380 }
7381
7382 // On AIX a minimum of 8 words is saved to the parameter save area.
7383 const unsigned MinParameterSaveArea = 8 * PtrByteSize;
7384 // Area that is at least reserved in the caller of this function.
7385 unsigned CallerReservedArea = std::max<unsigned>(
7386 a: CCInfo.getStackSize(), b: LinkageSize + MinParameterSaveArea);
7387
7388 // Set the size that is at least reserved in caller of this function. Tail
7389 // call optimized function's reserved stack space needs to be aligned so
7390 // that taking the difference between two stack areas will result in an
7391 // aligned stack.
7392 CallerReservedArea =
7393 EnsureStackAlignment(Lowering: Subtarget.getFrameLowering(), NumBytes: CallerReservedArea);
7394 FuncInfo->setMinReservedArea(CallerReservedArea);
7395
7396 if (isVarArg) {
7397 int VAListIndex = 0;
7398 // If any of the optional arguments are passed in register then the fixed
7399 // stack object we spill into is not immutable. Create a fixed stack object
7400 // that overlaps the remainder of the parameter save area.
7401 if (CCInfo.getStackSize() < (LinkageSize + MinParameterSaveArea)) {
7402 unsigned FixedStackSize =
7403 LinkageSize + MinParameterSaveArea - CCInfo.getStackSize();
7404 VAListIndex =
7405 MFI.CreateFixedObject(Size: FixedStackSize, SPOffset: CCInfo.getStackSize(),
7406 /* IsImmutable */ false, /* IsAliased */ isAliased: true);
7407 } else {
7408 // All the arguments passed through ellipses are on the stack. Create a
7409 // dummy fixed stack object the same size as a pointer since we don't
7410 // know the actual size.
7411 VAListIndex =
7412 MFI.CreateFixedObject(Size: PtrByteSize, SPOffset: CCInfo.getStackSize(),
7413 /* IsImmutable */ true, /* IsAliased */ isAliased: true);
7414 }
7415
7416 FuncInfo->setVarArgsFrameIndex(VAListIndex);
7417 SDValue FIN = DAG.getFrameIndex(FI: VAListIndex, VT: PtrVT);
7418
7419 static const MCPhysReg GPR_32[] = {PPC::R3, PPC::R4, PPC::R5, PPC::R6,
7420 PPC::R7, PPC::R8, PPC::R9, PPC::R10};
7421
7422 static const MCPhysReg GPR_64[] = {PPC::X3, PPC::X4, PPC::X5, PPC::X6,
7423 PPC::X7, PPC::X8, PPC::X9, PPC::X10};
7424 const unsigned NumGPArgRegs = std::size(IsPPC64 ? GPR_64 : GPR_32);
7425
7426 // The fixed integer arguments of a variadic function are stored to the
7427 // VarArgsFrameIndex on the stack so that they may be loaded by
7428 // dereferencing the result of va_next.
7429 for (unsigned
7430 GPRIndex = (CCInfo.getStackSize() - LinkageSize) / PtrByteSize,
7431 Offset = 0;
7432 GPRIndex < NumGPArgRegs; ++GPRIndex, Offset += PtrByteSize) {
7433
7434 const Register VReg =
7435 IsPPC64 ? MF.addLiveIn(PReg: GPR_64[GPRIndex], RC: &PPC::G8RCRegClass)
7436 : MF.addLiveIn(PReg: GPR_32[GPRIndex], RC: &PPC::GPRCRegClass);
7437
7438 SDValue Val = DAG.getCopyFromReg(Chain, dl, Reg: VReg, VT: PtrVT);
7439 MachinePointerInfo MPI =
7440 MachinePointerInfo::getFixedStack(MF, FI: VAListIndex, Offset);
7441 SDValue Store = DAG.getStore(Chain: Val.getValue(R: 1), dl, Val, Ptr: FIN, PtrInfo: MPI);
7442 MemOps.push_back(Elt: Store);
7443 // Increment the address for the next argument to store.
7444 SDValue PtrOff = DAG.getConstant(Val: PtrByteSize, DL: dl, VT: PtrVT);
7445 FIN = DAG.getNode(Opcode: ISD::ADD, DL: dl, VT: PtrOff.getValueType(), N1: FIN, N2: PtrOff);
7446 }
7447 }
7448
7449 if (!MemOps.empty())
7450 Chain = DAG.getNode(Opcode: ISD::TokenFactor, DL: dl, VT: MVT::Other, Ops: MemOps);
7451
7452 return Chain;
7453}
7454
7455SDValue PPCTargetLowering::LowerCall_AIX(
7456 SDValue Chain, SDValue Callee, CallFlags CFlags,
7457 const SmallVectorImpl<ISD::OutputArg> &Outs,
7458 const SmallVectorImpl<SDValue> &OutVals,
7459 const SmallVectorImpl<ISD::InputArg> &Ins, const SDLoc &dl,
7460 SelectionDAG &DAG, SmallVectorImpl<SDValue> &InVals,
7461 const CallBase *CB) const {
7462 // See PPCTargetLowering::LowerFormalArguments_AIX() for a description of the
7463 // AIX ABI stack frame layout.
7464
7465 assert((CFlags.CallConv == CallingConv::C ||
7466 CFlags.CallConv == CallingConv::Cold ||
7467 CFlags.CallConv == CallingConv::Fast) &&
7468 "Unexpected calling convention!");
7469
7470 if (CFlags.IsPatchPoint)
7471 report_fatal_error(reason: "This call type is unimplemented on AIX.");
7472
7473 const PPCSubtarget &Subtarget = DAG.getSubtarget<PPCSubtarget>();
7474
7475 MachineFunction &MF = DAG.getMachineFunction();
7476 SmallVector<CCValAssign, 16> ArgLocs;
7477 CCState CCInfo(CFlags.CallConv, CFlags.IsVarArg, MF, ArgLocs,
7478 *DAG.getContext());
7479
7480 // Reserve space for the linkage save area (LSA) on the stack.
7481 // In both PPC32 and PPC64 there are 6 reserved slots in the LSA:
7482 // [SP][CR][LR][2 x reserved][TOC].
7483 // The LSA is 24 bytes (6x4) in PPC32 and 48 bytes (6x8) in PPC64.
7484 const unsigned LinkageSize = Subtarget.getFrameLowering()->getLinkageSize();
7485 const bool IsPPC64 = Subtarget.isPPC64();
7486 const EVT PtrVT = getPointerTy(DL: DAG.getDataLayout());
7487 const unsigned PtrByteSize = IsPPC64 ? 8 : 4;
7488 CCInfo.AllocateStack(Size: LinkageSize, Alignment: Align(PtrByteSize));
7489 CCInfo.AnalyzeCallOperands(Outs, Fn: CC_AIX);
7490
7491 // The prolog code of the callee may store up to 8 GPR argument registers to
7492 // the stack, allowing va_start to index over them in memory if the callee
7493 // is variadic.
7494 // Because we cannot tell if this is needed on the caller side, we have to
7495 // conservatively assume that it is needed. As such, make sure we have at
7496 // least enough stack space for the caller to store the 8 GPRs.
7497 const unsigned MinParameterSaveAreaSize = 8 * PtrByteSize;
7498 const unsigned NumBytes = std::max<unsigned>(
7499 a: LinkageSize + MinParameterSaveAreaSize, b: CCInfo.getStackSize());
7500
7501 // Adjust the stack pointer for the new arguments...
7502 // These operations are automatically eliminated by the prolog/epilog pass.
7503 Chain = DAG.getCALLSEQ_START(Chain, InSize: NumBytes, OutSize: 0, DL: dl);
7504 SDValue CallSeqStart = Chain;
7505
7506 SmallVector<std::pair<unsigned, SDValue>, 8> RegsToPass;
7507 SmallVector<SDValue, 8> MemOpChains;
7508
7509 // Set up a copy of the stack pointer for loading and storing any
7510 // arguments that may not fit in the registers available for argument
7511 // passing.
7512 const SDValue StackPtr = IsPPC64 ? DAG.getRegister(Reg: PPC::X1, VT: MVT::i64)
7513 : DAG.getRegister(Reg: PPC::R1, VT: MVT::i32);
7514
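  // Walk the assigned locations and materialize each outgoing argument either
  // into registers (RegsToPass) or into its parameter save area slot
  // (MemOpChains).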
7515 for (unsigned I = 0, E = ArgLocs.size(); I != E;) {
7516 const unsigned ValNo = ArgLocs[I].getValNo();
7517 SDValue Arg = OutVals[ValNo];
7518 ISD::ArgFlagsTy Flags = Outs[ValNo].Flags;
7519
7520 if (Flags.isByVal()) {
7521 const unsigned ByValSize = Flags.getByValSize();
7522
7523 // Nothing to do for zero-sized ByVals on the caller side.
7524 if (!ByValSize) {
7525 ++I;
7526 continue;
7527 }
7528
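      // Loads a pointer-size-or-smaller chunk of the by-val aggregate at the
      // given offset from the caller's copy, zero-extending it to pointer
      // width.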
7529 auto GetLoad = [&](EVT VT, unsigned LoadOffset) {
7530 return DAG.getExtLoad(ExtType: ISD::ZEXTLOAD, dl, VT: PtrVT, Chain,
7531 Ptr: (LoadOffset != 0)
7532 ? DAG.getObjectPtrOffset(
7533 SL: dl, Ptr: Arg, Offset: TypeSize::getFixed(ExactSize: LoadOffset))
7534 : Arg,
7535 PtrInfo: MachinePointerInfo(), MemVT: VT);
7536 };
7537
7538 unsigned LoadOffset = 0;
7539
      // Initialize the registers that are fully occupied by the by-val
      // argument.
7541 while (LoadOffset + PtrByteSize <= ByValSize && ArgLocs[I].isRegLoc()) {
7542 SDValue Load = GetLoad(PtrVT, LoadOffset);
7543 MemOpChains.push_back(Elt: Load.getValue(R: 1));
7544 LoadOffset += PtrByteSize;
7545 const CCValAssign &ByValVA = ArgLocs[I++];
7546 assert(ByValVA.getValNo() == ValNo &&
7547 "Unexpected location for pass-by-value argument.");
7548 RegsToPass.push_back(Elt: std::make_pair(x: ByValVA.getLocReg(), y&: Load));
7549 }
7550
7551 if (LoadOffset == ByValSize)
7552 continue;
7553
7554 // There must be one more loc to handle the remainder.
7555 assert(ArgLocs[I].getValNo() == ValNo &&
7556 "Expected additional location for by-value argument.");
7557
7558 if (ArgLocs[I].isMemLoc()) {
7559 assert(LoadOffset < ByValSize && "Unexpected memloc for by-val arg.");
7560 const CCValAssign &ByValVA = ArgLocs[I++];
7561 ISD::ArgFlagsTy MemcpyFlags = Flags;
        // Only memcpy the bytes that are not passed in registers.
7563 MemcpyFlags.setByValSize(ByValSize - LoadOffset);
7564 Chain = CallSeqStart = createMemcpyOutsideCallSeq(
7565 Arg: (LoadOffset != 0) ? DAG.getObjectPtrOffset(
7566 SL: dl, Ptr: Arg, Offset: TypeSize::getFixed(ExactSize: LoadOffset))
7567 : Arg,
7568 PtrOff: DAG.getObjectPtrOffset(
7569 SL: dl, Ptr: StackPtr, Offset: TypeSize::getFixed(ExactSize: ByValVA.getLocMemOffset())),
7570 CallSeqStart, Flags: MemcpyFlags, DAG, dl);
7571 continue;
7572 }
7573
7574 // Initialize the final register residue.
7575 // Any residue that occupies the final by-val arg register must be
7576 // left-justified on AIX. Loads must be a power-of-2 size and cannot be
      // larger than the ByValSize. For example, a 7-byte by-val arg requires
      // 4-, 2-, and 1-byte loads.
7579 const unsigned ResidueBytes = ByValSize % PtrByteSize;
7580 assert(ResidueBytes != 0 && LoadOffset + PtrByteSize > ByValSize &&
7581 "Unexpected register residue for by-value argument.");
7582 SDValue ResidueVal;
7583 for (unsigned Bytes = 0; Bytes != ResidueBytes;) {
7584 const unsigned N = llvm::bit_floor(Value: ResidueBytes - Bytes);
7585 const MVT VT =
7586 N == 1 ? MVT::i8
7587 : ((N == 2) ? MVT::i16 : (N == 4 ? MVT::i32 : MVT::i64));
7588 SDValue Load = GetLoad(VT, LoadOffset);
7589 MemOpChains.push_back(Elt: Load.getValue(R: 1));
7590 LoadOffset += N;
7591 Bytes += N;
7592
      // By-val arguments are passed left-justified in registers.
7594 // Every load here needs to be shifted, otherwise a full register load
7595 // should have been used.
7596 assert(PtrVT.getSimpleVT().getSizeInBits() > (Bytes * 8) &&
7597 "Unexpected load emitted during handling of pass-by-value "
7598 "argument.");
7599 unsigned NumSHLBits = PtrVT.getSimpleVT().getSizeInBits() - (Bytes * 8);
7600 EVT ShiftAmountTy =
7601 getShiftAmountTy(LHSTy: Load->getValueType(ResNo: 0), DL: DAG.getDataLayout());
7602 SDValue SHLAmt = DAG.getConstant(Val: NumSHLBits, DL: dl, VT: ShiftAmountTy);
7603 SDValue ShiftedLoad =
7604 DAG.getNode(Opcode: ISD::SHL, DL: dl, VT: Load.getValueType(), N1: Load, N2: SHLAmt);
7605 ResidueVal = ResidueVal ? DAG.getNode(Opcode: ISD::OR, DL: dl, VT: PtrVT, N1: ResidueVal,
7606 N2: ShiftedLoad)
7607 : ShiftedLoad;
7608 }
7609
7610 const CCValAssign &ByValVA = ArgLocs[I++];
7611 RegsToPass.push_back(Elt: std::make_pair(x: ByValVA.getLocReg(), y&: ResidueVal));
7612 continue;
7613 }
7614
7615 CCValAssign &VA = ArgLocs[I++];
7616 const MVT LocVT = VA.getLocVT();
7617 const MVT ValVT = VA.getValVT();
7618
7619 switch (VA.getLocInfo()) {
7620 default:
7621 report_fatal_error(reason: "Unexpected argument extension type.");
7622 case CCValAssign::Full:
7623 break;
7624 case CCValAssign::ZExt:
7625 Arg = DAG.getNode(Opcode: ISD::ZERO_EXTEND, DL: dl, VT: VA.getLocVT(), Operand: Arg);
7626 break;
7627 case CCValAssign::SExt:
7628 Arg = DAG.getNode(Opcode: ISD::SIGN_EXTEND, DL: dl, VT: VA.getLocVT(), Operand: Arg);
7629 break;
7630 }
7631
7632 if (VA.isRegLoc() && !VA.needsCustom()) {
7633 RegsToPass.push_back(Elt: std::make_pair(x: VA.getLocReg(), y&: Arg));
7634 continue;
7635 }
7636
7637 // Vector arguments passed to VarArg functions need custom handling when
7638 // they are passed (at least partially) in GPRs.
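    // For example, a 16-byte vector vararg on 64-bit AIX is stored to its
    // stack slot and then reloaded into two 8-byte GPRs via the custom RegLocs
    // handled below; on 32-bit targets the reloads are 4 bytes each and up to
    // four GPRs may be used.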
7639 if (VA.isMemLoc() && VA.needsCustom() && ValVT.isVector()) {
7640 assert(CFlags.IsVarArg && "Custom MemLocs only used for Vector args.");
7641 // Store value to its stack slot.
7642 SDValue PtrOff =
7643 DAG.getConstant(Val: VA.getLocMemOffset(), DL: dl, VT: StackPtr.getValueType());
7644 PtrOff = DAG.getNode(Opcode: ISD::ADD, DL: dl, VT: PtrVT, N1: StackPtr, N2: PtrOff);
7645 SDValue Store =
7646 DAG.getStore(Chain, dl, Val: Arg, Ptr: PtrOff, PtrInfo: MachinePointerInfo());
7647 MemOpChains.push_back(Elt: Store);
7648 const unsigned OriginalValNo = VA.getValNo();
7649 // Then load the GPRs from the stack
7650 unsigned LoadOffset = 0;
7651 auto HandleCustomVecRegLoc = [&]() {
        assert(I != E && "Unexpected end of CCValAssigns.");
7653 assert(ArgLocs[I].isRegLoc() && ArgLocs[I].needsCustom() &&
7654 "Expected custom RegLoc.");
7655 CCValAssign RegVA = ArgLocs[I++];
7656 assert(RegVA.getValNo() == OriginalValNo &&
7657 "Custom MemLoc ValNo and custom RegLoc ValNo must match.");
7658 SDValue Add = DAG.getNode(Opcode: ISD::ADD, DL: dl, VT: PtrVT, N1: PtrOff,
7659 N2: DAG.getConstant(Val: LoadOffset, DL: dl, VT: PtrVT));
7660 SDValue Load = DAG.getLoad(VT: PtrVT, dl, Chain: Store, Ptr: Add, PtrInfo: MachinePointerInfo());
7661 MemOpChains.push_back(Elt: Load.getValue(R: 1));
7662 RegsToPass.push_back(Elt: std::make_pair(x: RegVA.getLocReg(), y&: Load));
7663 LoadOffset += PtrByteSize;
7664 };
7665
      // In 64-bit there will be exactly 2 custom RegLocs that follow, and in
      // 32-bit there will be 2 additional custom RegLocs if we are passing in
      // R9 and R10.
7669 HandleCustomVecRegLoc();
7670 HandleCustomVecRegLoc();
7671
7672 if (I != E && ArgLocs[I].isRegLoc() && ArgLocs[I].needsCustom() &&
7673 ArgLocs[I].getValNo() == OriginalValNo) {
7674 assert(!IsPPC64 &&
7675 "Only 2 custom RegLocs expected for 64-bit codegen.");
7676 HandleCustomVecRegLoc();
7677 HandleCustomVecRegLoc();
7678 }
7679
7680 continue;
7681 }
7682
7683 if (VA.isMemLoc()) {
7684 SDValue PtrOff =
7685 DAG.getConstant(Val: VA.getLocMemOffset(), DL: dl, VT: StackPtr.getValueType());
7686 PtrOff = DAG.getNode(Opcode: ISD::ADD, DL: dl, VT: PtrVT, N1: StackPtr, N2: PtrOff);
7687 MemOpChains.push_back(
7688 Elt: DAG.getStore(Chain, dl, Val: Arg, Ptr: PtrOff,
7689 PtrInfo: MachinePointerInfo::getStack(MF, Offset: VA.getLocMemOffset()),
7690 Alignment: Subtarget.getFrameLowering()->getStackAlign()));
7691
7692 continue;
7693 }
7694
7695 if (!ValVT.isFloatingPoint())
7696 report_fatal_error(
7697 reason: "Unexpected register handling for calling convention.");
7698
7699 // Custom handling is used for GPR initializations for vararg float
7700 // arguments.
7701 assert(VA.isRegLoc() && VA.needsCustom() && CFlags.IsVarArg &&
7702 LocVT.isInteger() &&
7703 "Custom register handling only expected for VarArg.");
7704
7705 SDValue ArgAsInt =
7706 DAG.getBitcast(VT: MVT::getIntegerVT(BitWidth: ValVT.getSizeInBits()), V: Arg);
7707
7708 if (Arg.getValueType().getStoreSize() == LocVT.getStoreSize())
7709 // f32 in 32-bit GPR
7710 // f64 in 64-bit GPR
7711 RegsToPass.push_back(Elt: std::make_pair(x: VA.getLocReg(), y&: ArgAsInt));
7712 else if (Arg.getValueType().getFixedSizeInBits() <
7713 LocVT.getFixedSizeInBits())
7714 // f32 in 64-bit GPR.
7715 RegsToPass.push_back(Elt: std::make_pair(
7716 x: VA.getLocReg(), y: DAG.getZExtOrTrunc(Op: ArgAsInt, DL: dl, VT: LocVT)));
7717 else {
7718 // f64 in two 32-bit GPRs
7719 // The 2 GPRs are marked custom and expected to be adjacent in ArgLocs.
7720 assert(Arg.getValueType() == MVT::f64 && CFlags.IsVarArg && !IsPPC64 &&
7721 "Unexpected custom register for argument!");
7722 CCValAssign &GPR1 = VA;
7723 SDValue MSWAsI64 = DAG.getNode(Opcode: ISD::SRL, DL: dl, VT: MVT::i64, N1: ArgAsInt,
7724 N2: DAG.getConstant(Val: 32, DL: dl, VT: MVT::i8));
7725 RegsToPass.push_back(Elt: std::make_pair(
7726 x: GPR1.getLocReg(), y: DAG.getZExtOrTrunc(Op: MSWAsI64, DL: dl, VT: MVT::i32)));
7727
7728 if (I != E) {
        // If only 1 GPR was available, there will be only one custom GPR and
        // the argument will also be passed in memory.
7731 CCValAssign &PeekArg = ArgLocs[I];
        if (PeekArg.isRegLoc() && PeekArg.getValNo() == VA.getValNo()) {
7733 assert(PeekArg.needsCustom() && "A second custom GPR is expected.");
7734 CCValAssign &GPR2 = ArgLocs[I++];
7735 RegsToPass.push_back(Elt: std::make_pair(
7736 x: GPR2.getLocReg(), y: DAG.getZExtOrTrunc(Op: ArgAsInt, DL: dl, VT: MVT::i32)));
7737 }
7738 }
7739 }
7740 }
7741
7742 if (!MemOpChains.empty())
7743 Chain = DAG.getNode(Opcode: ISD::TokenFactor, DL: dl, VT: MVT::Other, Ops: MemOpChains);
7744
7745 // For indirect calls, we need to save the TOC base to the stack for
7746 // restoration after the call.
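  // The save location is the ABI-defined TOC save slot relative to the stack
  // pointer (queried below via getTOCSaveOffset()); the call sequence reloads
  // the TOC pointer from this slot once the callee returns.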
7747 if (CFlags.IsIndirect) {
7748 assert(!CFlags.IsTailCall && "Indirect tail-calls not supported.");
7749 const MCRegister TOCBaseReg = Subtarget.getTOCPointerRegister();
7750 const MCRegister StackPtrReg = Subtarget.getStackPointerRegister();
7751 const MVT PtrVT = Subtarget.getScalarIntVT();
7752 const unsigned TOCSaveOffset =
7753 Subtarget.getFrameLowering()->getTOCSaveOffset();
7754
7755 setUsesTOCBasePtr(DAG);
7756 SDValue Val = DAG.getCopyFromReg(Chain, dl, Reg: TOCBaseReg, VT: PtrVT);
7757 SDValue PtrOff = DAG.getIntPtrConstant(Val: TOCSaveOffset, DL: dl);
7758 SDValue StackPtr = DAG.getRegister(Reg: StackPtrReg, VT: PtrVT);
7759 SDValue AddPtr = DAG.getNode(Opcode: ISD::ADD, DL: dl, VT: PtrVT, N1: StackPtr, N2: PtrOff);
7760 Chain = DAG.getStore(
7761 Chain: Val.getValue(R: 1), dl, Val, Ptr: AddPtr,
7762 PtrInfo: MachinePointerInfo::getStack(MF&: DAG.getMachineFunction(), Offset: TOCSaveOffset));
7763 }
7764
7765 // Build a sequence of copy-to-reg nodes chained together with token chain
7766 // and flag operands which copy the outgoing args into the appropriate regs.
7767 SDValue InGlue;
7768 for (auto Reg : RegsToPass) {
7769 Chain = DAG.getCopyToReg(Chain, dl, Reg: Reg.first, N: Reg.second, Glue: InGlue);
7770 InGlue = Chain.getValue(R: 1);
7771 }
7772
7773 const int SPDiff = 0;
7774 return FinishCall(CFlags, dl, DAG, RegsToPass, Glue: InGlue, Chain, CallSeqStart,
7775 Callee, SPDiff, NumBytes, Ins, InVals, CB);
7776}
7777
7778bool
7779PPCTargetLowering::CanLowerReturn(CallingConv::ID CallConv,
7780 MachineFunction &MF, bool isVarArg,
7781 const SmallVectorImpl<ISD::OutputArg> &Outs,
7782 LLVMContext &Context,
7783 const Type *RetTy) const {
7784 SmallVector<CCValAssign, 16> RVLocs;
7785 CCState CCInfo(CallConv, isVarArg, MF, RVLocs, Context);
7786 return CCInfo.CheckReturn(
7787 Outs, Fn: (Subtarget.isSVR4ABI() && CallConv == CallingConv::Cold)
7788 ? RetCC_PPC_Cold
7789 : RetCC_PPC);
7790}
7791
7792SDValue
7793PPCTargetLowering::LowerReturn(SDValue Chain, CallingConv::ID CallConv,
7794 bool isVarArg,
7795 const SmallVectorImpl<ISD::OutputArg> &Outs,
7796 const SmallVectorImpl<SDValue> &OutVals,
7797 const SDLoc &dl, SelectionDAG &DAG) const {
7798 SmallVector<CCValAssign, 16> RVLocs;
7799 CCState CCInfo(CallConv, isVarArg, DAG.getMachineFunction(), RVLocs,
7800 *DAG.getContext());
7801 CCInfo.AnalyzeReturn(Outs,
7802 Fn: (Subtarget.isSVR4ABI() && CallConv == CallingConv::Cold)
7803 ? RetCC_PPC_Cold
7804 : RetCC_PPC);
7805
7806 SDValue Glue;
7807 SmallVector<SDValue, 4> RetOps(1, Chain);
7808
7809 // Copy the result values into the output registers.
7810 for (unsigned i = 0, RealResIdx = 0; i != RVLocs.size(); ++i, ++RealResIdx) {
7811 CCValAssign &VA = RVLocs[i];
7812 assert(VA.isRegLoc() && "Can only return in registers!");
7813
7814 SDValue Arg = OutVals[RealResIdx];
7815
7816 switch (VA.getLocInfo()) {
7817 default: llvm_unreachable("Unknown loc info!");
7818 case CCValAssign::Full: break;
7819 case CCValAssign::AExt:
7820 Arg = DAG.getNode(Opcode: ISD::ANY_EXTEND, DL: dl, VT: VA.getLocVT(), Operand: Arg);
7821 break;
7822 case CCValAssign::ZExt:
7823 Arg = DAG.getNode(Opcode: ISD::ZERO_EXTEND, DL: dl, VT: VA.getLocVT(), Operand: Arg);
7824 break;
7825 case CCValAssign::SExt:
7826 Arg = DAG.getNode(Opcode: ISD::SIGN_EXTEND, DL: dl, VT: VA.getLocVT(), Operand: Arg);
7827 break;
7828 }
7829 if (Subtarget.hasSPE() && VA.getLocVT() == MVT::f64) {
7830 bool isLittleEndian = Subtarget.isLittleEndian();
7831 // Legalize ret f64 -> ret 2 x i32.
7832 SDValue SVal =
7833 DAG.getNode(Opcode: PPCISD::EXTRACT_SPE, DL: dl, VT: MVT::i32, N1: Arg,
7834 N2: DAG.getIntPtrConstant(Val: isLittleEndian ? 0 : 1, DL: dl));
7835 Chain = DAG.getCopyToReg(Chain, dl, Reg: VA.getLocReg(), N: SVal, Glue);
7836 RetOps.push_back(Elt: DAG.getRegister(Reg: VA.getLocReg(), VT: VA.getLocVT()));
7837 SVal = DAG.getNode(Opcode: PPCISD::EXTRACT_SPE, DL: dl, VT: MVT::i32, N1: Arg,
7838 N2: DAG.getIntPtrConstant(Val: isLittleEndian ? 1 : 0, DL: dl));
7839 Glue = Chain.getValue(R: 1);
7840 VA = RVLocs[++i]; // skip ahead to next loc
7841 Chain = DAG.getCopyToReg(Chain, dl, Reg: VA.getLocReg(), N: SVal, Glue);
7842 } else
7843 Chain = DAG.getCopyToReg(Chain, dl, Reg: VA.getLocReg(), N: Arg, Glue);
7844 Glue = Chain.getValue(R: 1);
7845 RetOps.push_back(Elt: DAG.getRegister(Reg: VA.getLocReg(), VT: VA.getLocVT()));
7846 }
7847
7848 RetOps[0] = Chain; // Update chain.
7849
7850 // Add the glue if we have it.
7851 if (Glue.getNode())
7852 RetOps.push_back(Elt: Glue);
7853
7854 return DAG.getNode(Opcode: PPCISD::RET_GLUE, DL: dl, VT: MVT::Other, Ops: RetOps);
7855}
7856
7857SDValue
7858PPCTargetLowering::LowerGET_DYNAMIC_AREA_OFFSET(SDValue Op,
7859 SelectionDAG &DAG) const {
7860 SDLoc dl(Op);
7861
7862 // Get the correct type for integers.
7863 EVT IntVT = Op.getValueType();
7864
7865 // Get the inputs.
7866 SDValue Chain = Op.getOperand(i: 0);
7867 SDValue FPSIdx = getFramePointerFrameIndex(DAG);
7868 // Build a DYNAREAOFFSET node.
7869 SDValue Ops[2] = {Chain, FPSIdx};
7870 SDVTList VTs = DAG.getVTList(VT: IntVT);
7871 return DAG.getNode(Opcode: PPCISD::DYNAREAOFFSET, DL: dl, VTList: VTs, Ops);
7872}
7873
7874SDValue PPCTargetLowering::LowerSTACKRESTORE(SDValue Op,
7875 SelectionDAG &DAG) const {
7876 // When we pop the dynamic allocation we need to restore the SP link.
7877 SDLoc dl(Op);
7878
7879 // Get the correct type for pointers.
7880 EVT PtrVT = getPointerTy(DL: DAG.getDataLayout());
7881
7882 // Construct the stack pointer operand.
7883 bool isPPC64 = Subtarget.isPPC64();
7884 unsigned SP = isPPC64 ? PPC::X1 : PPC::R1;
7885 SDValue StackPtr = DAG.getRegister(Reg: SP, VT: PtrVT);
7886
7887 // Get the operands for the STACKRESTORE.
7888 SDValue Chain = Op.getOperand(i: 0);
7889 SDValue SaveSP = Op.getOperand(i: 1);
7890
7891 // Load the old link SP.
7892 SDValue LoadLinkSP =
7893 DAG.getLoad(VT: PtrVT, dl, Chain, Ptr: StackPtr, PtrInfo: MachinePointerInfo());
7894
7895 // Restore the stack pointer.
7896 Chain = DAG.getCopyToReg(Chain: LoadLinkSP.getValue(R: 1), dl, Reg: SP, N: SaveSP);
7897
7898 // Store the old link SP.
7899 return DAG.getStore(Chain, dl, Val: LoadLinkSP, Ptr: StackPtr, PtrInfo: MachinePointerInfo());
7900}
7901
7902SDValue PPCTargetLowering::getReturnAddrFrameIndex(SelectionDAG &DAG) const {
7903 MachineFunction &MF = DAG.getMachineFunction();
7904 bool isPPC64 = Subtarget.isPPC64();
7905 EVT PtrVT = getPointerTy(DL: MF.getDataLayout());
7906
  // Get the current return address save index.
7909 PPCFunctionInfo *FI = MF.getInfo<PPCFunctionInfo>();
7910 int RASI = FI->getReturnAddrSaveIndex();
7911
  // If the return address save index hasn't been defined yet, create it.
  if (!RASI) {
    // Find out the fixed offset of the return address (LR) save area.
    int LROffset = Subtarget.getFrameLowering()->getReturnSaveOffset();
    // Allocate the frame index for the return address save area.
7917 RASI = MF.getFrameInfo().CreateFixedObject(Size: isPPC64? 8 : 4, SPOffset: LROffset, IsImmutable: false);
7918 // Save the result.
7919 FI->setReturnAddrSaveIndex(RASI);
7920 }
7921 return DAG.getFrameIndex(FI: RASI, VT: PtrVT);
7922}
7923
7924SDValue
7925PPCTargetLowering::getFramePointerFrameIndex(SelectionDAG & DAG) const {
7926 MachineFunction &MF = DAG.getMachineFunction();
7927 bool isPPC64 = Subtarget.isPPC64();
7928 EVT PtrVT = getPointerTy(DL: MF.getDataLayout());
7929
7930 // Get current frame pointer save index. The users of this index will be
7931 // primarily DYNALLOC instructions.
7932 PPCFunctionInfo *FI = MF.getInfo<PPCFunctionInfo>();
7933 int FPSI = FI->getFramePointerSaveIndex();
7934
  // If the frame pointer save index hasn't been defined yet, create it.
  if (!FPSI) {
    // Find out the fixed offset of the frame pointer save area.
    int FPOffset = Subtarget.getFrameLowering()->getFramePointerSaveOffset();
    // Allocate the frame index for the frame pointer save area.
7940 FPSI = MF.getFrameInfo().CreateFixedObject(Size: isPPC64? 8 : 4, SPOffset: FPOffset, IsImmutable: true);
7941 // Save the result.
7942 FI->setFramePointerSaveIndex(FPSI);
7943 }
7944 return DAG.getFrameIndex(FI: FPSI, VT: PtrVT);
7945}
7946
7947SDValue PPCTargetLowering::LowerDYNAMIC_STACKALLOC(SDValue Op,
7948 SelectionDAG &DAG) const {
7949 MachineFunction &MF = DAG.getMachineFunction();
7950 // Get the inputs.
7951 SDValue Chain = Op.getOperand(i: 0);
7952 SDValue Size = Op.getOperand(i: 1);
7953 SDLoc dl(Op);
7954
7955 // Get the correct type for pointers.
7956 EVT PtrVT = getPointerTy(DL: DAG.getDataLayout());
7957 // Negate the size.
7958 SDValue NegSize = DAG.getNode(Opcode: ISD::SUB, DL: dl, VT: PtrVT,
7959 N1: DAG.getConstant(Val: 0, DL: dl, VT: PtrVT), N2: Size);
7960 // Construct a node for the frame pointer save index.
7961 SDValue FPSIdx = getFramePointerFrameIndex(DAG);
7962 SDValue Ops[3] = { Chain, NegSize, FPSIdx };
7963 SDVTList VTs = DAG.getVTList(VT1: PtrVT, VT2: MVT::Other);
7964 if (hasInlineStackProbe(MF))
7965 return DAG.getNode(Opcode: PPCISD::PROBED_ALLOCA, DL: dl, VTList: VTs, Ops);
7966 return DAG.getNode(Opcode: PPCISD::DYNALLOC, DL: dl, VTList: VTs, Ops);
7967}
7968
7969SDValue PPCTargetLowering::LowerEH_DWARF_CFA(SDValue Op,
7970 SelectionDAG &DAG) const {
7971 MachineFunction &MF = DAG.getMachineFunction();
7972
7973 bool isPPC64 = Subtarget.isPPC64();
7974 EVT PtrVT = getPointerTy(DL: DAG.getDataLayout());
7975
7976 int FI = MF.getFrameInfo().CreateFixedObject(Size: isPPC64 ? 8 : 4, SPOffset: 0, IsImmutable: false);
7977 return DAG.getFrameIndex(FI, VT: PtrVT);
7978}
7979
7980SDValue PPCTargetLowering::lowerEH_SJLJ_SETJMP(SDValue Op,
7981 SelectionDAG &DAG) const {
7982 SDLoc DL(Op);
7983 return DAG.getNode(Opcode: PPCISD::EH_SJLJ_SETJMP, DL,
7984 VTList: DAG.getVTList(VT1: MVT::i32, VT2: MVT::Other),
7985 N1: Op.getOperand(i: 0), N2: Op.getOperand(i: 1));
7986}
7987
7988SDValue PPCTargetLowering::lowerEH_SJLJ_LONGJMP(SDValue Op,
7989 SelectionDAG &DAG) const {
7990 SDLoc DL(Op);
7991 return DAG.getNode(Opcode: PPCISD::EH_SJLJ_LONGJMP, DL, VT: MVT::Other,
7992 N1: Op.getOperand(i: 0), N2: Op.getOperand(i: 1));
7993}
7994
7995SDValue PPCTargetLowering::LowerLOAD(SDValue Op, SelectionDAG &DAG) const {
7996 if (Op.getValueType().isVector())
7997 return LowerVectorLoad(Op, DAG);
7998
7999 assert(Op.getValueType() == MVT::i1 &&
8000 "Custom lowering only for i1 loads");
8001
  // First, load the 8-bit value into a GPR-width integer, then truncate it to
  // 1 bit.
8003
8004 SDLoc dl(Op);
8005 LoadSDNode *LD = cast<LoadSDNode>(Val&: Op);
8006
8007 SDValue Chain = LD->getChain();
8008 SDValue BasePtr = LD->getBasePtr();
8009 MachineMemOperand *MMO = LD->getMemOperand();
8010
8011 SDValue NewLD =
8012 DAG.getExtLoad(ExtType: ISD::EXTLOAD, dl, VT: getPointerTy(DL: DAG.getDataLayout()), Chain,
8013 Ptr: BasePtr, MemVT: MVT::i8, MMO);
8014 SDValue Result = DAG.getNode(Opcode: ISD::TRUNCATE, DL: dl, VT: MVT::i1, Operand: NewLD);
8015
8016 SDValue Ops[] = { Result, SDValue(NewLD.getNode(), 1) };
8017 return DAG.getMergeValues(Ops, dl);
8018}
8019
8020SDValue PPCTargetLowering::LowerSTORE(SDValue Op, SelectionDAG &DAG) const {
8021 if (Op.getOperand(i: 1).getValueType().isVector())
8022 return LowerVectorStore(Op, DAG);
8023
8024 assert(Op.getOperand(1).getValueType() == MVT::i1 &&
8025 "Custom lowering only for i1 stores");
8026
  // First, zero-extend to GPR width, then use a truncating store down to
  // 8 bits.
8028
8029 SDLoc dl(Op);
8030 StoreSDNode *ST = cast<StoreSDNode>(Val&: Op);
8031
8032 SDValue Chain = ST->getChain();
8033 SDValue BasePtr = ST->getBasePtr();
8034 SDValue Value = ST->getValue();
8035 MachineMemOperand *MMO = ST->getMemOperand();
8036
8037 Value = DAG.getNode(Opcode: ISD::ZERO_EXTEND, DL: dl, VT: getPointerTy(DL: DAG.getDataLayout()),
8038 Operand: Value);
8039 return DAG.getTruncStore(Chain, dl, Val: Value, Ptr: BasePtr, SVT: MVT::i8, MMO);
8040}
8041
8042// FIXME: Remove this once the ANDI glue bug is fixed:
8043SDValue PPCTargetLowering::LowerTRUNCATE(SDValue Op, SelectionDAG &DAG) const {
8044 assert(Op.getValueType() == MVT::i1 &&
8045 "Custom lowering only for i1 results");
8046
8047 SDLoc DL(Op);
8048 return DAG.getNode(Opcode: PPCISD::ANDI_rec_1_GT_BIT, DL, VT: MVT::i1, Operand: Op.getOperand(i: 0));
8049}
8050
8051SDValue PPCTargetLowering::LowerTRUNCATEVector(SDValue Op,
8052 SelectionDAG &DAG) const {
8053
8054 // Implements a vector truncate that fits in a vector register as a shuffle.
8055 // We want to legalize vector truncates down to where the source fits in
8056 // a vector register (and target is therefore smaller than vector register
8057 // size). At that point legalization will try to custom lower the sub-legal
8058 // result and get here - where we can contain the truncate as a single target
8059 // operation.
8060
  // For example, a trunc <2 x i16> to <2 x i8> could be visualized as follows:
  // <MSB1|LSB1, MSB2|LSB2> to <LSB1, LSB2>
  //
  // We will implement it for big-endian ordering as this (where u/uu denotes
  // an undefined element):
8066 // < MSB1|LSB1, MSB2|LSB2, uu, uu, uu, uu, uu, uu> to
8067 // < LSB1, LSB2, u, u, u, u, u, u, u, u, u, u, u, u, u, u>
8068 //
8069 // The same operation in little-endian ordering will be:
8070 // <uu, uu, uu, uu, uu, uu, LSB2|MSB2, LSB1|MSB1> to
8071 // <u, u, u, u, u, u, u, u, u, u, u, u, u, u, LSB2, LSB1>
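  //
  // As a concrete case, truncating v4i32 to v4i16 bitcasts the source to
  // v8i16 and shuffles with mask <1,3,5,7,u,u,u,u> for big-endian (or
  // <0,2,4,6,u,u,u,u> for little-endian) to gather the low half of each word
  // into the first four lanes.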
8072
8073 EVT TrgVT = Op.getValueType();
8074 assert(TrgVT.isVector() && "Vector type expected.");
8075 unsigned TrgNumElts = TrgVT.getVectorNumElements();
8076 EVT EltVT = TrgVT.getVectorElementType();
8077 if (!isOperationCustom(Op: Op.getOpcode(), VT: TrgVT) ||
8078 TrgVT.getSizeInBits() > 128 || !isPowerOf2_32(Value: TrgNumElts) ||
8079 !llvm::has_single_bit<uint32_t>(Value: EltVT.getSizeInBits()))
8080 return SDValue();
8081
8082 SDValue N1 = Op.getOperand(i: 0);
8083 EVT SrcVT = N1.getValueType();
8084 unsigned SrcSize = SrcVT.getSizeInBits();
8085 if (SrcSize > 256 || !isPowerOf2_32(Value: SrcVT.getVectorNumElements()) ||
8086 !llvm::has_single_bit<uint32_t>(
8087 Value: SrcVT.getVectorElementType().getSizeInBits()))
8088 return SDValue();
8089 if (SrcSize == 256 && SrcVT.getVectorNumElements() < 2)
8090 return SDValue();
8091
8092 unsigned WideNumElts = 128 / EltVT.getSizeInBits();
8093 EVT WideVT = EVT::getVectorVT(Context&: *DAG.getContext(), VT: EltVT, NumElements: WideNumElts);
8094
8095 SDLoc DL(Op);
8096 SDValue Op1, Op2;
8097 if (SrcSize == 256) {
8098 EVT VecIdxTy = getVectorIdxTy(DL: DAG.getDataLayout());
8099 EVT SplitVT =
8100 N1.getValueType().getHalfNumVectorElementsVT(Context&: *DAG.getContext());
8101 unsigned SplitNumElts = SplitVT.getVectorNumElements();
8102 Op1 = DAG.getNode(Opcode: ISD::EXTRACT_SUBVECTOR, DL, VT: SplitVT, N1,
8103 N2: DAG.getConstant(Val: 0, DL, VT: VecIdxTy));
8104 Op2 = DAG.getNode(Opcode: ISD::EXTRACT_SUBVECTOR, DL, VT: SplitVT, N1,
8105 N2: DAG.getConstant(Val: SplitNumElts, DL, VT: VecIdxTy));
8106 }
8107 else {
8108 Op1 = SrcSize == 128 ? N1 : widenVec(DAG, Vec: N1, dl: DL);
8109 Op2 = DAG.getUNDEF(VT: WideVT);
8110 }
8111
8112 // First list the elements we want to keep.
8113 unsigned SizeMult = SrcSize / TrgVT.getSizeInBits();
8114 SmallVector<int, 16> ShuffV;
8115 if (Subtarget.isLittleEndian())
8116 for (unsigned i = 0; i < TrgNumElts; ++i)
8117 ShuffV.push_back(Elt: i * SizeMult);
8118 else
8119 for (unsigned i = 1; i <= TrgNumElts; ++i)
8120 ShuffV.push_back(Elt: i * SizeMult - 1);
8121
8122 // Populate the remaining elements with undefs.
8123 for (unsigned i = TrgNumElts; i < WideNumElts; ++i)
8124 // ShuffV.push_back(i + WideNumElts);
8125 ShuffV.push_back(Elt: WideNumElts + 1);
8126
8127 Op1 = DAG.getNode(Opcode: ISD::BITCAST, DL, VT: WideVT, Operand: Op1);
8128 Op2 = DAG.getNode(Opcode: ISD::BITCAST, DL, VT: WideVT, Operand: Op2);
8129 return DAG.getVectorShuffle(VT: WideVT, dl: DL, N1: Op1, N2: Op2, Mask: ShuffV);
8130}
8131
/// LowerSELECT_CC - Lower floating-point select_cc's into an fsel instruction
/// when possible.
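/// For example, (select_cc lhs, +0.0, tv, fv, SETGE) maps to a single
/// (fsel lhs, tv, fv): fsel yields its second operand when the first is
/// greater than or equal to zero and its third operand otherwise.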
8134SDValue PPCTargetLowering::LowerSELECT_CC(SDValue Op, SelectionDAG &DAG) const {
8135 ISD::CondCode CC = cast<CondCodeSDNode>(Val: Op.getOperand(i: 4))->get();
8136 EVT ResVT = Op.getValueType();
8137 EVT CmpVT = Op.getOperand(i: 0).getValueType();
8138 SDValue LHS = Op.getOperand(i: 0), RHS = Op.getOperand(i: 1);
8139 SDValue TV = Op.getOperand(i: 2), FV = Op.getOperand(i: 3);
8140 SDLoc dl(Op);
8141
  // Without power9-vector, we don't have a native instruction for f128
  // comparisons, so the setcc is lowered to a libcall. The following
  // transformation is needed:
  // select_cc lhs, rhs, tv, fv, cc -> select_cc (setcc lhs, rhs, cc), 0, tv, fv, NE
8145 if (!Subtarget.hasP9Vector() && CmpVT == MVT::f128) {
8146 SDValue Z = DAG.getSetCC(
8147 DL: dl, VT: getSetCCResultType(DL: DAG.getDataLayout(), C&: *DAG.getContext(), VT: CmpVT),
8148 LHS, RHS, Cond: CC);
8149 SDValue Zero = DAG.getConstant(Val: 0, DL: dl, VT: Z.getValueType());
8150 return DAG.getSelectCC(DL: dl, LHS: Z, RHS: Zero, True: TV, False: FV, Cond: ISD::SETNE);
8151 }
8152
8153 // Not FP, or using SPE? Not a fsel.
8154 if (!CmpVT.isFloatingPoint() || !TV.getValueType().isFloatingPoint() ||
8155 Subtarget.hasSPE())
8156 return Op;
8157
8158 SDNodeFlags Flags = Op.getNode()->getFlags();
8159
8160 // We have xsmaxc[dq]p/xsminc[dq]p which are OK to emit even in the
8161 // presence of infinities.
8162 if (Subtarget.hasP9Vector() && LHS == TV && RHS == FV) {
8163 switch (CC) {
8164 default:
8165 break;
8166 case ISD::SETOGT:
8167 case ISD::SETGT:
8168 return DAG.getNode(Opcode: PPCISD::XSMAXC, DL: dl, VT: Op.getValueType(), N1: LHS, N2: RHS);
8169 case ISD::SETOLT:
8170 case ISD::SETLT:
8171 return DAG.getNode(Opcode: PPCISD::XSMINC, DL: dl, VT: Op.getValueType(), N1: LHS, N2: RHS);
8172 }
8173 }
8174
8175 // We might be able to do better than this under some circumstances, but in
8176 // general, fsel-based lowering of select is a finite-math-only optimization.
8177 // For more information, see section F.3 of the 2.06 ISA specification.
  // With ISA 3.0's f128 there is likewise no fsel form, so f128 results also
  // bail out here.
8179 if ((!DAG.getTarget().Options.NoInfsFPMath && !Flags.hasNoInfs()) ||
8180 (!DAG.getTarget().Options.NoNaNsFPMath && !Flags.hasNoNaNs()) ||
8181 ResVT == MVT::f128)
8182 return Op;
8183
8184 // If the RHS of the comparison is a 0.0, we don't need to do the
8185 // subtraction at all.
8186 SDValue Sel1;
8187 if (isFloatingPointZero(Op: RHS))
8188 switch (CC) {
8189 default: break; // SETUO etc aren't handled by fsel.
8190 case ISD::SETNE:
8191 std::swap(a&: TV, b&: FV);
8192 [[fallthrough]];
8193 case ISD::SETEQ:
8194 if (LHS.getValueType() == MVT::f32) // Comparison is always 64-bits
8195 LHS = DAG.getNode(Opcode: ISD::FP_EXTEND, DL: dl, VT: MVT::f64, Operand: LHS);
8196 Sel1 = DAG.getNode(Opcode: PPCISD::FSEL, DL: dl, VT: ResVT, N1: LHS, N2: TV, N3: FV);
8197 if (Sel1.getValueType() == MVT::f32) // Comparison is always 64-bits
8198 Sel1 = DAG.getNode(Opcode: ISD::FP_EXTEND, DL: dl, VT: MVT::f64, Operand: Sel1);
8199 return DAG.getNode(Opcode: PPCISD::FSEL, DL: dl, VT: ResVT,
8200 N1: DAG.getNode(Opcode: ISD::FNEG, DL: dl, VT: MVT::f64, Operand: LHS), N2: Sel1, N3: FV);
8201 case ISD::SETULT:
8202 case ISD::SETLT:
8203 std::swap(a&: TV, b&: FV); // fsel is natively setge, swap operands for setlt
8204 [[fallthrough]];
8205 case ISD::SETOGE:
8206 case ISD::SETGE:
8207 if (LHS.getValueType() == MVT::f32) // Comparison is always 64-bits
8208 LHS = DAG.getNode(Opcode: ISD::FP_EXTEND, DL: dl, VT: MVT::f64, Operand: LHS);
8209 return DAG.getNode(Opcode: PPCISD::FSEL, DL: dl, VT: ResVT, N1: LHS, N2: TV, N3: FV);
8210 case ISD::SETUGT:
8211 case ISD::SETGT:
      std::swap(a&: TV, b&: FV); // fsel is natively setge, swap operands for setgt
8213 [[fallthrough]];
8214 case ISD::SETOLE:
8215 case ISD::SETLE:
8216 if (LHS.getValueType() == MVT::f32) // Comparison is always 64-bits
8217 LHS = DAG.getNode(Opcode: ISD::FP_EXTEND, DL: dl, VT: MVT::f64, Operand: LHS);
8218 return DAG.getNode(Opcode: PPCISD::FSEL, DL: dl, VT: ResVT,
8219 N1: DAG.getNode(Opcode: ISD::FNEG, DL: dl, VT: MVT::f64, Operand: LHS), N2: TV, N3: FV);
8220 }
8221
8222 SDValue Cmp;
8223 switch (CC) {
8224 default: break; // SETUO etc aren't handled by fsel.
8225 case ISD::SETNE:
8226 std::swap(a&: TV, b&: FV);
8227 [[fallthrough]];
8228 case ISD::SETEQ:
8229 Cmp = DAG.getNode(Opcode: ISD::FSUB, DL: dl, VT: CmpVT, N1: LHS, N2: RHS, Flags);
8230 if (Cmp.getValueType() == MVT::f32) // Comparison is always 64-bits
8231 Cmp = DAG.getNode(Opcode: ISD::FP_EXTEND, DL: dl, VT: MVT::f64, Operand: Cmp);
8232 Sel1 = DAG.getNode(Opcode: PPCISD::FSEL, DL: dl, VT: ResVT, N1: Cmp, N2: TV, N3: FV);
8233 if (Sel1.getValueType() == MVT::f32) // Comparison is always 64-bits
8234 Sel1 = DAG.getNode(Opcode: ISD::FP_EXTEND, DL: dl, VT: MVT::f64, Operand: Sel1);
8235 return DAG.getNode(Opcode: PPCISD::FSEL, DL: dl, VT: ResVT,
8236 N1: DAG.getNode(Opcode: ISD::FNEG, DL: dl, VT: MVT::f64, Operand: Cmp), N2: Sel1, N3: FV);
8237 case ISD::SETULT:
8238 case ISD::SETLT:
8239 Cmp = DAG.getNode(Opcode: ISD::FSUB, DL: dl, VT: CmpVT, N1: LHS, N2: RHS, Flags);
8240 if (Cmp.getValueType() == MVT::f32) // Comparison is always 64-bits
8241 Cmp = DAG.getNode(Opcode: ISD::FP_EXTEND, DL: dl, VT: MVT::f64, Operand: Cmp);
8242 return DAG.getNode(Opcode: PPCISD::FSEL, DL: dl, VT: ResVT, N1: Cmp, N2: FV, N3: TV);
8243 case ISD::SETOGE:
8244 case ISD::SETGE:
8245 Cmp = DAG.getNode(Opcode: ISD::FSUB, DL: dl, VT: CmpVT, N1: LHS, N2: RHS, Flags);
8246 if (Cmp.getValueType() == MVT::f32) // Comparison is always 64-bits
8247 Cmp = DAG.getNode(Opcode: ISD::FP_EXTEND, DL: dl, VT: MVT::f64, Operand: Cmp);
8248 return DAG.getNode(Opcode: PPCISD::FSEL, DL: dl, VT: ResVT, N1: Cmp, N2: TV, N3: FV);
8249 case ISD::SETUGT:
8250 case ISD::SETGT:
8251 Cmp = DAG.getNode(Opcode: ISD::FSUB, DL: dl, VT: CmpVT, N1: RHS, N2: LHS, Flags);
8252 if (Cmp.getValueType() == MVT::f32) // Comparison is always 64-bits
8253 Cmp = DAG.getNode(Opcode: ISD::FP_EXTEND, DL: dl, VT: MVT::f64, Operand: Cmp);
8254 return DAG.getNode(Opcode: PPCISD::FSEL, DL: dl, VT: ResVT, N1: Cmp, N2: FV, N3: TV);
8255 case ISD::SETOLE:
8256 case ISD::SETLE:
8257 Cmp = DAG.getNode(Opcode: ISD::FSUB, DL: dl, VT: CmpVT, N1: RHS, N2: LHS, Flags);
8258 if (Cmp.getValueType() == MVT::f32) // Comparison is always 64-bits
8259 Cmp = DAG.getNode(Opcode: ISD::FP_EXTEND, DL: dl, VT: MVT::f64, Operand: Cmp);
8260 return DAG.getNode(Opcode: PPCISD::FSEL, DL: dl, VT: ResVT, N1: Cmp, N2: TV, N3: FV);
8261 }
8262 return Op;
8263}
8264
8265static unsigned getPPCStrictOpcode(unsigned Opc) {
8266 switch (Opc) {
8267 default:
8268 llvm_unreachable("No strict version of this opcode!");
8269 case PPCISD::FCTIDZ:
8270 return PPCISD::STRICT_FCTIDZ;
8271 case PPCISD::FCTIWZ:
8272 return PPCISD::STRICT_FCTIWZ;
8273 case PPCISD::FCTIDUZ:
8274 return PPCISD::STRICT_FCTIDUZ;
8275 case PPCISD::FCTIWUZ:
8276 return PPCISD::STRICT_FCTIWUZ;
8277 case PPCISD::FCFID:
8278 return PPCISD::STRICT_FCFID;
8279 case PPCISD::FCFIDU:
8280 return PPCISD::STRICT_FCFIDU;
8281 case PPCISD::FCFIDS:
8282 return PPCISD::STRICT_FCFIDS;
8283 case PPCISD::FCFIDUS:
8284 return PPCISD::STRICT_FCFIDUS;
8285 }
8286}
8287
8288static SDValue convertFPToInt(SDValue Op, SelectionDAG &DAG,
8289 const PPCSubtarget &Subtarget) {
8290 SDLoc dl(Op);
8291 bool IsStrict = Op->isStrictFPOpcode();
8292 bool IsSigned = Op.getOpcode() == ISD::FP_TO_SINT ||
8293 Op.getOpcode() == ISD::STRICT_FP_TO_SINT;
8294
8295 // TODO: Any other flags to propagate?
8296 SDNodeFlags Flags;
8297 Flags.setNoFPExcept(Op->getFlags().hasNoFPExcept());
8298
8299 // For strict nodes, source is the second operand.
8300 SDValue Src = Op.getOperand(i: IsStrict ? 1 : 0);
8301 SDValue Chain = IsStrict ? Op.getOperand(i: 0) : SDValue();
8302 MVT DestTy = Op.getSimpleValueType();
8303 assert(Src.getValueType().isFloatingPoint() &&
8304 (DestTy == MVT::i8 || DestTy == MVT::i16 || DestTy == MVT::i32 ||
8305 DestTy == MVT::i64) &&
8306 "Invalid FP_TO_INT types");
8307 if (Src.getValueType() == MVT::f32) {
8308 if (IsStrict) {
8309 Src =
8310 DAG.getNode(Opcode: ISD::STRICT_FP_EXTEND, DL: dl,
8311 VTList: DAG.getVTList(VT1: MVT::f64, VT2: MVT::Other), Ops: {Chain, Src}, Flags);
8312 Chain = Src.getValue(R: 1);
8313 } else
8314 Src = DAG.getNode(Opcode: ISD::FP_EXTEND, DL: dl, VT: MVT::f64, Operand: Src);
8315 }
8316 if ((DestTy == MVT::i8 || DestTy == MVT::i16) && Subtarget.hasP9Vector())
8317 DestTy = Subtarget.getScalarIntVT();
8318 unsigned Opc = ISD::DELETED_NODE;
8319 switch (DestTy.SimpleTy) {
8320 default: llvm_unreachable("Unhandled FP_TO_INT type in custom expander!");
8321 case MVT::i32:
8322 Opc = IsSigned ? PPCISD::FCTIWZ
8323 : (Subtarget.hasFPCVT() ? PPCISD::FCTIWUZ : PPCISD::FCTIDZ);
8324 break;
8325 case MVT::i64:
8326 assert((IsSigned || Subtarget.hasFPCVT()) &&
8327 "i64 FP_TO_UINT is supported only with FPCVT");
8328 Opc = IsSigned ? PPCISD::FCTIDZ : PPCISD::FCTIDUZ;
8329 }
8330 EVT ConvTy = Src.getValueType() == MVT::f128 ? MVT::f128 : MVT::f64;
8331 SDValue Conv;
8332 if (IsStrict) {
8333 Opc = getPPCStrictOpcode(Opc);
8334 Conv = DAG.getNode(Opcode: Opc, DL: dl, VTList: DAG.getVTList(VT1: ConvTy, VT2: MVT::Other), Ops: {Chain, Src},
8335 Flags);
8336 } else {
8337 Conv = DAG.getNode(Opcode: Opc, DL: dl, VT: ConvTy, Operand: Src);
8338 }
8339 return Conv;
8340}
8341
8342void PPCTargetLowering::LowerFP_TO_INTForReuse(SDValue Op, ReuseLoadInfo &RLI,
8343 SelectionDAG &DAG,
8344 const SDLoc &dl) const {
8345 SDValue Tmp = convertFPToInt(Op, DAG, Subtarget);
8346 bool IsSigned = Op.getOpcode() == ISD::FP_TO_SINT ||
8347 Op.getOpcode() == ISD::STRICT_FP_TO_SINT;
8348 bool IsStrict = Op->isStrictFPOpcode();
8349
8350 // Convert the FP value to an int value through memory.
8351 bool i32Stack = Op.getValueType() == MVT::i32 && Subtarget.hasSTFIWX() &&
8352 (IsSigned || Subtarget.hasFPCVT());
8353 SDValue FIPtr = DAG.CreateStackTemporary(VT: i32Stack ? MVT::i32 : MVT::f64);
8354 int FI = cast<FrameIndexSDNode>(Val&: FIPtr)->getIndex();
8355 MachinePointerInfo MPI =
8356 MachinePointerInfo::getFixedStack(MF&: DAG.getMachineFunction(), FI);
8357
8358 // Emit a store to the stack slot.
8359 SDValue Chain = IsStrict ? Tmp.getValue(R: 1) : DAG.getEntryNode();
8360 Align Alignment(DAG.getEVTAlign(MemoryVT: Tmp.getValueType()));
8361 if (i32Stack) {
8362 MachineFunction &MF = DAG.getMachineFunction();
8363 Alignment = Align(4);
8364 MachineMemOperand *MMO =
8365 MF.getMachineMemOperand(PtrInfo: MPI, F: MachineMemOperand::MOStore, Size: 4, BaseAlignment: Alignment);
8366 SDValue Ops[] = { Chain, Tmp, FIPtr };
8367 Chain = DAG.getMemIntrinsicNode(Opcode: PPCISD::STFIWX, dl,
8368 VTList: DAG.getVTList(VT: MVT::Other), Ops, MemVT: MVT::i32, MMO);
8369 } else
8370 Chain = DAG.getStore(Chain, dl, Val: Tmp, Ptr: FIPtr, PtrInfo: MPI, Alignment);
8371
8372 // Result is a load from the stack slot. If loading 4 bytes, make sure to
8373 // add in a bias on big endian.
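  // (The conversion result occupies the low-order 32 bits of the f64 image,
  // which an 8-byte big-endian store places at bytes 4-7, hence the +4.)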
8374 if (Op.getValueType() == MVT::i32 && !i32Stack &&
8375 !Subtarget.isLittleEndian()) {
8376 FIPtr = DAG.getNode(Opcode: ISD::ADD, DL: dl, VT: FIPtr.getValueType(), N1: FIPtr,
8377 N2: DAG.getConstant(Val: 4, DL: dl, VT: FIPtr.getValueType()));
8378 MPI = MPI.getWithOffset(O: 4);
8379 }
8380
8381 RLI.Chain = Chain;
8382 RLI.Ptr = FIPtr;
8383 RLI.MPI = MPI;
8384 RLI.Alignment = Alignment;
8385}
8386
8387/// Custom lowers floating point to integer conversions to use
8388/// the direct move instructions available in ISA 2.07 to avoid the
8389/// need for load/store combinations.
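/// For example, an f64 to i32 conversion becomes fctiwz followed by a direct
/// move to a GPR instead of fctiwz + stfiwx + lwz through a stack slot.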
8390SDValue PPCTargetLowering::LowerFP_TO_INTDirectMove(SDValue Op,
8391 SelectionDAG &DAG,
8392 const SDLoc &dl) const {
8393 SDValue Conv = convertFPToInt(Op, DAG, Subtarget);
8394 SDValue Mov = DAG.getNode(Opcode: PPCISD::MFVSR, DL: dl, VT: Op.getValueType(), Operand: Conv);
8395 if (Op->isStrictFPOpcode())
8396 return DAG.getMergeValues(Ops: {Mov, Conv.getValue(R: 1)}, dl);
8397 else
8398 return Mov;
8399}
8400
8401SDValue PPCTargetLowering::LowerFP_TO_INT(SDValue Op, SelectionDAG &DAG,
8402 const SDLoc &dl) const {
8403 bool IsStrict = Op->isStrictFPOpcode();
8404 bool IsSigned = Op.getOpcode() == ISD::FP_TO_SINT ||
8405 Op.getOpcode() == ISD::STRICT_FP_TO_SINT;
8406 SDValue Src = Op.getOperand(i: IsStrict ? 1 : 0);
8407 EVT SrcVT = Src.getValueType();
8408 EVT DstVT = Op.getValueType();
8409
8410 // FP to INT conversions are legal for f128.
8411 if (SrcVT == MVT::f128)
8412 return Subtarget.hasP9Vector() ? Op : SDValue();
8413
8414 // Expand ppcf128 to i32 by hand for the benefit of llvm-gcc bootstrap on
8415 // PPC (the libcall is not available).
8416 if (SrcVT == MVT::ppcf128) {
8417 if (DstVT == MVT::i32) {
8418 // TODO: Conservatively pass only nofpexcept flag here. Need to check and
8419 // set other fast-math flags to FP operations in both strict and
8420 // non-strict cases. (FP_TO_SINT, FSUB)
8421 SDNodeFlags Flags;
8422 Flags.setNoFPExcept(Op->getFlags().hasNoFPExcept());
8423
8424 if (IsSigned) {
8425 SDValue Lo, Hi;
8426 std::tie(args&: Lo, args&: Hi) = DAG.SplitScalar(N: Src, DL: dl, LoVT: MVT::f64, HiVT: MVT::f64);
8427
8428 // Add the two halves of the long double in round-to-zero mode, and use
8429 // a smaller FP_TO_SINT.
8430 if (IsStrict) {
8431 SDValue Res = DAG.getNode(Opcode: PPCISD::STRICT_FADDRTZ, DL: dl,
8432 VTList: DAG.getVTList(VT1: MVT::f64, VT2: MVT::Other),
8433 Ops: {Op.getOperand(i: 0), Lo, Hi}, Flags);
8434 return DAG.getNode(Opcode: ISD::STRICT_FP_TO_SINT, DL: dl,
8435 VTList: DAG.getVTList(VT1: MVT::i32, VT2: MVT::Other),
8436 Ops: {Res.getValue(R: 1), Res}, Flags);
8437 } else {
8438 SDValue Res = DAG.getNode(Opcode: PPCISD::FADDRTZ, DL: dl, VT: MVT::f64, N1: Lo, N2: Hi);
8439 return DAG.getNode(Opcode: ISD::FP_TO_SINT, DL: dl, VT: MVT::i32, Operand: Res);
8440 }
8441 } else {
8442 const uint64_t TwoE31[] = {0x41e0000000000000LL, 0};
8443 APFloat APF = APFloat(APFloat::PPCDoubleDouble(), APInt(128, TwoE31));
8444 SDValue Cst = DAG.getConstantFP(Val: APF, DL: dl, VT: SrcVT);
8445 SDValue SignMask = DAG.getConstant(Val: 0x80000000, DL: dl, VT: DstVT);
8446 if (IsStrict) {
8447 // Sel = Src < 0x80000000
8448 // FltOfs = select Sel, 0.0, 0x80000000
8449 // IntOfs = select Sel, 0, 0x80000000
8450 // Result = fp_to_sint(Src - FltOfs) ^ IntOfs
8451 SDValue Chain = Op.getOperand(i: 0);
8452 EVT SetCCVT =
8453 getSetCCResultType(DL: DAG.getDataLayout(), C&: *DAG.getContext(), VT: SrcVT);
8454 EVT DstSetCCVT =
8455 getSetCCResultType(DL: DAG.getDataLayout(), C&: *DAG.getContext(), VT: DstVT);
8456 SDValue Sel = DAG.getSetCC(DL: dl, VT: SetCCVT, LHS: Src, RHS: Cst, Cond: ISD::SETLT,
8457 Chain, IsSignaling: true);
8458 Chain = Sel.getValue(R: 1);
8459
8460 SDValue FltOfs = DAG.getSelect(
8461 DL: dl, VT: SrcVT, Cond: Sel, LHS: DAG.getConstantFP(Val: 0.0, DL: dl, VT: SrcVT), RHS: Cst);
8462 Sel = DAG.getBoolExtOrTrunc(Op: Sel, SL: dl, VT: DstSetCCVT, OpVT: DstVT);
8463
8464 SDValue Val = DAG.getNode(Opcode: ISD::STRICT_FSUB, DL: dl,
8465 VTList: DAG.getVTList(VT1: SrcVT, VT2: MVT::Other),
8466 Ops: {Chain, Src, FltOfs}, Flags);
8467 Chain = Val.getValue(R: 1);
8468 SDValue SInt = DAG.getNode(Opcode: ISD::STRICT_FP_TO_SINT, DL: dl,
8469 VTList: DAG.getVTList(VT1: DstVT, VT2: MVT::Other),
8470 Ops: {Chain, Val}, Flags);
8471 Chain = SInt.getValue(R: 1);
8472 SDValue IntOfs = DAG.getSelect(
8473 DL: dl, VT: DstVT, Cond: Sel, LHS: DAG.getConstant(Val: 0, DL: dl, VT: DstVT), RHS: SignMask);
8474 SDValue Result = DAG.getNode(Opcode: ISD::XOR, DL: dl, VT: DstVT, N1: SInt, N2: IntOfs);
8475 return DAG.getMergeValues(Ops: {Result, Chain}, dl);
8476 } else {
8477 // X>=2^31 ? (int)(X-2^31)+0x80000000 : (int)X
8478 // FIXME: generated code sucks.
8479 SDValue True = DAG.getNode(Opcode: ISD::FSUB, DL: dl, VT: MVT::ppcf128, N1: Src, N2: Cst);
8480 True = DAG.getNode(Opcode: ISD::FP_TO_SINT, DL: dl, VT: MVT::i32, Operand: True);
8481 True = DAG.getNode(Opcode: ISD::ADD, DL: dl, VT: MVT::i32, N1: True, N2: SignMask);
8482 SDValue False = DAG.getNode(Opcode: ISD::FP_TO_SINT, DL: dl, VT: MVT::i32, Operand: Src);
8483 return DAG.getSelectCC(DL: dl, LHS: Src, RHS: Cst, True, False, Cond: ISD::SETGE);
8484 }
8485 }
8486 }
8487
8488 return SDValue();
8489 }
8490
8491 if (Subtarget.hasDirectMove() && Subtarget.isPPC64())
8492 return LowerFP_TO_INTDirectMove(Op, DAG, dl);
8493
8494 ReuseLoadInfo RLI;
8495 LowerFP_TO_INTForReuse(Op, RLI, DAG, dl);
8496
8497 return DAG.getLoad(VT: Op.getValueType(), dl, Chain: RLI.Chain, Ptr: RLI.Ptr, PtrInfo: RLI.MPI,
8498 Alignment: RLI.Alignment, MMOFlags: RLI.MMOFlags(), AAInfo: RLI.AAInfo, Ranges: RLI.Ranges);
8499}
8500
8501// We're trying to insert a regular store, S, and then a load, L. If the
8502// incoming value, O, is a load, we might just be able to have our load use the
8503// address used by O. However, we don't know if anything else will store to
8504// that address before we can load from it. To prevent this situation, we need
8505// to insert our load, L, into the chain as a peer of O. To do this, we give L
8506// the same chain operand as O, we create a token factor from the chain results
8507// of O and L, and we replace all uses of O's chain result with that token
8508// factor (this last part is handled by makeEquivalentMemoryOrdering).
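// For example, if O is (load A) with input chain Ch, L is emitted as another
// load from A with the same input chain Ch, and every use of O's output chain
// is rewritten to TokenFactor(O:1, L:1), so memory operations that were
// ordered after O are now ordered after L as well.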
8509bool PPCTargetLowering::canReuseLoadAddress(SDValue Op, EVT MemVT,
8510 ReuseLoadInfo &RLI,
8511 SelectionDAG &DAG,
8512 ISD::LoadExtType ET) const {
8513 // Conservatively skip reusing for constrained FP nodes.
8514 if (Op->isStrictFPOpcode())
8515 return false;
8516
8517 SDLoc dl(Op);
8518 bool ValidFPToUint = Op.getOpcode() == ISD::FP_TO_UINT &&
8519 (Subtarget.hasFPCVT() || Op.getValueType() == MVT::i32);
8520 if (ET == ISD::NON_EXTLOAD &&
8521 (ValidFPToUint || Op.getOpcode() == ISD::FP_TO_SINT) &&
8522 isOperationLegalOrCustom(Op: Op.getOpcode(),
8523 VT: Op.getOperand(i: 0).getValueType())) {
8524
8525 LowerFP_TO_INTForReuse(Op, RLI, DAG, dl);
8526 return true;
8527 }
8528
8529 LoadSDNode *LD = dyn_cast<LoadSDNode>(Val&: Op);
8530 if (!LD || LD->getExtensionType() != ET || LD->isVolatile() ||
8531 LD->isNonTemporal())
8532 return false;
8533 if (LD->getMemoryVT() != MemVT)
8534 return false;
8535
  // If the result of the load is an illegal type, then we can't build a
  // valid chain for reuse since the legalised loads and the token factor
  // node that ties them together use a different output chain than the
  // illegal load.
8540 if (!isTypeLegal(VT: LD->getValueType(ResNo: 0)))
8541 return false;
8542
8543 RLI.Ptr = LD->getBasePtr();
8544 if (LD->isIndexed() && !LD->getOffset().isUndef()) {
8545 assert(LD->getAddressingMode() == ISD::PRE_INC &&
8546 "Non-pre-inc AM on PPC?");
8547 RLI.Ptr = DAG.getNode(Opcode: ISD::ADD, DL: dl, VT: RLI.Ptr.getValueType(), N1: RLI.Ptr,
8548 N2: LD->getOffset());
8549 }
8550
8551 RLI.Chain = LD->getChain();
8552 RLI.MPI = LD->getPointerInfo();
8553 RLI.IsDereferenceable = LD->isDereferenceable();
8554 RLI.IsInvariant = LD->isInvariant();
8555 RLI.Alignment = LD->getAlign();
8556 RLI.AAInfo = LD->getAAInfo();
8557 RLI.Ranges = LD->getRanges();
8558
8559 RLI.ResChain = SDValue(LD, LD->isIndexed() ? 2 : 1);
8560 return true;
8561}
8562
/// Analyze the profitability of a direct move: prefer a float load over an
/// int load plus a direct move when there is no integer use of the int load.
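/// For example, (sitofp (load i32)) where the load has no other integer user
/// is better served by a floating-point load (lfiwax/lfiwzx) feeding fcfid
/// than by an integer load followed by a direct move.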
8566bool PPCTargetLowering::directMoveIsProfitable(const SDValue &Op) const {
8567 SDNode *Origin = Op.getOperand(i: Op->isStrictFPOpcode() ? 1 : 0).getNode();
8568 if (Origin->getOpcode() != ISD::LOAD)
8569 return true;
8570
  // If there is no LXSIBZX/LXSIHZX (as on Power8), prefer a direct move when
  // the memory access is only 1 or 2 bytes.
8573 MachineMemOperand *MMO = cast<LoadSDNode>(Val: Origin)->getMemOperand();
8574 if (!Subtarget.hasP9Vector() &&
8575 (!MMO->getSize().hasValue() || MMO->getSize().getValue() <= 2))
8576 return true;
8577
8578 for (SDUse &Use : Origin->uses()) {
8579
8580 // Only look at the users of the loaded value.
8581 if (Use.getResNo() != 0)
8582 continue;
8583
8584 SDNode *User = Use.getUser();
8585 if (User->getOpcode() != ISD::SINT_TO_FP &&
8586 User->getOpcode() != ISD::UINT_TO_FP &&
8587 User->getOpcode() != ISD::STRICT_SINT_TO_FP &&
8588 User->getOpcode() != ISD::STRICT_UINT_TO_FP)
8589 return true;
8590 }
8591
8592 return false;
8593}
8594
8595static SDValue convertIntToFP(SDValue Op, SDValue Src, SelectionDAG &DAG,
8596 const PPCSubtarget &Subtarget,
8597 SDValue Chain = SDValue()) {
8598 bool IsSigned = Op.getOpcode() == ISD::SINT_TO_FP ||
8599 Op.getOpcode() == ISD::STRICT_SINT_TO_FP;
8600 SDLoc dl(Op);
8601
8602 // TODO: Any other flags to propagate?
8603 SDNodeFlags Flags;
8604 Flags.setNoFPExcept(Op->getFlags().hasNoFPExcept());
8605
8606 // If we have FCFIDS, then use it when converting to single-precision.
8607 // Otherwise, convert to double-precision and then round.
8608 bool IsSingle = Op.getValueType() == MVT::f32 && Subtarget.hasFPCVT();
8609 unsigned ConvOpc = IsSingle ? (IsSigned ? PPCISD::FCFIDS : PPCISD::FCFIDUS)
8610 : (IsSigned ? PPCISD::FCFID : PPCISD::FCFIDU);
8611 EVT ConvTy = IsSingle ? MVT::f32 : MVT::f64;
8612 if (Op->isStrictFPOpcode()) {
8613 if (!Chain)
8614 Chain = Op.getOperand(i: 0);
8615 return DAG.getNode(Opcode: getPPCStrictOpcode(Opc: ConvOpc), DL: dl,
8616 VTList: DAG.getVTList(VT1: ConvTy, VT2: MVT::Other), Ops: {Chain, Src}, Flags);
8617 } else
8618 return DAG.getNode(Opcode: ConvOpc, DL: dl, VT: ConvTy, Operand: Src);
8619}
8620
8621/// Custom lowers integer to floating point conversions to use
8622/// the direct move instructions available in ISA 2.07 to avoid the
8623/// need for load/store combinations.
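/// For example, an i32 to f64 conversion becomes mtvsrwa/mtvsrwz followed by
/// fcfid/fcfidu, avoiding the stack store and lfiwax of the generic path.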
8624SDValue PPCTargetLowering::LowerINT_TO_FPDirectMove(SDValue Op,
8625 SelectionDAG &DAG,
8626 const SDLoc &dl) const {
8627 assert((Op.getValueType() == MVT::f32 ||
8628 Op.getValueType() == MVT::f64) &&
8629 "Invalid floating point type as target of conversion");
8630 assert(Subtarget.hasFPCVT() &&
8631 "Int to FP conversions with direct moves require FPCVT");
8632 SDValue Src = Op.getOperand(i: Op->isStrictFPOpcode() ? 1 : 0);
8633 bool WordInt = Src.getSimpleValueType().SimpleTy == MVT::i32;
8634 bool Signed = Op.getOpcode() == ISD::SINT_TO_FP ||
8635 Op.getOpcode() == ISD::STRICT_SINT_TO_FP;
8636 unsigned MovOpc = (WordInt && !Signed) ? PPCISD::MTVSRZ : PPCISD::MTVSRA;
8637 SDValue Mov = DAG.getNode(Opcode: MovOpc, DL: dl, VT: MVT::f64, Operand: Src);
8638 return convertIntToFP(Op, Src: Mov, DAG, Subtarget);
8639}
8640
8641static SDValue widenVec(SelectionDAG &DAG, SDValue Vec, const SDLoc &dl) {
8642
8643 EVT VecVT = Vec.getValueType();
8644 assert(VecVT.isVector() && "Expected a vector type.");
8645 assert(VecVT.getSizeInBits() < 128 && "Vector is already full width.");
8646
8647 EVT EltVT = VecVT.getVectorElementType();
8648 unsigned WideNumElts = 128 / EltVT.getSizeInBits();
8649 EVT WideVT = EVT::getVectorVT(Context&: *DAG.getContext(), VT: EltVT, NumElements: WideNumElts);
8650
8651 unsigned NumConcat = WideNumElts / VecVT.getVectorNumElements();
8652 SmallVector<SDValue, 16> Ops(NumConcat);
8653 Ops[0] = Vec;
8654 SDValue UndefVec = DAG.getUNDEF(VT: VecVT);
8655 for (unsigned i = 1; i < NumConcat; ++i)
8656 Ops[i] = UndefVec;
8657
8658 return DAG.getNode(Opcode: ISD::CONCAT_VECTORS, DL: dl, VT: WideVT, Ops);
8659}
8660
8661SDValue PPCTargetLowering::LowerINT_TO_FPVector(SDValue Op, SelectionDAG &DAG,
8662 const SDLoc &dl) const {
8663 bool IsStrict = Op->isStrictFPOpcode();
8664 unsigned Opc = Op.getOpcode();
8665 SDValue Src = Op.getOperand(i: IsStrict ? 1 : 0);
8666 assert((Opc == ISD::UINT_TO_FP || Opc == ISD::SINT_TO_FP ||
8667 Opc == ISD::STRICT_UINT_TO_FP || Opc == ISD::STRICT_SINT_TO_FP) &&
8668 "Unexpected conversion type");
8669 assert((Op.getValueType() == MVT::v2f64 || Op.getValueType() == MVT::v4f32) &&
8670 "Supports conversions to v2f64/v4f32 only.");
8671
8672 // TODO: Any other flags to propagate?
8673 SDNodeFlags Flags;
8674 Flags.setNoFPExcept(Op->getFlags().hasNoFPExcept());
8675
8676 bool SignedConv = Opc == ISD::SINT_TO_FP || Opc == ISD::STRICT_SINT_TO_FP;
8677 bool FourEltRes = Op.getValueType() == MVT::v4f32;
8678
8679 SDValue Wide = widenVec(DAG, Vec: Src, dl);
8680 EVT WideVT = Wide.getValueType();
8681 unsigned WideNumElts = WideVT.getVectorNumElements();
8682 MVT IntermediateVT = FourEltRes ? MVT::v4i32 : MVT::v2i64;
8683
8684 SmallVector<int, 16> ShuffV;
8685 for (unsigned i = 0; i < WideNumElts; ++i)
8686 ShuffV.push_back(Elt: i + WideNumElts);
8687
8688 int Stride = FourEltRes ? WideNumElts / 4 : WideNumElts / 2;
8689 int SaveElts = FourEltRes ? 4 : 2;
8690 if (Subtarget.isLittleEndian())
8691 for (int i = 0; i < SaveElts; i++)
8692 ShuffV[i * Stride] = i;
8693 else
8694 for (int i = 1; i <= SaveElts; i++)
8695 ShuffV[i * Stride - 1] = i - 1;
8696
8697 SDValue ShuffleSrc2 =
8698 SignedConv ? DAG.getUNDEF(VT: WideVT) : DAG.getConstant(Val: 0, DL: dl, VT: WideVT);
8699 SDValue Arrange = DAG.getVectorShuffle(VT: WideVT, dl, N1: Wide, N2: ShuffleSrc2, Mask: ShuffV);
8700
8701 SDValue Extend;
8702 if (SignedConv) {
8703 Arrange = DAG.getBitcast(VT: IntermediateVT, V: Arrange);
8704 EVT ExtVT = Src.getValueType();
8705 if (Subtarget.hasP9Altivec())
8706 ExtVT = EVT::getVectorVT(Context&: *DAG.getContext(), VT: WideVT.getVectorElementType(),
8707 NumElements: IntermediateVT.getVectorNumElements());
8708
8709 Extend = DAG.getNode(Opcode: ISD::SIGN_EXTEND_INREG, DL: dl, VT: IntermediateVT, N1: Arrange,
8710 N2: DAG.getValueType(ExtVT));
8711 } else
8712 Extend = DAG.getNode(Opcode: ISD::BITCAST, DL: dl, VT: IntermediateVT, Operand: Arrange);
8713
8714 if (IsStrict)
8715 return DAG.getNode(Opcode: Opc, DL: dl, VTList: DAG.getVTList(VT1: Op.getValueType(), VT2: MVT::Other),
8716 Ops: {Op.getOperand(i: 0), Extend}, Flags);
8717
8718 return DAG.getNode(Opcode: Opc, DL: dl, VT: Op.getValueType(), Operand: Extend);
8719}
8720
8721SDValue PPCTargetLowering::LowerINT_TO_FP(SDValue Op,
8722 SelectionDAG &DAG) const {
8723 SDLoc dl(Op);
8724 bool IsSigned = Op.getOpcode() == ISD::SINT_TO_FP ||
8725 Op.getOpcode() == ISD::STRICT_SINT_TO_FP;
8726 bool IsStrict = Op->isStrictFPOpcode();
8727 SDValue Src = Op.getOperand(i: IsStrict ? 1 : 0);
8728 SDValue Chain = IsStrict ? Op.getOperand(i: 0) : DAG.getEntryNode();
8729
8730 // TODO: Any other flags to propagate?
8731 SDNodeFlags Flags;
8732 Flags.setNoFPExcept(Op->getFlags().hasNoFPExcept());
8733
8734 EVT InVT = Src.getValueType();
8735 EVT OutVT = Op.getValueType();
8736 if (OutVT.isVector() && OutVT.isFloatingPoint() &&
8737 isOperationCustom(Op: Op.getOpcode(), VT: InVT))
8738 return LowerINT_TO_FPVector(Op, DAG, dl);
8739
8740 // Conversions to f128 are legal.
8741 if (Op.getValueType() == MVT::f128)
8742 return Subtarget.hasP9Vector() ? Op : SDValue();
8743
8744 // Don't handle ppc_fp128 here; let it be lowered to a libcall.
8745 if (Op.getValueType() != MVT::f32 && Op.getValueType() != MVT::f64)
8746 return SDValue();
8747
8748 if (Src.getValueType() == MVT::i1) {
8749 SDValue Sel = DAG.getNode(Opcode: ISD::SELECT, DL: dl, VT: Op.getValueType(), N1: Src,
8750 N2: DAG.getConstantFP(Val: 1.0, DL: dl, VT: Op.getValueType()),
8751 N3: DAG.getConstantFP(Val: 0.0, DL: dl, VT: Op.getValueType()));
8752 if (IsStrict)
8753 return DAG.getMergeValues(Ops: {Sel, Chain}, dl);
8754 else
8755 return Sel;
8756 }
8757
8758 // If we have direct moves, we can do all the conversion, skip the store/load
8759 // however, without FPCVT we can't do most conversions.
8760 if (Subtarget.hasDirectMove() && directMoveIsProfitable(Op) &&
8761 Subtarget.isPPC64() && Subtarget.hasFPCVT())
8762 return LowerINT_TO_FPDirectMove(Op, DAG, dl);
8763
8764 assert((IsSigned || Subtarget.hasFPCVT()) &&
8765 "UINT_TO_FP is supported only with FPCVT");
8766
8767 if (Src.getValueType() == MVT::i64) {
8768 SDValue SINT = Src;
8769 // When converting to single-precision, we actually need to convert
8770 // to double-precision first and then round to single-precision.
8771 // To avoid double-rounding effects during that operation, we have
8772 // to prepare the input operand. Bits that might be truncated when
8773 // converting to double-precision are replaced by a bit that won't
8774 // be lost at this stage, but is below the single-precision rounding
8775 // position.
8776 //
    // However, if the 'afn' (approximate functions) fast-math flag is in
    // effect, accept the double rounding to avoid the extra overhead.
    // FIXME: INT_TO_FP currently can't carry fast-math flags because of the
    // nneg flag, so Op->getFlags().hasApproximateFuncs() is always false
    // here.
8782 if (Op.getValueType() == MVT::f32 && !Subtarget.hasFPCVT() &&
8783 !Op->getFlags().hasApproximateFuncs()) {
8784
8785 // Twiddle input to make sure the low 11 bits are zero. (If this
8786 // is the case, we are guaranteed the value will fit into the 53 bit
8787 // mantissa of an IEEE double-precision value without rounding.)
8788 // If any of those low 11 bits were not zero originally, make sure
8789 // bit 12 (value 2048) is set instead, so that the final rounding
8790 // to single-precision gets the correct result.
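      // For example, SINT = 0x0123456789ABCDEF has nonzero low bits (0x5EF),
      // so the sequence below yields (SINT & ~0x7FF) | 0x800 =
      // 0x0123456789ABC800; an input whose low 11 bits are already zero is
      // left unchanged.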
8791 SDValue Round = DAG.getNode(Opcode: ISD::AND, DL: dl, VT: MVT::i64,
8792 N1: SINT, N2: DAG.getConstant(Val: 2047, DL: dl, VT: MVT::i64));
8793 Round = DAG.getNode(Opcode: ISD::ADD, DL: dl, VT: MVT::i64,
8794 N1: Round, N2: DAG.getConstant(Val: 2047, DL: dl, VT: MVT::i64));
8795 Round = DAG.getNode(Opcode: ISD::OR, DL: dl, VT: MVT::i64, N1: Round, N2: SINT);
8796 Round = DAG.getNode(Opcode: ISD::AND, DL: dl, VT: MVT::i64, N1: Round,
8797 N2: DAG.getSignedConstant(Val: -2048, DL: dl, VT: MVT::i64));
8798
8799 // However, we cannot use that value unconditionally: if the magnitude
8800 // of the input value is small, the bit-twiddling we did above might
8801 // end up visibly changing the output. Fortunately, in that case, we
8802 // don't need to twiddle bits since the original input will convert
8803 // exactly to double-precision floating-point already. Therefore,
8804 // construct a conditional to use the original value if the top 11
8805 // bits are all sign-bit copies, and use the rounded value computed
8806 // above otherwise.
8807 SDValue Cond = DAG.getNode(Opcode: ISD::SRA, DL: dl, VT: MVT::i64,
8808 N1: SINT, N2: DAG.getConstant(Val: 53, DL: dl, VT: MVT::i32));
8809 Cond = DAG.getNode(Opcode: ISD::ADD, DL: dl, VT: MVT::i64,
8810 N1: Cond, N2: DAG.getConstant(Val: 1, DL: dl, VT: MVT::i64));
8811 Cond = DAG.getSetCC(
8812 DL: dl,
8813 VT: getSetCCResultType(DL: DAG.getDataLayout(), C&: *DAG.getContext(), VT: MVT::i64),
8814 LHS: Cond, RHS: DAG.getConstant(Val: 1, DL: dl, VT: MVT::i64), Cond: ISD::SETUGT);
8815
8816 SINT = DAG.getNode(Opcode: ISD::SELECT, DL: dl, VT: MVT::i64, N1: Cond, N2: Round, N3: SINT);
8817 }
8818
8819 ReuseLoadInfo RLI;
8820 SDValue Bits;
8821
8822 MachineFunction &MF = DAG.getMachineFunction();
8823 if (canReuseLoadAddress(Op: SINT, MemVT: MVT::i64, RLI, DAG)) {
8824 Bits = DAG.getLoad(VT: MVT::f64, dl, Chain: RLI.Chain, Ptr: RLI.Ptr, PtrInfo: RLI.MPI,
8825 Alignment: RLI.Alignment, MMOFlags: RLI.MMOFlags(), AAInfo: RLI.AAInfo, Ranges: RLI.Ranges);
8826 if (RLI.ResChain)
8827 DAG.makeEquivalentMemoryOrdering(OldChain: RLI.ResChain, NewMemOpChain: Bits.getValue(R: 1));
8828 } else if (Subtarget.hasLFIWAX() &&
8829 canReuseLoadAddress(Op: SINT, MemVT: MVT::i32, RLI, DAG, ET: ISD::SEXTLOAD)) {
8830 MachineMemOperand *MMO =
8831 MF.getMachineMemOperand(PtrInfo: RLI.MPI, F: MachineMemOperand::MOLoad, Size: 4,
8832 BaseAlignment: RLI.Alignment, AAInfo: RLI.AAInfo, Ranges: RLI.Ranges);
8833 SDValue Ops[] = { RLI.Chain, RLI.Ptr };
8834 Bits = DAG.getMemIntrinsicNode(Opcode: PPCISD::LFIWAX, dl,
8835 VTList: DAG.getVTList(VT1: MVT::f64, VT2: MVT::Other),
8836 Ops, MemVT: MVT::i32, MMO);
8837 if (RLI.ResChain)
8838 DAG.makeEquivalentMemoryOrdering(OldChain: RLI.ResChain, NewMemOpChain: Bits.getValue(R: 1));
8839 } else if (Subtarget.hasFPCVT() &&
8840 canReuseLoadAddress(Op: SINT, MemVT: MVT::i32, RLI, DAG, ET: ISD::ZEXTLOAD)) {
8841 MachineMemOperand *MMO =
8842 MF.getMachineMemOperand(PtrInfo: RLI.MPI, F: MachineMemOperand::MOLoad, Size: 4,
8843 BaseAlignment: RLI.Alignment, AAInfo: RLI.AAInfo, Ranges: RLI.Ranges);
8844 SDValue Ops[] = { RLI.Chain, RLI.Ptr };
8845 Bits = DAG.getMemIntrinsicNode(Opcode: PPCISD::LFIWZX, dl,
8846 VTList: DAG.getVTList(VT1: MVT::f64, VT2: MVT::Other),
8847 Ops, MemVT: MVT::i32, MMO);
8848 if (RLI.ResChain)
8849 DAG.makeEquivalentMemoryOrdering(OldChain: RLI.ResChain, NewMemOpChain: Bits.getValue(R: 1));
8850 } else if (((Subtarget.hasLFIWAX() &&
8851 SINT.getOpcode() == ISD::SIGN_EXTEND) ||
8852 (Subtarget.hasFPCVT() &&
8853 SINT.getOpcode() == ISD::ZERO_EXTEND)) &&
8854 SINT.getOperand(i: 0).getValueType() == MVT::i32) {
8855 MachineFrameInfo &MFI = MF.getFrameInfo();
8856 EVT PtrVT = getPointerTy(DL: DAG.getDataLayout());
8857
8858 int FrameIdx = MFI.CreateStackObject(Size: 4, Alignment: Align(4), isSpillSlot: false);
8859 SDValue FIdx = DAG.getFrameIndex(FI: FrameIdx, VT: PtrVT);
8860
8861 SDValue Store = DAG.getStore(Chain, dl, Val: SINT.getOperand(i: 0), Ptr: FIdx,
8862 PtrInfo: MachinePointerInfo::getFixedStack(
8863 MF&: DAG.getMachineFunction(), FI: FrameIdx));
8864 Chain = Store;
8865
8866 assert(cast<StoreSDNode>(Store)->getMemoryVT() == MVT::i32 &&
8867 "Expected an i32 store");
8868
8869 RLI.Ptr = FIdx;
8870 RLI.Chain = Chain;
8871 RLI.MPI =
8872 MachinePointerInfo::getFixedStack(MF&: DAG.getMachineFunction(), FI: FrameIdx);
8873 RLI.Alignment = Align(4);
8874
8875 MachineMemOperand *MMO =
8876 MF.getMachineMemOperand(PtrInfo: RLI.MPI, F: MachineMemOperand::MOLoad, Size: 4,
8877 BaseAlignment: RLI.Alignment, AAInfo: RLI.AAInfo, Ranges: RLI.Ranges);
8878 SDValue Ops[] = { RLI.Chain, RLI.Ptr };
8879 Bits = DAG.getMemIntrinsicNode(Opcode: SINT.getOpcode() == ISD::ZERO_EXTEND ?
8880 PPCISD::LFIWZX : PPCISD::LFIWAX,
8881 dl, VTList: DAG.getVTList(VT1: MVT::f64, VT2: MVT::Other),
8882 Ops, MemVT: MVT::i32, MMO);
8883 Chain = Bits.getValue(R: 1);
8884 } else
8885 Bits = DAG.getNode(Opcode: ISD::BITCAST, DL: dl, VT: MVT::f64, Operand: SINT);
8886
8887 SDValue FP = convertIntToFP(Op, Src: Bits, DAG, Subtarget, Chain);
8888 if (IsStrict)
8889 Chain = FP.getValue(R: 1);
8890
8891 if (Op.getValueType() == MVT::f32 && !Subtarget.hasFPCVT()) {
8892 if (IsStrict)
8893 FP = DAG.getNode(
8894 Opcode: ISD::STRICT_FP_ROUND, DL: dl, VTList: DAG.getVTList(VT1: MVT::f32, VT2: MVT::Other),
8895 Ops: {Chain, FP, DAG.getIntPtrConstant(Val: 0, DL: dl, /*isTarget=*/true)},
8896 Flags);
8897 else
8898 FP = DAG.getNode(Opcode: ISD::FP_ROUND, DL: dl, VT: MVT::f32, N1: FP,
8899 N2: DAG.getIntPtrConstant(Val: 0, DL: dl, /*isTarget=*/true));
8900 }
8901 return FP;
8902 }
8903
8904 assert(Src.getValueType() == MVT::i32 &&
8905 "Unhandled INT_TO_FP type in custom expander!");
  // Without LFIWAX/LFIWZX we only generate this in 64-bit mode, where we can
  // take advantage of 64-bit registers: sign-extend the input value into a
  // 64-bit register with extsw, store the whole 64-bit value onto the stack,
  // then lfd it and fcfid it.
8910 MachineFunction &MF = DAG.getMachineFunction();
8911 MachineFrameInfo &MFI = MF.getFrameInfo();
8912 EVT PtrVT = getPointerTy(DL: MF.getDataLayout());
8913
8914 SDValue Ld;
8915 if (Subtarget.hasLFIWAX() || Subtarget.hasFPCVT()) {
8916 ReuseLoadInfo RLI;
8917 bool ReusingLoad;
8918 if (!(ReusingLoad = canReuseLoadAddress(Op: Src, MemVT: MVT::i32, RLI, DAG))) {
8919 int FrameIdx = MFI.CreateStackObject(Size: 4, Alignment: Align(4), isSpillSlot: false);
8920 SDValue FIdx = DAG.getFrameIndex(FI: FrameIdx, VT: PtrVT);
8921
8922 SDValue Store = DAG.getStore(Chain, dl, Val: Src, Ptr: FIdx,
8923 PtrInfo: MachinePointerInfo::getFixedStack(
8924 MF&: DAG.getMachineFunction(), FI: FrameIdx));
8925 Chain = Store;
8926
8927 assert(cast<StoreSDNode>(Store)->getMemoryVT() == MVT::i32 &&
8928 "Expected an i32 store");
8929
8930 RLI.Ptr = FIdx;
8931 RLI.Chain = Chain;
8932 RLI.MPI =
8933 MachinePointerInfo::getFixedStack(MF&: DAG.getMachineFunction(), FI: FrameIdx);
8934 RLI.Alignment = Align(4);
8935 }
8936
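// Load the i32 (either from the reused address or the stack slot created
// above) into an FPR with lfiwax (signed) or lfiwzx (unsigned) so the
// FCFID-family conversion below can use it directly.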
8937 MachineMemOperand *MMO =
8938 MF.getMachineMemOperand(PtrInfo: RLI.MPI, F: MachineMemOperand::MOLoad, Size: 4,
8939 BaseAlignment: RLI.Alignment, AAInfo: RLI.AAInfo, Ranges: RLI.Ranges);
8940 SDValue Ops[] = { RLI.Chain, RLI.Ptr };
8941 Ld = DAG.getMemIntrinsicNode(Opcode: IsSigned ? PPCISD::LFIWAX : PPCISD::LFIWZX, dl,
8942 VTList: DAG.getVTList(VT1: MVT::f64, VT2: MVT::Other), Ops,
8943 MemVT: MVT::i32, MMO);
8944 Chain = Ld.getValue(R: 1);
8945 if (ReusingLoad && RLI.ResChain) {
8946 DAG.makeEquivalentMemoryOrdering(OldChain: RLI.ResChain, NewMemOpChain: Ld.getValue(R: 1));
8947 }
8948 } else {
8949 assert(Subtarget.isPPC64() &&
8950 "i32->FP without LFIWAX supported only on PPC64");
8951
8952 int FrameIdx = MFI.CreateStackObject(Size: 8, Alignment: Align(8), isSpillSlot: false);
8953 SDValue FIdx = DAG.getFrameIndex(FI: FrameIdx, VT: PtrVT);
8954
8955 SDValue Ext64 = DAG.getNode(Opcode: ISD::SIGN_EXTEND, DL: dl, VT: MVT::i64, Operand: Src);
8956
8957 // STD the extended value into the stack slot.
8958 SDValue Store = DAG.getStore(
8959 Chain, dl, Val: Ext64, Ptr: FIdx,
8960 PtrInfo: MachinePointerInfo::getFixedStack(MF&: DAG.getMachineFunction(), FI: FrameIdx));
8961 Chain = Store;
8962
8963 // Load the value as a double.
8964 Ld = DAG.getLoad(
8965 VT: MVT::f64, dl, Chain, Ptr: FIdx,
8966 PtrInfo: MachinePointerInfo::getFixedStack(MF&: DAG.getMachineFunction(), FI: FrameIdx));
8967 Chain = Ld.getValue(R: 1);
8968 }
8969
8970 // FCFID it and return it.
8971 SDValue FP = convertIntToFP(Op, Src: Ld, DAG, Subtarget, Chain);
8972 if (IsStrict)
8973 Chain = FP.getValue(R: 1);
8974 if (Op.getValueType() == MVT::f32 && !Subtarget.hasFPCVT()) {
8975 if (IsStrict)
8976 FP = DAG.getNode(
8977 Opcode: ISD::STRICT_FP_ROUND, DL: dl, VTList: DAG.getVTList(VT1: MVT::f32, VT2: MVT::Other),
8978 Ops: {Chain, FP, DAG.getIntPtrConstant(Val: 0, DL: dl, /*isTarget=*/true)}, Flags);
8979 else
8980 FP = DAG.getNode(Opcode: ISD::FP_ROUND, DL: dl, VT: MVT::f32, N1: FP,
8981 N2: DAG.getIntPtrConstant(Val: 0, DL: dl, /*isTarget=*/true));
8982 }
8983 return FP;
8984}
8985
8986SDValue PPCTargetLowering::LowerSET_ROUNDING(SDValue Op,
8987 SelectionDAG &DAG) const {
8988 SDLoc Dl(Op);
8989 MachineFunction &MF = DAG.getMachineFunction();
8990 EVT PtrVT = getPointerTy(DL: MF.getDataLayout());
8991 SDValue Chain = Op.getOperand(i: 0);
8992
8993 // If the requested mode is constant, just use the simpler mtfsb/mffscrni.
8994 if (auto *CVal = dyn_cast<ConstantSDNode>(Val: Op.getOperand(i: 1))) {
8995 uint64_t Mode = CVal->getZExtValue();
8996 assert(Mode < 4 && "Unsupported rounding mode!");
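// Fold the LLVM->FPSCR rounding-mode conversion (x ^ (~(x >> 1) & 1), see
// below) at compile time since the mode is a constant.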
8997 unsigned InternalRnd = Mode ^ (~(Mode >> 1) & 1);
8998 if (Subtarget.isISA3_0())
8999 return SDValue(
9000 DAG.getMachineNode(
9001 Opcode: PPC::MFFSCRNI, dl: Dl, ResultTys: {MVT::f64, MVT::Other},
9002 Ops: {DAG.getConstant(Val: InternalRnd, DL: Dl, VT: MVT::i32, isTarget: true), Chain}),
9003 1);
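// Pre-ISA 3.0: set the two RN bits (FPSCR bits 30 and 31) individually
// with mtfsb0/mtfsb1.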
9004 SDNode *SetHi = DAG.getMachineNode(
9005 Opcode: (InternalRnd & 2) ? PPC::MTFSB1 : PPC::MTFSB0, dl: Dl, VT: MVT::Other,
9006 Ops: {DAG.getConstant(Val: 30, DL: Dl, VT: MVT::i32, isTarget: true), Chain});
9007 SDNode *SetLo = DAG.getMachineNode(
9008 Opcode: (InternalRnd & 1) ? PPC::MTFSB1 : PPC::MTFSB0, dl: Dl, VT: MVT::Other,
9009 Ops: {DAG.getConstant(Val: 31, DL: Dl, VT: MVT::i32, isTarget: true), SDValue(SetHi, 0)});
9010 return SDValue(SetLo, 0);
9011 }
9012
9013 // Use x ^ (~(x >> 1) & 1) to transform LLVM rounding mode to Power format.
9014 SDValue One = DAG.getConstant(Val: 1, DL: Dl, VT: MVT::i32);
9015 SDValue SrcFlag = DAG.getNode(Opcode: ISD::AND, DL: Dl, VT: MVT::i32, N1: Op.getOperand(i: 1),
9016 N2: DAG.getConstant(Val: 3, DL: Dl, VT: MVT::i32));
9017 SDValue DstFlag = DAG.getNode(
9018 Opcode: ISD::XOR, DL: Dl, VT: MVT::i32, N1: SrcFlag,
9019 N2: DAG.getNode(Opcode: ISD::AND, DL: Dl, VT: MVT::i32,
9020 N1: DAG.getNOT(DL: Dl,
9021 Val: DAG.getNode(Opcode: ISD::SRL, DL: Dl, VT: MVT::i32, N1: SrcFlag, N2: One),
9022 VT: MVT::i32),
9023 N2: One));
9024 // For Power9, there's a faster mffscrn, and we don't need to read the FPSCR.
9025 SDValue MFFS;
9026 if (!Subtarget.isISA3_0()) {
9027 MFFS = DAG.getNode(Opcode: PPCISD::MFFS, DL: Dl, ResultTys: {MVT::f64, MVT::Other}, Ops: Chain);
9028 Chain = MFFS.getValue(R: 1);
9029 }
9030 SDValue NewFPSCR;
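// Build an f64 image of the FPSCR that carries the new RN field; 64-bit
// targets can assemble it in a GPR, 32-bit targets go through a stack slot.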
9031 if (Subtarget.isPPC64()) {
9032 if (Subtarget.isISA3_0()) {
9033 NewFPSCR = DAG.getAnyExtOrTrunc(Op: DstFlag, DL: Dl, VT: MVT::i64);
9034 } else {
9035 // Set the last two bits (rounding mode) of bitcasted FPSCR.
9036 SDNode *InsertRN = DAG.getMachineNode(
9037 Opcode: PPC::RLDIMI, dl: Dl, VT: MVT::i64,
9038 Ops: {DAG.getNode(Opcode: ISD::BITCAST, DL: Dl, VT: MVT::i64, Operand: MFFS),
9039 DAG.getNode(Opcode: ISD::ZERO_EXTEND, DL: Dl, VT: MVT::i64, Operand: DstFlag),
9040 DAG.getTargetConstant(Val: 0, DL: Dl, VT: MVT::i32),
9041 DAG.getTargetConstant(Val: 62, DL: Dl, VT: MVT::i32)});
9042 NewFPSCR = SDValue(InsertRN, 0);
9043 }
9044 NewFPSCR = DAG.getNode(Opcode: ISD::BITCAST, DL: Dl, VT: MVT::f64, Operand: NewFPSCR);
9045 } else {
9046 // In 32-bit mode, store the f64, then load and update the lower half.
9047 int SSFI = MF.getFrameInfo().CreateStackObject(Size: 8, Alignment: Align(8), isSpillSlot: false);
9048 SDValue StackSlot = DAG.getFrameIndex(FI: SSFI, VT: PtrVT);
9049 SDValue Addr = Subtarget.isLittleEndian()
9050 ? StackSlot
9051 : DAG.getNode(Opcode: ISD::ADD, DL: Dl, VT: PtrVT, N1: StackSlot,
9052 N2: DAG.getConstant(Val: 4, DL: Dl, VT: PtrVT));
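// With ISA 3.0 only the two RN bits matter, so store DstFlag directly;
// otherwise store the old FPSCR, patch its low word with rlwimi, and store
// the patched word back.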
9053 if (Subtarget.isISA3_0()) {
9054 Chain = DAG.getStore(Chain, dl: Dl, Val: DstFlag, Ptr: Addr, PtrInfo: MachinePointerInfo());
9055 } else {
9056 Chain = DAG.getStore(Chain, dl: Dl, Val: MFFS, Ptr: StackSlot, PtrInfo: MachinePointerInfo());
9057 SDValue Tmp =
9058 DAG.getLoad(VT: MVT::i32, dl: Dl, Chain, Ptr: Addr, PtrInfo: MachinePointerInfo());
9059 Chain = Tmp.getValue(R: 1);
9060 Tmp = SDValue(DAG.getMachineNode(
9061 Opcode: PPC::RLWIMI, dl: Dl, VT: MVT::i32,
9062 Ops: {Tmp, DstFlag, DAG.getTargetConstant(Val: 0, DL: Dl, VT: MVT::i32),
9063 DAG.getTargetConstant(Val: 30, DL: Dl, VT: MVT::i32),
9064 DAG.getTargetConstant(Val: 31, DL: Dl, VT: MVT::i32)}),
9065 0);
9066 Chain = DAG.getStore(Chain, dl: Dl, Val: Tmp, Ptr: Addr, PtrInfo: MachinePointerInfo());
9067 }
9068 NewFPSCR =
9069 DAG.getLoad(VT: MVT::f64, dl: Dl, Chain, Ptr: StackSlot, PtrInfo: MachinePointerInfo());
9070 Chain = NewFPSCR.getValue(R: 1);
9071 }
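// Install the new rounding mode: mffscrn on ISA 3.0, otherwise rewrite the
// whole FPSCR from the f64 image with mtfsf (field mask 0xFF).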
9072 if (Subtarget.isISA3_0())
9073 return SDValue(DAG.getMachineNode(Opcode: PPC::MFFSCRN, dl: Dl, ResultTys: {MVT::f64, MVT::Other},
9074 Ops: {NewFPSCR, Chain}),
9075 1);
9076 SDValue Zero = DAG.getConstant(Val: 0, DL: Dl, VT: MVT::i32, isTarget: true);
9077 SDNode *MTFSF = DAG.getMachineNode(
9078 Opcode: PPC::MTFSF, dl: Dl, VT: MVT::Other,
9079 Ops: {DAG.getConstant(Val: 255, DL: Dl, VT: MVT::i32, isTarget: true), NewFPSCR, Zero, Zero, Chain});
9080 return SDValue(MTFSF, 0);
9081}
9082
9083SDValue PPCTargetLowering::LowerGET_ROUNDING(SDValue Op,
9084 SelectionDAG &DAG) const {
9085 SDLoc dl(Op);
9086 /*
9087 The rounding mode is in bits 30:31 of FPSCR, and has the following
9088 settings:
9089 00 Round to nearest
9090 01 Round to 0
9091 10 Round to +inf
9092 11 Round to -inf
9093
9094 GET_ROUNDING, on the other hand, expects the following:
9095 -1 Undefined
9096 0 Round to 0
9097 1 Round to nearest
9098 2 Round to +inf
9099 3 Round to -inf
9100
9101 To perform the conversion, we do:
9102 ((FPSCR & 0x3) ^ ((~FPSCR & 0x3) >> 1))
9103 */
9104
9105 MachineFunction &MF = DAG.getMachineFunction();
9106 EVT VT = Op.getValueType();
9107 EVT PtrVT = getPointerTy(DL: MF.getDataLayout());
9108
9109 // Save FP Control Word to register
9110 SDValue Chain = Op.getOperand(i: 0);
9111 SDValue MFFS = DAG.getNode(Opcode: PPCISD::MFFS, DL: dl, ResultTys: {MVT::f64, MVT::Other}, Ops: Chain);
9112 Chain = MFFS.getValue(R: 1);
9113
9114 SDValue CWD;
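// If i64 is legal, the FPSCR image can be moved to a GPR with a bitcast and
// truncated; otherwise spill the f64 and reload the low 32-bit word.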
9115 if (isTypeLegal(VT: MVT::i64)) {
9116 CWD = DAG.getNode(Opcode: ISD::TRUNCATE, DL: dl, VT: MVT::i32,
9117 Operand: DAG.getNode(Opcode: ISD::BITCAST, DL: dl, VT: MVT::i64, Operand: MFFS));
9118 } else {
9119 // Save FP register to stack slot
9120 int SSFI = MF.getFrameInfo().CreateStackObject(Size: 8, Alignment: Align(8), isSpillSlot: false);
9121 SDValue StackSlot = DAG.getFrameIndex(FI: SSFI, VT: PtrVT);
9122 Chain = DAG.getStore(Chain, dl, Val: MFFS, Ptr: StackSlot, PtrInfo: MachinePointerInfo());
9123
9124 // Load FP Control Word from low 32 bits of stack slot.
9125 assert(hasBigEndianPartOrdering(MVT::i64, MF.getDataLayout()) &&
9126 "Stack slot adjustment is valid only on big endian subtargets!");
9127 SDValue Four = DAG.getConstant(Val: 4, DL: dl, VT: PtrVT);
9128 SDValue Addr = DAG.getNode(Opcode: ISD::ADD, DL: dl, VT: PtrVT, N1: StackSlot, N2: Four);
9129 CWD = DAG.getLoad(VT: MVT::i32, dl, Chain, Ptr: Addr, PtrInfo: MachinePointerInfo());
9130 Chain = CWD.getValue(R: 1);
9131 }
9132
9133 // Transform as necessary
9134 SDValue CWD1 =
9135 DAG.getNode(Opcode: ISD::AND, DL: dl, VT: MVT::i32,
9136 N1: CWD, N2: DAG.getConstant(Val: 3, DL: dl, VT: MVT::i32));
9137 SDValue CWD2 =
9138 DAG.getNode(Opcode: ISD::SRL, DL: dl, VT: MVT::i32,
9139 N1: DAG.getNode(Opcode: ISD::AND, DL: dl, VT: MVT::i32,
9140 N1: DAG.getNode(Opcode: ISD::XOR, DL: dl, VT: MVT::i32,
9141 N1: CWD, N2: DAG.getConstant(Val: 3, DL: dl, VT: MVT::i32)),
9142 N2: DAG.getConstant(Val: 3, DL: dl, VT: MVT::i32)),
9143 N2: DAG.getConstant(Val: 1, DL: dl, VT: MVT::i32));
9144
9145 SDValue RetVal =
9146 DAG.getNode(Opcode: ISD::XOR, DL: dl, VT: MVT::i32, N1: CWD1, N2: CWD2);
9147
9148 RetVal =
9149 DAG.getNode(Opcode: (VT.getSizeInBits() < 16 ? ISD::TRUNCATE : ISD::ZERO_EXTEND),
9150 DL: dl, VT, Operand: RetVal);
9151
9152 return DAG.getMergeValues(Ops: {RetVal, Chain}, dl);
9153}
9154
9155SDValue PPCTargetLowering::LowerSHL_PARTS(SDValue Op, SelectionDAG &DAG) const {
9156 EVT VT = Op.getValueType();
9157 uint64_t BitWidth = VT.getSizeInBits();
9158 SDLoc dl(Op);
9159 assert(Op.getNumOperands() == 3 &&
9160 VT == Op.getOperand(1).getValueType() &&
9161 "Unexpected SHL!");
9162
9163 // Expand into a bunch of logical ops. Note that these ops
9164 // depend on the PPC behavior for oversized shift amounts.
9165 SDValue Lo = Op.getOperand(i: 0);
9166 SDValue Hi = Op.getOperand(i: 1);
9167 SDValue Amt = Op.getOperand(i: 2);
9168 EVT AmtVT = Amt.getValueType();
9169
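// OutHi = (Hi << Amt) | (Lo >> (BW - Amt)) | (Lo << (Amt - BW)) and
// OutLo = Lo << Amt; the last OutHi term covers Amt >= BW because PPC
// shifts by BW or more produce zero.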
9170 SDValue Tmp1 = DAG.getNode(Opcode: ISD::SUB, DL: dl, VT: AmtVT,
9171 N1: DAG.getConstant(Val: BitWidth, DL: dl, VT: AmtVT), N2: Amt);
9172 SDValue Tmp2 = DAG.getNode(Opcode: PPCISD::SHL, DL: dl, VT, N1: Hi, N2: Amt);
9173 SDValue Tmp3 = DAG.getNode(Opcode: PPCISD::SRL, DL: dl, VT, N1: Lo, N2: Tmp1);
9174 SDValue Tmp4 = DAG.getNode(Opcode: ISD::OR , DL: dl, VT, N1: Tmp2, N2: Tmp3);
9175 SDValue Tmp5 = DAG.getNode(Opcode: ISD::ADD, DL: dl, VT: AmtVT, N1: Amt,
9176 N2: DAG.getSignedConstant(Val: -BitWidth, DL: dl, VT: AmtVT));
9177 SDValue Tmp6 = DAG.getNode(Opcode: PPCISD::SHL, DL: dl, VT, N1: Lo, N2: Tmp5);
9178 SDValue OutHi = DAG.getNode(Opcode: ISD::OR, DL: dl, VT, N1: Tmp4, N2: Tmp6);
9179 SDValue OutLo = DAG.getNode(Opcode: PPCISD::SHL, DL: dl, VT, N1: Lo, N2: Amt);
9180 SDValue OutOps[] = { OutLo, OutHi };
9181 return DAG.getMergeValues(Ops: OutOps, dl);
9182}
9183
9184SDValue PPCTargetLowering::LowerSRL_PARTS(SDValue Op, SelectionDAG &DAG) const {
9185 EVT VT = Op.getValueType();
9186 SDLoc dl(Op);
9187 uint64_t BitWidth = VT.getSizeInBits();
9188 assert(Op.getNumOperands() == 3 &&
9189 VT == Op.getOperand(1).getValueType() &&
9190 "Unexpected SRL!");
9191
9192 // Expand into a bunch of logical ops. Note that these ops
9193 // depend on the PPC behavior for oversized shift amounts.
9194 SDValue Lo = Op.getOperand(i: 0);
9195 SDValue Hi = Op.getOperand(i: 1);
9196 SDValue Amt = Op.getOperand(i: 2);
9197 EVT AmtVT = Amt.getValueType();
9198
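// OutLo = (Lo >> Amt) | (Hi << (BW - Amt)) | (Hi >> (Amt - BW)) and
// OutHi = Hi >> Amt, again relying on oversized shifts producing zero.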
9199 SDValue Tmp1 = DAG.getNode(Opcode: ISD::SUB, DL: dl, VT: AmtVT,
9200 N1: DAG.getConstant(Val: BitWidth, DL: dl, VT: AmtVT), N2: Amt);
9201 SDValue Tmp2 = DAG.getNode(Opcode: PPCISD::SRL, DL: dl, VT, N1: Lo, N2: Amt);
9202 SDValue Tmp3 = DAG.getNode(Opcode: PPCISD::SHL, DL: dl, VT, N1: Hi, N2: Tmp1);
9203 SDValue Tmp4 = DAG.getNode(Opcode: ISD::OR, DL: dl, VT, N1: Tmp2, N2: Tmp3);
9204 SDValue Tmp5 = DAG.getNode(Opcode: ISD::ADD, DL: dl, VT: AmtVT, N1: Amt,
9205 N2: DAG.getSignedConstant(Val: -BitWidth, DL: dl, VT: AmtVT));
9206 SDValue Tmp6 = DAG.getNode(Opcode: PPCISD::SRL, DL: dl, VT, N1: Hi, N2: Tmp5);
9207 SDValue OutLo = DAG.getNode(Opcode: ISD::OR, DL: dl, VT, N1: Tmp4, N2: Tmp6);
9208 SDValue OutHi = DAG.getNode(Opcode: PPCISD::SRL, DL: dl, VT, N1: Hi, N2: Amt);
9209 SDValue OutOps[] = { OutLo, OutHi };
9210 return DAG.getMergeValues(Ops: OutOps, dl);
9211}
9212
9213SDValue PPCTargetLowering::LowerSRA_PARTS(SDValue Op, SelectionDAG &DAG) const {
9214 SDLoc dl(Op);
9215 EVT VT = Op.getValueType();
9216 uint64_t BitWidth = VT.getSizeInBits();
9217 assert(Op.getNumOperands() == 3 &&
9218 VT == Op.getOperand(1).getValueType() &&
9219 "Unexpected SRA!");
9220
9221 // Expand into a bunch of logical ops, followed by a select_cc.
9222 SDValue Lo = Op.getOperand(i: 0);
9223 SDValue Hi = Op.getOperand(i: 1);
9224 SDValue Amt = Op.getOperand(i: 2);
9225 EVT AmtVT = Amt.getValueType();
9226
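// OutHi = Hi >>s Amt. OutLo is (Lo >> Amt) | (Hi << (BW - Amt)) when
// Amt <= BW and Hi >>s (Amt - BW) otherwise, chosen by the select_cc on
// Amt - BW below.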
9227 SDValue Tmp1 = DAG.getNode(Opcode: ISD::SUB, DL: dl, VT: AmtVT,
9228 N1: DAG.getConstant(Val: BitWidth, DL: dl, VT: AmtVT), N2: Amt);
9229 SDValue Tmp2 = DAG.getNode(Opcode: PPCISD::SRL, DL: dl, VT, N1: Lo, N2: Amt);
9230 SDValue Tmp3 = DAG.getNode(Opcode: PPCISD::SHL, DL: dl, VT, N1: Hi, N2: Tmp1);
9231 SDValue Tmp4 = DAG.getNode(Opcode: ISD::OR, DL: dl, VT, N1: Tmp2, N2: Tmp3);
9232 SDValue Tmp5 = DAG.getNode(Opcode: ISD::ADD, DL: dl, VT: AmtVT, N1: Amt,
9233 N2: DAG.getSignedConstant(Val: -BitWidth, DL: dl, VT: AmtVT));
9234 SDValue Tmp6 = DAG.getNode(Opcode: PPCISD::SRA, DL: dl, VT, N1: Hi, N2: Tmp5);
9235 SDValue OutHi = DAG.getNode(Opcode: PPCISD::SRA, DL: dl, VT, N1: Hi, N2: Amt);
9236 SDValue OutLo = DAG.getSelectCC(DL: dl, LHS: Tmp5, RHS: DAG.getConstant(Val: 0, DL: dl, VT: AmtVT),
9237 True: Tmp4, False: Tmp6, Cond: ISD::SETLE);
9238 SDValue OutOps[] = { OutLo, OutHi };
9239 return DAG.getMergeValues(Ops: OutOps, dl);
9240}
9241
9242SDValue PPCTargetLowering::LowerFunnelShift(SDValue Op,
9243 SelectionDAG &DAG) const {
9244 SDLoc dl(Op);
9245 EVT VT = Op.getValueType();
9246 unsigned BitWidth = VT.getSizeInBits();
9247
9248 bool IsFSHL = Op.getOpcode() == ISD::FSHL;
9249 SDValue X = Op.getOperand(i: 0);
9250 SDValue Y = Op.getOperand(i: 1);
9251 SDValue Z = Op.getOperand(i: 2);
9252 EVT AmtVT = Z.getValueType();
9253
9254 // fshl: (X << (Z % BW)) | (Y >> (BW - (Z % BW)))
9255 // fshr: (X << (BW - (Z % BW))) | (Y >> (Z % BW))
9256 // This is simpler than TargetLowering::expandFunnelShift because we can rely
9257 // on PowerPC shift by BW being well defined.
9258 Z = DAG.getNode(Opcode: ISD::AND, DL: dl, VT: AmtVT, N1: Z,
9259 N2: DAG.getConstant(Val: BitWidth - 1, DL: dl, VT: AmtVT));
9260 SDValue SubZ =
9261 DAG.getNode(Opcode: ISD::SUB, DL: dl, VT: AmtVT, N1: DAG.getConstant(Val: BitWidth, DL: dl, VT: AmtVT), N2: Z);
9262 X = DAG.getNode(Opcode: PPCISD::SHL, DL: dl, VT, N1: X, N2: IsFSHL ? Z : SubZ);
9263 Y = DAG.getNode(Opcode: PPCISD::SRL, DL: dl, VT, N1: Y, N2: IsFSHL ? SubZ : Z);
9264 return DAG.getNode(Opcode: ISD::OR, DL: dl, VT, N1: X, N2: Y);
9265}
9266
9267//===----------------------------------------------------------------------===//
9268// Vector related lowering.
9269//
9270
9271/// getCanonicalConstSplat - Build a canonical splat immediate of Val with an
9272/// element size of SplatSize. Cast the result to VT.
9273static SDValue getCanonicalConstSplat(uint64_t Val, unsigned SplatSize, EVT VT,
9274 SelectionDAG &DAG, const SDLoc &dl) {
9275 static const MVT VTys[] = { // canonical VT to use for each size.
9276 MVT::v16i8, MVT::v8i16, MVT::Other, MVT::v4i32
9277 };
9278
9279 EVT ReqVT = VT != MVT::Other ? VT : VTys[SplatSize-1];
9280
9281 // For a splat with all ones, turn it into vspltisb 0xFF to canonicalize.
9282 if (Val == ((1LLU << (SplatSize * 8)) - 1)) {
9283 SplatSize = 1;
9284 Val = 0xFF;
9285 }
9286
9287 EVT CanonicalVT = VTys[SplatSize-1];
9288
9289 // Build a canonical splat for this value.
9290 // Explicitly truncate APInt here, as this API is used with a mix of
9291 // signed and unsigned values.
9292 return DAG.getBitcast(
9293 VT: ReqVT,
9294 V: DAG.getConstant(Val: APInt(64, Val).trunc(width: SplatSize * 8), DL: dl, VT: CanonicalVT));
9295}
9296
9297/// BuildIntrinsicOp - Return a unary operator intrinsic node with the
9298/// specified intrinsic ID.
9299static SDValue BuildIntrinsicOp(unsigned IID, SDValue Op, SelectionDAG &DAG,
9300 const SDLoc &dl, EVT DestVT = MVT::Other) {
9301 if (DestVT == MVT::Other) DestVT = Op.getValueType();
9302 return DAG.getNode(Opcode: ISD::INTRINSIC_WO_CHAIN, DL: dl, VT: DestVT,
9303 N1: DAG.getConstant(Val: IID, DL: dl, VT: MVT::i32), N2: Op);
9304}
9305
9306/// BuildIntrinsicOp - Return a binary operator intrinsic node with the
9307/// specified intrinsic ID.
9308static SDValue BuildIntrinsicOp(unsigned IID, SDValue LHS, SDValue RHS,
9309 SelectionDAG &DAG, const SDLoc &dl,
9310 EVT DestVT = MVT::Other) {
9311 if (DestVT == MVT::Other) DestVT = LHS.getValueType();
9312 return DAG.getNode(Opcode: ISD::INTRINSIC_WO_CHAIN, DL: dl, VT: DestVT,
9313 N1: DAG.getConstant(Val: IID, DL: dl, VT: MVT::i32), N2: LHS, N3: RHS);
9314}
9315
9316/// BuildIntrinsicOp - Return a ternary operator intrinsic node with the
9317/// specified intrinsic ID.
9318static SDValue BuildIntrinsicOp(unsigned IID, SDValue Op0, SDValue Op1,
9319 SDValue Op2, SelectionDAG &DAG, const SDLoc &dl,
9320 EVT DestVT = MVT::Other) {
9321 if (DestVT == MVT::Other) DestVT = Op0.getValueType();
9322 return DAG.getNode(Opcode: ISD::INTRINSIC_WO_CHAIN, DL: dl, VT: DestVT,
9323 N1: DAG.getConstant(Val: IID, DL: dl, VT: MVT::i32), N2: Op0, N3: Op1, N4: Op2);
9324}
9325
9326/// BuildVSLDOI - Return a VECTOR_SHUFFLE that is a vsldoi of the specified
9327/// amount. The result has the specified value type.
9328static SDValue BuildVSLDOI(SDValue LHS, SDValue RHS, unsigned Amt, EVT VT,
9329 SelectionDAG &DAG, const SDLoc &dl) {
9330 // Force LHS/RHS to be the right type.
9331 LHS = DAG.getNode(Opcode: ISD::BITCAST, DL: dl, VT: MVT::v16i8, Operand: LHS);
9332 RHS = DAG.getNode(Opcode: ISD::BITCAST, DL: dl, VT: MVT::v16i8, Operand: RHS);
9333
9334 int Ops[16];
9335 for (unsigned i = 0; i != 16; ++i)
9336 Ops[i] = i + Amt;
9337 SDValue T = DAG.getVectorShuffle(VT: MVT::v16i8, dl, N1: LHS, N2: RHS, Mask: Ops);
9338 return DAG.getNode(Opcode: ISD::BITCAST, DL: dl, VT, Operand: T);
9339}
9340
9341/// Do we have an efficient pattern in a .td file for this node?
9342///
9343/// \param V - pointer to the BuildVectorSDNode being matched
9344/// \param HasDirectMove - does this subtarget have VSR <-> GPR direct moves?
9345///
9346/// There are some patterns where it is beneficial to keep a BUILD_VECTOR
9347/// node as a BUILD_VECTOR node rather than expanding it. The patterns where
9348/// the opposite is true (expansion is beneficial) are:
9349/// - The node builds a vector out of integers that are not 32 or 64-bits
9350/// - The node builds a vector out of constants
9351/// - The node is a "load-and-splat"
9352/// In all other cases, we will choose to keep the BUILD_VECTOR.
9353static bool haveEfficientBuildVectorPattern(BuildVectorSDNode *V,
9354 bool HasDirectMove,
9355 bool HasP8Vector) {
9356 EVT VecVT = V->getValueType(ResNo: 0);
9357 bool RightType = VecVT == MVT::v2f64 ||
9358 (HasP8Vector && VecVT == MVT::v4f32) ||
9359 (HasDirectMove && (VecVT == MVT::v2i64 || VecVT == MVT::v4i32));
9360 if (!RightType)
9361 return false;
9362
9363 bool IsSplat = true;
9364 bool IsLoad = false;
9365 SDValue Op0 = V->getOperand(Num: 0);
9366
9367 // This function is called in a block that confirms the node is not a constant
9368 // splat. So a constant BUILD_VECTOR here means the vector is built out of
9369 // different constants.
9370 if (V->isConstant())
9371 return false;
9372 for (int i = 0, e = V->getNumOperands(); i < e; ++i) {
9373 if (V->getOperand(Num: i).isUndef())
9374 return false;
9375 // We want to expand nodes that represent load-and-splat even if the
9376 // loaded value is a floating point truncation or conversion to int.
9377 if (V->getOperand(Num: i).getOpcode() == ISD::LOAD ||
9378 (V->getOperand(Num: i).getOpcode() == ISD::FP_ROUND &&
9379 V->getOperand(Num: i).getOperand(i: 0).getOpcode() == ISD::LOAD) ||
9380 (V->getOperand(Num: i).getOpcode() == ISD::FP_TO_SINT &&
9381 V->getOperand(Num: i).getOperand(i: 0).getOpcode() == ISD::LOAD) ||
9382 (V->getOperand(Num: i).getOpcode() == ISD::FP_TO_UINT &&
9383 V->getOperand(Num: i).getOperand(i: 0).getOpcode() == ISD::LOAD))
9384 IsLoad = true;
9385 // If the operands are different or the input is not a load and has more
9386 // uses than just this BV node, then it isn't a splat.
9387 if (V->getOperand(Num: i) != Op0 ||
9388 (!IsLoad && !V->isOnlyUserOf(N: V->getOperand(Num: i).getNode())))
9389 IsSplat = false;
9390 }
9391 return !(IsSplat && IsLoad);
9392}
9393
9394// Lower BITCAST(f128, (build_pair i64, i64)) to BUILD_FP128.
9395SDValue PPCTargetLowering::LowerBITCAST(SDValue Op, SelectionDAG &DAG) const {
9396
9397 SDLoc dl(Op);
9398 SDValue Op0 = Op->getOperand(Num: 0);
9399
9400 if (!Subtarget.isPPC64() || (Op0.getOpcode() != ISD::BUILD_PAIR) ||
9401 (Op.getValueType() != MVT::f128))
9402 return SDValue();
9403
9404 SDValue Lo = Op0.getOperand(i: 0);
9405 SDValue Hi = Op0.getOperand(i: 1);
9406 if ((Lo.getValueType() != MVT::i64) || (Hi.getValueType() != MVT::i64))
9407 return SDValue();
9408
9409 if (!Subtarget.isLittleEndian())
9410 std::swap(a&: Lo, b&: Hi);
9411
9412 return DAG.getNode(Opcode: PPCISD::BUILD_FP128, DL: dl, VT: MVT::f128, N1: Lo, N2: Hi);
9413}
9414
9415static const SDValue *getNormalLoadInput(const SDValue &Op, bool &IsPermuted) {
9416 const SDValue *InputLoad = &Op;
9417 while (InputLoad->getOpcode() == ISD::BITCAST)
9418 InputLoad = &InputLoad->getOperand(i: 0);
9419 if (InputLoad->getOpcode() == ISD::SCALAR_TO_VECTOR ||
9420 InputLoad->getOpcode() == PPCISD::SCALAR_TO_VECTOR_PERMUTED) {
9421 IsPermuted = InputLoad->getOpcode() == PPCISD::SCALAR_TO_VECTOR_PERMUTED;
9422 InputLoad = &InputLoad->getOperand(i: 0);
9423 }
9424 if (InputLoad->getOpcode() != ISD::LOAD)
9425 return nullptr;
9426 LoadSDNode *LD = cast<LoadSDNode>(Val: *InputLoad);
9427 return ISD::isNormalLoad(N: LD) ? InputLoad : nullptr;
9428}
9429
9430// Convert the argument APFloat to a single precision APFloat if there is no
9431// loss in information during the conversion to single precision APFloat and the
9432// resulting number is not a denormal number. Return true if successful.
9433bool llvm::convertToNonDenormSingle(APFloat &ArgAPFloat) {
9434 APFloat APFloatToConvert = ArgAPFloat;
9435 bool LosesInfo = true;
9436 APFloatToConvert.convert(ToSemantics: APFloat::IEEEsingle(), RM: APFloat::rmNearestTiesToEven,
9437 losesInfo: &LosesInfo);
9438 bool Success = (!LosesInfo && !APFloatToConvert.isDenormal());
9439 if (Success)
9440 ArgAPFloat = APFloatToConvert;
9441 return Success;
9442}
9443
9444// Bitcast the argument APInt to a double and convert it to a single precision
9445// APFloat, bitcast the APFloat to an APInt and assign it to the original
9446// argument if there is no loss in information during the conversion from
9447// double to single precision APFloat and the resulting number is not a denormal
9448// number. Return true if successful.
9449bool llvm::convertToNonDenormSingle(APInt &ArgAPInt) {
9450 double DpValue = ArgAPInt.bitsToDouble();
9451 APFloat APFloatDp(DpValue);
9452 bool Success = convertToNonDenormSingle(ArgAPFloat&: APFloatDp);
9453 if (Success)
9454 ArgAPInt = APFloatDp.bitcastToAPInt();
9455 return Success;
9456}
9457
9458 // Nondestructive check for convertToNonDenormSingle.
9459bool llvm::checkConvertToNonDenormSingle(APFloat &ArgAPFloat) {
9460 // Only convert if it loses info, since XXSPLTIDP should
9461 // handle the other case.
9462 APFloat APFloatToConvert = ArgAPFloat;
9463 bool LosesInfo = true;
9464 APFloatToConvert.convert(ToSemantics: APFloat::IEEEsingle(), RM: APFloat::rmNearestTiesToEven,
9465 losesInfo: &LosesInfo);
9466
9467 return (!LosesInfo && !APFloatToConvert.isDenormal());
9468}
9469
9470static bool isValidSplatLoad(const PPCSubtarget &Subtarget, const SDValue &Op,
9471 unsigned &Opcode) {
9472 LoadSDNode *InputNode = dyn_cast<LoadSDNode>(Val: Op.getOperand(i: 0));
9473 if (!InputNode || !Subtarget.hasVSX() || !ISD::isUNINDEXEDLoad(N: InputNode))
9474 return false;
9475
9476 EVT Ty = Op->getValueType(ResNo: 0);
9477 // For v2f64, v4f32 and v4i32 types, we require the load to be non-extending
9478 // as we cannot handle extending loads for these types.
9479 if ((Ty == MVT::v2f64 || Ty == MVT::v4f32 || Ty == MVT::v4i32) &&
9480 ISD::isNON_EXTLoad(N: InputNode))
9481 return true;
9482
9483 EVT MemVT = InputNode->getMemoryVT();
9484 // For v8i16 and v16i8 types, extending loads can be handled as long as the
9485 // memory VT is the same vector element VT type.
9486 // The loads feeding into the v8i16 and v16i8 types will be extending because
9487 // scalar i8/i16 are not legal types.
9488 if ((Ty == MVT::v8i16 || Ty == MVT::v16i8) && ISD::isEXTLoad(N: InputNode) &&
9489 (MemVT == Ty.getVectorElementType()))
9490 return true;
9491
9492 if (Ty == MVT::v2i64) {
9493 // Check the extend type, when the input type is i32, and the output vector
9494 // type is v2i64.
9495 if (MemVT == MVT::i32) {
9496 if (ISD::isZEXTLoad(N: InputNode))
9497 Opcode = PPCISD::ZEXT_LD_SPLAT;
9498 if (ISD::isSEXTLoad(N: InputNode))
9499 Opcode = PPCISD::SEXT_LD_SPLAT;
9500 }
9501 return true;
9502 }
9503 return false;
9504}
9505
9506bool isValidMtVsrBmi(APInt &BitMask, BuildVectorSDNode &BVN,
9507 bool IsLittleEndian) {
9508 assert(BVN.getNumOperands() > 0 && "Unexpected 0-size build vector");
9509
9510 BitMask.clearAllBits();
9511 EVT VT = BVN.getValueType(ResNo: 0);
9512 unsigned VTSize = VT.getSizeInBits();
9513 APInt ConstValue(VTSize, 0);
9514
9515 unsigned EltWidth = VT.getScalarSizeInBits();
9516
9517 unsigned BitPos = 0;
9518 for (auto OpVal : BVN.op_values()) {
9519 auto *CN = dyn_cast<ConstantSDNode>(Val&: OpVal);
9520
9521 if (!CN)
9522 return false;
9523 // The elements in a vector register are ordered in reverse byte order
9524 // between little-endian and big-endian modes.
9525 ConstValue.insertBits(SubBits: CN->getAPIntValue().zextOrTrunc(width: EltWidth),
9526 bitPosition: IsLittleEndian ? BitPos : VTSize - EltWidth - BitPos);
9527 BitPos += EltWidth;
9528 }
9529
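// MTVSRBMI can only materialize vectors whose bytes are all 0x00 or all
// 0xFF; record a mask bit for every 0xFF byte and reject anything else.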
9530 for (unsigned J = 0; J < 16; ++J) {
9531 APInt ExtractValue = ConstValue.extractBits(numBits: 8, bitPosition: J * 8);
9532 if (ExtractValue != 0x00 && ExtractValue != 0xFF)
9533 return false;
9534 if (ExtractValue == 0xFF)
9535 BitMask.setBit(J);
9536 }
9537 return true;
9538}
9539
9540// If this is a case we can't handle, return null and let the default
9541// expansion code take care of it. If we CAN select this case, and if it
9542// selects to a single instruction, return Op. Otherwise, if we can codegen
9543// this case more efficiently than a constant pool load, lower it to the
9544// sequence of ops that should be used.
9545SDValue PPCTargetLowering::LowerBUILD_VECTOR(SDValue Op,
9546 SelectionDAG &DAG) const {
9547 SDLoc dl(Op);
9548 BuildVectorSDNode *BVN = dyn_cast<BuildVectorSDNode>(Val: Op.getNode());
9549 assert(BVN && "Expected a BuildVectorSDNode in LowerBUILD_VECTOR");
9550
9551 if (Subtarget.hasP10Vector()) {
9552 APInt BitMask(32, 0);
9553 // If the value of the vector is all zeros or all ones,
9554 // we do not convert it to MTVSRBMI:
9555 // the xxleqv instruction already sets a vector to all ones, and
9556 // the xxlxor instruction sets a vector to all zeros.
9557 if (isValidMtVsrBmi(BitMask, BVN&: *BVN, IsLittleEndian: Subtarget.isLittleEndian()) &&
9558 BitMask != 0 && BitMask != 0xffff) {
9559 SDValue SDConstant = DAG.getTargetConstant(Val: BitMask, DL: dl, VT: MVT::i32);
9560 MachineSDNode *MSDNode =
9561 DAG.getMachineNode(Opcode: PPC::MTVSRBMI, dl, VT: MVT::v16i8, Op1: SDConstant);
9562 SDValue SDV = SDValue(MSDNode, 0);
9563 EVT DVT = BVN->getValueType(ResNo: 0);
9564 EVT SVT = SDV.getValueType();
9565 if (SVT != DVT) {
9566 SDV = DAG.getNode(Opcode: ISD::BITCAST, DL: dl, VT: DVT, Operand: SDV);
9567 }
9568 return SDV;
9569 }
9570 // Recognize build vector patterns to emit VSX vector instructions
9571 // instead of loading value from memory.
9572 if (SDValue VecPat = combineBVLoadsSpecialValue(Operand: Op, DAG))
9573 return VecPat;
9574 }
9575 // Check if this is a splat of a constant value.
9576 APInt APSplatBits, APSplatUndef;
9577 unsigned SplatBitSize;
9578 bool HasAnyUndefs;
9579 bool BVNIsConstantSplat =
9580 BVN->isConstantSplat(SplatValue&: APSplatBits, SplatUndef&: APSplatUndef, SplatBitSize,
9581 HasAnyUndefs, MinSplatBits: 0, isBigEndian: !Subtarget.isLittleEndian());
9582
9583 // If it is a splat of a double, check if we can shrink it to a 32-bit
9584 // non-denormal float which, when converted back to double, gives us the
9585 // same double. This is to exploit the XXSPLTIDP instruction.
9586 // If we lose precision, we use XXSPLTI32DX.
9587 if (BVNIsConstantSplat && (SplatBitSize == 64) &&
9588 Subtarget.hasPrefixInstrs() && Subtarget.hasP10Vector()) {
9589 // Check the type first to short-circuit so we don't modify APSplatBits if
9590 // this block isn't executed.
9591 if ((Op->getValueType(ResNo: 0) == MVT::v2f64) &&
9592 convertToNonDenormSingle(ArgAPInt&: APSplatBits)) {
9593 SDValue SplatNode = DAG.getNode(
9594 Opcode: PPCISD::XXSPLTI_SP_TO_DP, DL: dl, VT: MVT::v2f64,
9595 Operand: DAG.getTargetConstant(Val: APSplatBits.getZExtValue(), DL: dl, VT: MVT::i32));
9596 return DAG.getBitcast(VT: Op.getValueType(), V: SplatNode);
9597 } else {
9598 // We may lose precision, so we have to use XXSPLTI32DX.
9599
9600 uint32_t Hi = Hi_32(Value: APSplatBits.getZExtValue());
9601 uint32_t Lo = Lo_32(Value: APSplatBits.getZExtValue());
9602 SDValue SplatNode = DAG.getUNDEF(VT: MVT::v2i64);
9603
9604 if (!Hi || !Lo)
9605 // If either half is 0, then we should generate XXLXOR to set to 0.
9606 SplatNode = DAG.getTargetConstant(Val: 0, DL: dl, VT: MVT::v2i64);
9607
9608 if (Hi)
9609 SplatNode = DAG.getNode(
9610 Opcode: PPCISD::XXSPLTI32DX, DL: dl, VT: MVT::v2i64, N1: SplatNode,
9611 N2: DAG.getTargetConstant(Val: 0, DL: dl, VT: MVT::i32),
9612 N3: DAG.getTargetConstant(Val: Hi, DL: dl, VT: MVT::i32));
9613
9614 if (Lo)
9615 SplatNode =
9616 DAG.getNode(Opcode: PPCISD::XXSPLTI32DX, DL: dl, VT: MVT::v2i64, N1: SplatNode,
9617 N2: DAG.getTargetConstant(Val: 1, DL: dl, VT: MVT::i32),
9618 N3: DAG.getTargetConstant(Val: Lo, DL: dl, VT: MVT::i32));
9619
9620 return DAG.getBitcast(VT: Op.getValueType(), V: SplatNode);
9621 }
9622 }
9623
9624 bool IsSplat64 = false;
9625 uint64_t SplatBits = 0;
9626 int32_t SextVal = 0;
9627 if (BVNIsConstantSplat && SplatBitSize <= 64) {
9628 SplatBits = APSplatBits.getZExtValue();
9629 if (SplatBitSize <= 32) {
9630 SextVal = SignExtend32(X: SplatBits, B: SplatBitSize);
9631 } else if (SplatBitSize == 64 && Subtarget.hasP8Altivec()) {
9632 int64_t Splat64Val = static_cast<int64_t>(SplatBits);
9633 bool P9Vector = Subtarget.hasP9Vector();
9634 int32_t Hi = P9Vector ? 127 : 15;
9635 int32_t Lo = P9Vector ? -128 : -16;
9636 IsSplat64 = Splat64Val >= Lo && Splat64Val <= Hi;
9637 SextVal = static_cast<int32_t>(SplatBits);
9638 }
9639 }
9640
9641 if (!BVNIsConstantSplat || (SplatBitSize > 32 && !IsSplat64)) {
9642 unsigned NewOpcode = PPCISD::LD_SPLAT;
9643
9644 // Handle load-and-splat patterns as we have instructions that will do this
9645 // in one go.
9646 if (DAG.isSplatValue(V: Op, AllowUndefs: true) &&
9647 isValidSplatLoad(Subtarget, Op, Opcode&: NewOpcode)) {
9648 const SDValue *InputLoad = &Op.getOperand(i: 0);
9649 LoadSDNode *LD = cast<LoadSDNode>(Val: *InputLoad);
9650
9651 // If the input load is an extending load, it will be an i32 -> i64
9652 // extending load and isValidSplatLoad() will update NewOpcode.
9653 unsigned MemorySize = LD->getMemoryVT().getScalarSizeInBits();
9654 unsigned ElementSize =
9655 MemorySize * ((NewOpcode == PPCISD::LD_SPLAT) ? 1 : 2);
9656
9657 assert(((ElementSize == 2 * MemorySize)
9658 ? (NewOpcode == PPCISD::ZEXT_LD_SPLAT ||
9659 NewOpcode == PPCISD::SEXT_LD_SPLAT)
9660 : (NewOpcode == PPCISD::LD_SPLAT)) &&
9661 "Unmatched element size and opcode!\n");
9662
9663 // Checking for a single use of this load, we have to check for vector
9664 // width (128 bits) / ElementSize uses (since each operand of the
9665 // BUILD_VECTOR is a separate use of the value).
9666 unsigned NumUsesOfInputLD = 128 / ElementSize;
9667 for (SDValue BVInOp : Op->ops())
9668 if (BVInOp.isUndef())
9669 NumUsesOfInputLD--;
9670
9671 // Exclude some cases where LD_SPLAT is worse than scalar_to_vector:
9672 // The cases below should also apply to "lfiwzx/lfiwax + LE target + index
9673 // 1", "lxvrhx + BE target + index 7", and "lxvrbx + BE target + index
9674 // 15", but isValidSplatLoad() currently returns true only when the data
9675 // at index 0 is a valid (non-null) load, so we will not get into trouble
9676 // for these cases.
9677 //
9678 // case 1 - lfiwzx/lfiwax
9679 // 1.1: load result is i32 and is sign/zero extend to i64;
9680 // 1.2: build a v2i64 vector type with above loaded value;
9681 // 1.3: the vector has only one value at index 0, others are all undef;
9682 // 1.4: on BE target, so that lfiwzx/lfiwax does not need any permute.
9683 if (NumUsesOfInputLD == 1 &&
9684 (Op->getValueType(ResNo: 0) == MVT::v2i64 && NewOpcode != PPCISD::LD_SPLAT &&
9685 !Subtarget.isLittleEndian() && Subtarget.hasVSX() &&
9686 Subtarget.hasLFIWAX()))
9687 return SDValue();
9688
9689 // case 2 - lxvr[hb]x
9690 // 2.1: load result is at most i16;
9691 // 2.2: build a vector with above loaded value;
9692 // 2.3: the vector has only one value at index 0, others are all undef;
9693 // 2.4: on LE target, so that lxvr[hb]x does not need any permute.
9694 if (NumUsesOfInputLD == 1 && Subtarget.isLittleEndian() &&
9695 Subtarget.isISA3_1() && ElementSize <= 16)
9696 return SDValue();
9697
9698 assert(NumUsesOfInputLD > 0 && "No uses of input LD of a build_vector?");
9699 if (InputLoad->getNode()->hasNUsesOfValue(NUses: NumUsesOfInputLD, Value: 0) &&
9700 Subtarget.hasVSX()) {
9701 SDValue Ops[] = {
9702 LD->getChain(), // Chain
9703 LD->getBasePtr(), // Ptr
9704 DAG.getValueType(Op.getValueType()) // VT
9705 };
9706 SDValue LdSplt = DAG.getMemIntrinsicNode(
9707 Opcode: NewOpcode, dl, VTList: DAG.getVTList(VT1: Op.getValueType(), VT2: MVT::Other), Ops,
9708 MemVT: LD->getMemoryVT(), MMO: LD->getMemOperand());
9709 // Replace all uses of the output chain of the original load with the
9710 // output chain of the new load.
9711 DAG.ReplaceAllUsesOfValueWith(From: InputLoad->getValue(R: 1),
9712 To: LdSplt.getValue(R: 1));
9713 return LdSplt;
9714 }
9715 }
9716
9717 // In 64-bit mode, BUILD_VECTOR nodes that are not constant splats of up to
9718 // 32-bits can be lowered to VSX instructions under certain conditions.
9719 // Without VSX, there is no pattern more efficient than expanding the node.
9720 if (Subtarget.hasVSX() && Subtarget.isPPC64() &&
9721 haveEfficientBuildVectorPattern(V: BVN, HasDirectMove: Subtarget.hasDirectMove(),
9722 HasP8Vector: Subtarget.hasP8Vector()))
9723 return Op;
9724 return SDValue();
9725 }
9726
9727 uint64_t SplatUndef = APSplatUndef.getZExtValue();
9728 unsigned SplatSize = SplatBitSize / 8;
9729
9730 // First, handle single instruction cases.
9731
9732 // All zeros?
9733 if (SplatBits == 0) {
9734 // Canonicalize all zero vectors to be v4i32.
9735 if (Op.getValueType() != MVT::v4i32 || HasAnyUndefs) {
9736 SDValue Z = DAG.getConstant(Val: 0, DL: dl, VT: MVT::v4i32);
9737 Op = DAG.getNode(Opcode: ISD::BITCAST, DL: dl, VT: Op.getValueType(), Operand: Z);
9738 }
9739 return Op;
9740 }
9741
9742 // We have XXSPLTIW for constant splats four bytes wide.
9743 // Since the vector length is a multiple of 4 bytes, 2-byte splats can be
9744 // replaced with 4-byte splats. We replicate the SplatBits in the case of a
9745 // 2-byte splat to make a 4-byte splat element. For example: a 2-byte splat
9746 // of 0xABAB can be turned into a 4-byte splat of 0xABABABAB.
9747 if (Subtarget.hasPrefixInstrs() && Subtarget.hasP10Vector() && SplatSize == 2)
9748 return getCanonicalConstSplat(Val: SplatBits | (SplatBits << 16), SplatSize: SplatSize * 2,
9749 VT: Op.getValueType(), DAG, dl);
9750
9751 if (Subtarget.hasPrefixInstrs() && Subtarget.hasP10Vector() && SplatSize == 4)
9752 return getCanonicalConstSplat(Val: SplatBits, SplatSize, VT: Op.getValueType(), DAG,
9753 dl);
9754
9755 // We have XXSPLTIB for constant splats one byte wide.
9756 if (Subtarget.hasP9Vector() && SplatSize == 1)
9757 return getCanonicalConstSplat(Val: SplatBits, SplatSize, VT: Op.getValueType(), DAG,
9758 dl);
9759
9760 // If the sign extended value is in the range [-16,15], use VSPLTI[bhw].
9761 // Use VSPLTIW/VUPKLSW for v2i64 in range [-16,15].
9762 if (SextVal >= -16 && SextVal <= 15) {
9763 // SplatSize may be 1, 2, 4, or 8. For size 8, splat as a word (size 4)
9764 // and then sign-extend the word to a doubleword with vupklsw.
9765 unsigned UseSize = SplatSize == 8 ? 4 : SplatSize;
9766 SDValue Res =
9767 getCanonicalConstSplat(Val: SextVal, SplatSize: UseSize, VT: Op.getValueType(), DAG, dl);
9768 if (SplatSize != 8)
9769 return Res;
9770 return BuildIntrinsicOp(IID: Intrinsic::ppc_altivec_vupklsw, Op: Res, DAG, dl);
9771 }
9772
9773 // Two instruction sequences.
9774
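// With P9 vector instructions, any splat of a value in [-128, 127] can be
// built as a byte splat (xxspltib) followed by a sign-extending unpack or
// extend (vupklsb/vextsb2w/vextsb2d) to the element width.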
9775 if (Subtarget.hasP9Vector() && SextVal >= -128 && SextVal <= 127) {
9776 SDValue C = DAG.getConstant(Val: (unsigned char)SextVal, DL: dl, VT: MVT::i32);
9777 SmallVector<SDValue, 16> Ops(16, C);
9778 SDValue BV = DAG.getBuildVector(VT: MVT::v16i8, DL: dl, Ops);
9779 unsigned IID;
9780 EVT VT;
9781 switch (SplatSize) {
9782 default:
9783 llvm_unreachable("Unexpected type for vector constant.");
9784 case 2:
9785 IID = Intrinsic::ppc_altivec_vupklsb;
9786 VT = MVT::v8i16;
9787 break;
9788 case 4:
9789 IID = Intrinsic::ppc_altivec_vextsb2w;
9790 VT = MVT::v4i32;
9791 break;
9792 case 8:
9793 IID = Intrinsic::ppc_altivec_vextsb2d;
9794 VT = MVT::v2i64;
9795 break;
9796 }
9797 SDValue Extend = BuildIntrinsicOp(IID, Op: BV, DAG, dl, DestVT: VT);
9798 return DAG.getBitcast(VT: Op->getValueType(ResNo: 0), V: Extend);
9799 }
9800 assert(!IsSplat64 && "Unhandled 64-bit splat pattern");
9801
9802 // If this value is in the range [-32,30] and is even, use:
9803 // VSPLTI[bhw](val/2) + VSPLTI[bhw](val/2)
9804 // If this value is in the range [17,31] and is odd, use:
9805 // VSPLTI[bhw](val-16) - VSPLTI[bhw](-16)
9806 // If this value is in the range [-31,-17] and is odd, use:
9807 // VSPLTI[bhw](val+16) + VSPLTI[bhw](-16)
9808 // Note the last two are three-instruction sequences.
9809 if (SextVal >= -32 && SextVal <= 31) {
9810 // To avoid having these optimizations undone by constant folding,
9811 // we convert to a pseudo that will be expanded later into one of
9812 // the above forms.
9813 SDValue Elt = DAG.getSignedConstant(Val: SextVal, DL: dl, VT: MVT::i32);
9814 EVT VT = (SplatSize == 1 ? MVT::v16i8 :
9815 (SplatSize == 2 ? MVT::v8i16 : MVT::v4i32));
9816 SDValue EltSize = DAG.getConstant(Val: SplatSize, DL: dl, VT: MVT::i32);
9817 SDValue RetVal = DAG.getNode(Opcode: PPCISD::VADD_SPLAT, DL: dl, VT, N1: Elt, N2: EltSize);
9818 if (VT == Op.getValueType())
9819 return RetVal;
9820 else
9821 return DAG.getNode(Opcode: ISD::BITCAST, DL: dl, VT: Op.getValueType(), Operand: RetVal);
9822 }
9823
9824 // If this is 0x8000_0000 x 4, turn into vspltisw + vslw. If it is
9825 // 0x7FFF_FFFF x 4, turn it into not(0x8000_0000). This is important
9826 // for fneg/fabs.
9827 if (SplatSize == 4 && SplatBits == (0x7FFFFFFF&~SplatUndef)) {
9828 // Make a -1 splat with vspltisw -1:
9829 SDValue OnesV = getCanonicalConstSplat(Val: -1, SplatSize: 4, VT: MVT::v4i32, DAG, dl);
9830
9831 // Make the VSLW intrinsic, computing 0x8000_0000.
9832 SDValue Res = BuildIntrinsicOp(IID: Intrinsic::ppc_altivec_vslw, LHS: OnesV,
9833 RHS: OnesV, DAG, dl);
9834
9835 // xor by OnesV to invert it.
9836 Res = DAG.getNode(Opcode: ISD::XOR, DL: dl, VT: MVT::v4i32, N1: Res, N2: OnesV);
9837 return DAG.getNode(Opcode: ISD::BITCAST, DL: dl, VT: Op.getValueType(), Operand: Res);
9838 }
9839
9840 // Check to see if this is a wide variety of vsplti*, binop self cases.
9841 static const signed char SplatCsts[] = {
9842 -1, 1, -2, 2, -3, 3, -4, 4, -5, 5, -6, 6, -7, 7,
9843 -8, 8, -9, 9, -10, 10, -11, 11, -12, 12, -13, 13, 14, -14, 15, -15, -16
9844 };
9845
9846 for (unsigned idx = 0; idx < std::size(SplatCsts); ++idx) {
9847 // Indirect through the SplatCsts array so that we favor 'vsplti -1' for
9848 // cases which are ambiguous (e.g. formation of 0x8000_0000).
9849 int i = SplatCsts[idx];
9850
9851 // Figure out what shift amount will be used by altivec if shifted by i in
9852 // this splat size.
9853 unsigned TypeShiftAmt = i & (SplatBitSize-1);
9854
9855 // vsplti + shl self.
9856 if (SextVal == (int)((unsigned)i << TypeShiftAmt)) {
9857 SDValue Res = getCanonicalConstSplat(Val: i, SplatSize, VT: MVT::Other, DAG, dl);
9858 static const unsigned IIDs[] = { // Intrinsic to use for each size.
9859 Intrinsic::ppc_altivec_vslb, Intrinsic::ppc_altivec_vslh, 0,
9860 Intrinsic::ppc_altivec_vslw
9861 };
9862 Res = BuildIntrinsicOp(IID: IIDs[SplatSize-1], LHS: Res, RHS: Res, DAG, dl);
9863 return DAG.getNode(Opcode: ISD::BITCAST, DL: dl, VT: Op.getValueType(), Operand: Res);
9864 }
9865
9866 // vsplti + srl self.
9867 if (SextVal == (int)((unsigned)i >> TypeShiftAmt)) {
9868 SDValue Res = getCanonicalConstSplat(Val: i, SplatSize, VT: MVT::Other, DAG, dl);
9869 static const unsigned IIDs[] = { // Intrinsic to use for each size.
9870 Intrinsic::ppc_altivec_vsrb, Intrinsic::ppc_altivec_vsrh, 0,
9871 Intrinsic::ppc_altivec_vsrw
9872 };
9873 Res = BuildIntrinsicOp(IID: IIDs[SplatSize-1], LHS: Res, RHS: Res, DAG, dl);
9874 return DAG.getNode(Opcode: ISD::BITCAST, DL: dl, VT: Op.getValueType(), Operand: Res);
9875 }
9876
9877 // vsplti + rol self.
9878 if (SextVal == (int)(((unsigned)i << TypeShiftAmt) |
9879 ((unsigned)i >> (SplatBitSize-TypeShiftAmt)))) {
9880 SDValue Res = getCanonicalConstSplat(Val: i, SplatSize, VT: MVT::Other, DAG, dl);
9881 static const unsigned IIDs[] = { // Intrinsic to use for each size.
9882 Intrinsic::ppc_altivec_vrlb, Intrinsic::ppc_altivec_vrlh, 0,
9883 Intrinsic::ppc_altivec_vrlw
9884 };
9885 Res = BuildIntrinsicOp(IID: IIDs[SplatSize-1], LHS: Res, RHS: Res, DAG, dl);
9886 return DAG.getNode(Opcode: ISD::BITCAST, DL: dl, VT: Op.getValueType(), Operand: Res);
9887 }
9888
9889 // t = vsplti c, result = vsldoi t, t, 1
9890 if (SextVal == (int)(((unsigned)i << 8) | (i < 0 ? 0xFF : 0))) {
9891 SDValue T = getCanonicalConstSplat(Val: i, SplatSize, VT: MVT::v16i8, DAG, dl);
9892 unsigned Amt = Subtarget.isLittleEndian() ? 15 : 1;
9893 return BuildVSLDOI(LHS: T, RHS: T, Amt, VT: Op.getValueType(), DAG, dl);
9894 }
9895 // t = vsplti c, result = vsldoi t, t, 2
9896 if (SextVal == (int)(((unsigned)i << 16) | (i < 0 ? 0xFFFF : 0))) {
9897 SDValue T = getCanonicalConstSplat(Val: i, SplatSize, VT: MVT::v16i8, DAG, dl);
9898 unsigned Amt = Subtarget.isLittleEndian() ? 14 : 2;
9899 return BuildVSLDOI(LHS: T, RHS: T, Amt, VT: Op.getValueType(), DAG, dl);
9900 }
9901 // t = vsplti c, result = vsldoi t, t, 3
9902 if (SextVal == (int)(((unsigned)i << 24) | (i < 0 ? 0xFFFFFF : 0))) {
9903 SDValue T = getCanonicalConstSplat(Val: i, SplatSize, VT: MVT::v16i8, DAG, dl);
9904 unsigned Amt = Subtarget.isLittleEndian() ? 13 : 3;
9905 return BuildVSLDOI(LHS: T, RHS: T, Amt, VT: Op.getValueType(), DAG, dl);
9906 }
9907 }
9908
9909 return SDValue();
9910}
9911
9912/// GeneratePerfectShuffle - Given an entry in the perfect-shuffle table, emit
9913/// the specified operations to build the shuffle.
9914static SDValue GeneratePerfectShuffle(unsigned PFEntry, SDValue LHS,
9915 SDValue RHS, SelectionDAG &DAG,
9916 const SDLoc &dl) {
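// Each perfect-shuffle table entry packs an opcode in bits [29:26] and the
// left/right operand sub-shuffle IDs in bits [25:13] and [12:0].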
9917 unsigned OpNum = (PFEntry >> 26) & 0x0F;
9918 unsigned LHSID = (PFEntry >> 13) & ((1 << 13)-1);
9919 unsigned RHSID = (PFEntry >> 0) & ((1 << 13)-1);
9920
9921 enum {
9922 OP_COPY = 0, // Copy, used for things like <u,u,u,3> to say it is <0,1,2,3>
9923 OP_VMRGHW,
9924 OP_VMRGLW,
9925 OP_VSPLTISW0,
9926 OP_VSPLTISW1,
9927 OP_VSPLTISW2,
9928 OP_VSPLTISW3,
9929 OP_VSLDOI4,
9930 OP_VSLDOI8,
9931 OP_VSLDOI12
9932 };
9933
9934 if (OpNum == OP_COPY) {
9935 if (LHSID == (1*9+2)*9+3) return LHS;
9936 assert(LHSID == ((4*9+5)*9+6)*9+7 && "Illegal OP_COPY!");
9937 return RHS;
9938 }
9939
9940 SDValue OpLHS, OpRHS;
9941 OpLHS = GeneratePerfectShuffle(PFEntry: PerfectShuffleTable[LHSID], LHS, RHS, DAG, dl);
9942 OpRHS = GeneratePerfectShuffle(PFEntry: PerfectShuffleTable[RHSID], LHS, RHS, DAG, dl);
9943
9944 int ShufIdxs[16];
9945 switch (OpNum) {
9946 default: llvm_unreachable("Unknown i32 permute!");
9947 case OP_VMRGHW:
9948 ShufIdxs[ 0] = 0; ShufIdxs[ 1] = 1; ShufIdxs[ 2] = 2; ShufIdxs[ 3] = 3;
9949 ShufIdxs[ 4] = 16; ShufIdxs[ 5] = 17; ShufIdxs[ 6] = 18; ShufIdxs[ 7] = 19;
9950 ShufIdxs[ 8] = 4; ShufIdxs[ 9] = 5; ShufIdxs[10] = 6; ShufIdxs[11] = 7;
9951 ShufIdxs[12] = 20; ShufIdxs[13] = 21; ShufIdxs[14] = 22; ShufIdxs[15] = 23;
9952 break;
9953 case OP_VMRGLW:
9954 ShufIdxs[ 0] = 8; ShufIdxs[ 1] = 9; ShufIdxs[ 2] = 10; ShufIdxs[ 3] = 11;
9955 ShufIdxs[ 4] = 24; ShufIdxs[ 5] = 25; ShufIdxs[ 6] = 26; ShufIdxs[ 7] = 27;
9956 ShufIdxs[ 8] = 12; ShufIdxs[ 9] = 13; ShufIdxs[10] = 14; ShufIdxs[11] = 15;
9957 ShufIdxs[12] = 28; ShufIdxs[13] = 29; ShufIdxs[14] = 30; ShufIdxs[15] = 31;
9958 break;
9959 case OP_VSPLTISW0:
9960 for (unsigned i = 0; i != 16; ++i)
9961 ShufIdxs[i] = (i&3)+0;
9962 break;
9963 case OP_VSPLTISW1:
9964 for (unsigned i = 0; i != 16; ++i)
9965 ShufIdxs[i] = (i&3)+4;
9966 break;
9967 case OP_VSPLTISW2:
9968 for (unsigned i = 0; i != 16; ++i)
9969 ShufIdxs[i] = (i&3)+8;
9970 break;
9971 case OP_VSPLTISW3:
9972 for (unsigned i = 0; i != 16; ++i)
9973 ShufIdxs[i] = (i&3)+12;
9974 break;
9975 case OP_VSLDOI4:
9976 return BuildVSLDOI(LHS: OpLHS, RHS: OpRHS, Amt: 4, VT: OpLHS.getValueType(), DAG, dl);
9977 case OP_VSLDOI8:
9978 return BuildVSLDOI(LHS: OpLHS, RHS: OpRHS, Amt: 8, VT: OpLHS.getValueType(), DAG, dl);
9979 case OP_VSLDOI12:
9980 return BuildVSLDOI(LHS: OpLHS, RHS: OpRHS, Amt: 12, VT: OpLHS.getValueType(), DAG, dl);
9981 }
9982 EVT VT = OpLHS.getValueType();
9983 OpLHS = DAG.getNode(Opcode: ISD::BITCAST, DL: dl, VT: MVT::v16i8, Operand: OpLHS);
9984 OpRHS = DAG.getNode(Opcode: ISD::BITCAST, DL: dl, VT: MVT::v16i8, Operand: OpRHS);
9985 SDValue T = DAG.getVectorShuffle(VT: MVT::v16i8, dl, N1: OpLHS, N2: OpRHS, Mask: ShufIdxs);
9986 return DAG.getNode(Opcode: ISD::BITCAST, DL: dl, VT, Operand: T);
9987}
9988
9989/// lowerToVINSERTB - Return the SDValue if this VECTOR_SHUFFLE can be handled
9990/// by the VINSERTB instruction introduced in ISA 3.0, else just return default
9991/// SDValue.
9992SDValue PPCTargetLowering::lowerToVINSERTB(ShuffleVectorSDNode *N,
9993 SelectionDAG &DAG) const {
9994 const unsigned BytesInVector = 16;
9995 bool IsLE = Subtarget.isLittleEndian();
9996 SDLoc dl(N);
9997 SDValue V1 = N->getOperand(Num: 0);
9998 SDValue V2 = N->getOperand(Num: 1);
9999 unsigned ShiftElts = 0, InsertAtByte = 0;
10000 bool Swap = false;
10001
10002 // Shifts required to get the byte we want at element 7.
10003 unsigned LittleEndianShifts[] = {8, 7, 6, 5, 4, 3, 2, 1,
10004 0, 15, 14, 13, 12, 11, 10, 9};
10005 unsigned BigEndianShifts[] = {9, 10, 11, 12, 13, 14, 15, 0,
10006 1, 2, 3, 4, 5, 6, 7, 8};
10007
10008 ArrayRef<int> Mask = N->getMask();
10009 int OriginalOrder[] = {0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15};
10010
10011 // For each mask element, find out if we're just inserting something
10012 // from V2 into V1 or vice versa.
10013 // Possible permutations inserting an element from V2 into V1:
10014 // X, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15
10015 // 0, X, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15
10016 // ...
10017 // 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, X
10018 // Inserting from V1 into V2 will be similar, except mask range will be
10019 // [16,31].
10020
10021 bool FoundCandidate = false;
10022 // If both vector operands for the shuffle are the same vector, the mask
10023 // will contain only elements from the first one and the second one will be
10024 // undef.
10025 unsigned VINSERTBSrcElem = IsLE ? 8 : 7;
10026 // Go through the mask of bytes to find an element that's being moved
10027 // from one vector to the other.
10028 for (unsigned i = 0; i < BytesInVector; ++i) {
10029 unsigned CurrentElement = Mask[i];
10030 // If 2nd operand is undefined, we should only look for element 7 in the
10031 // Mask.
10032 if (V2.isUndef() && CurrentElement != VINSERTBSrcElem)
10033 continue;
10034
10035 bool OtherElementsInOrder = true;
10036 // Examine the other elements in the Mask to see if they're in original
10037 // order.
10038 for (unsigned j = 0; j < BytesInVector; ++j) {
10039 if (j == i)
10040 continue;
10041 // If CurrentElement is from V1 [0,15], then we expect the rest of the Mask
10042 // to be from V2 [16,31] and vice versa, unless the 2nd operand is undefined,
10043 // in which case we assume we're always picking from the 1st operand.
10044 int MaskOffset =
10045 (!V2.isUndef() && CurrentElement < BytesInVector) ? BytesInVector : 0;
10046 if (Mask[j] != OriginalOrder[j] + MaskOffset) {
10047 OtherElementsInOrder = false;
10048 break;
10049 }
10050 }
10051 // If other elements are in original order, we record the number of shifts
10052 // we need to get the element we want into element 7. Also record which byte
10053 // in the vector we should insert into.
10054 if (OtherElementsInOrder) {
10055 // If 2nd operand is undefined, we assume no shifts and no swapping.
10056 if (V2.isUndef()) {
10057 ShiftElts = 0;
10058 Swap = false;
10059 } else {
10060 // Only the low 4 bits are needed for the shift amount because operands will be swapped if CurrentElement is >= 2^4.
10061 ShiftElts = IsLE ? LittleEndianShifts[CurrentElement & 0xF]
10062 : BigEndianShifts[CurrentElement & 0xF];
10063 Swap = CurrentElement < BytesInVector;
10064 }
10065 InsertAtByte = IsLE ? BytesInVector - (i + 1) : i;
10066 FoundCandidate = true;
10067 break;
10068 }
10069 }
10070
10071 if (!FoundCandidate)
10072 return SDValue();
10073
10074 // Candidate found, construct the proper SDAG sequence with VINSERTB,
10075 // optionally with VECSHL if shift is required.
10076 if (Swap)
10077 std::swap(a&: V1, b&: V2);
10078 if (V2.isUndef())
10079 V2 = V1;
10080 if (ShiftElts) {
10081 SDValue Shl = DAG.getNode(Opcode: PPCISD::VECSHL, DL: dl, VT: MVT::v16i8, N1: V2, N2: V2,
10082 N3: DAG.getConstant(Val: ShiftElts, DL: dl, VT: MVT::i32));
10083 return DAG.getNode(Opcode: PPCISD::VECINSERT, DL: dl, VT: MVT::v16i8, N1: V1, N2: Shl,
10084 N3: DAG.getConstant(Val: InsertAtByte, DL: dl, VT: MVT::i32));
10085 }
10086 return DAG.getNode(Opcode: PPCISD::VECINSERT, DL: dl, VT: MVT::v16i8, N1: V1, N2: V2,
10087 N3: DAG.getConstant(Val: InsertAtByte, DL: dl, VT: MVT::i32));
10088}
10089
10090/// lowerToVINSERTH - Return the SDValue if this VECTOR_SHUFFLE can be handled
10091/// by the VINSERTH instruction introduced in ISA 3.0, else just return default
10092/// SDValue.
10093SDValue PPCTargetLowering::lowerToVINSERTH(ShuffleVectorSDNode *N,
10094 SelectionDAG &DAG) const {
10095 const unsigned NumHalfWords = 8;
10096 const unsigned BytesInVector = NumHalfWords * 2;
10097 // Check that the shuffle is on half-words.
10098 if (!isNByteElemShuffleMask(N, Width: 2, StepLen: 1))
10099 return SDValue();
10100
10101 bool IsLE = Subtarget.isLittleEndian();
10102 SDLoc dl(N);
10103 SDValue V1 = N->getOperand(Num: 0);
10104 SDValue V2 = N->getOperand(Num: 1);
10105 unsigned ShiftElts = 0, InsertAtByte = 0;
10106 bool Swap = false;
10107
10108 // Shifts required to get the half-word we want at element 3.
10109 unsigned LittleEndianShifts[] = {4, 3, 2, 1, 0, 7, 6, 5};
10110 unsigned BigEndianShifts[] = {5, 6, 7, 0, 1, 2, 3, 4};
10111
10112 uint32_t Mask = 0;
10113 uint32_t OriginalOrderLow = 0x1234567;
10114 uint32_t OriginalOrderHigh = 0x89ABCDEF;
10115 // Now we look at mask elements 0,2,4,6,8,10,12,14. Pack the mask into a
10116 // 32-bit space, only need 4-bit nibbles per element.
10117 for (unsigned i = 0; i < NumHalfWords; ++i) {
10118 unsigned MaskShift = (NumHalfWords - 1 - i) * 4;
10119 Mask |= ((uint32_t)(N->getMaskElt(Idx: i * 2) / 2) << MaskShift);
10120 }
10121
10122 // For each mask element, find out if we're just inserting something
10123 // from V2 into V1 or vice versa. Possible permutations inserting an element
10124 // from V2 into V1:
10125 // X, 1, 2, 3, 4, 5, 6, 7
10126 // 0, X, 2, 3, 4, 5, 6, 7
10127 // 0, 1, X, 3, 4, 5, 6, 7
10128 // 0, 1, 2, X, 4, 5, 6, 7
10129 // 0, 1, 2, 3, X, 5, 6, 7
10130 // 0, 1, 2, 3, 4, X, 6, 7
10131 // 0, 1, 2, 3, 4, 5, X, 7
10132 // 0, 1, 2, 3, 4, 5, 6, X
10133 // Inserting from V1 into V2 will be similar, except mask range will be [8,15].
10134
10135 bool FoundCandidate = false;
10136 // Go through the mask of half-words to find an element that's being moved
10137 // from one vector to the other.
10138 for (unsigned i = 0; i < NumHalfWords; ++i) {
10139 unsigned MaskShift = (NumHalfWords - 1 - i) * 4;
10140 uint32_t MaskOneElt = (Mask >> MaskShift) & 0xF;
10141 uint32_t MaskOtherElts = ~(0xF << MaskShift);
10142 uint32_t TargetOrder = 0x0;
10143
10144 // If both vector operands for the shuffle are the same vector, the mask
10145 // will contain only elements from the first one and the second one will be
10146 // undef.
10147 if (V2.isUndef()) {
10148 ShiftElts = 0;
10149 unsigned VINSERTHSrcElem = IsLE ? 4 : 3;
10150 TargetOrder = OriginalOrderLow;
10151 Swap = false;
10152 // Skip if this is not the correct element or the mask of the other
10153 // elements doesn't match our expected order.
10154 if (MaskOneElt == VINSERTHSrcElem &&
10155 (Mask & MaskOtherElts) == (TargetOrder & MaskOtherElts)) {
10156 InsertAtByte = IsLE ? BytesInVector - (i + 1) * 2 : i * 2;
10157 FoundCandidate = true;
10158 break;
10159 }
10160 } else { // If both operands are defined.
10161 // Target order is [8,15] if the current mask is between [0,7].
10162 TargetOrder =
10163 (MaskOneElt < NumHalfWords) ? OriginalOrderHigh : OriginalOrderLow;
10164 // Skip if the mask of the other elements doesn't match our expected order.
10165 if ((Mask & MaskOtherElts) == (TargetOrder & MaskOtherElts)) {
10166 // We only need the last 3 bits for the number of shifts.
10167 ShiftElts = IsLE ? LittleEndianShifts[MaskOneElt & 0x7]
10168 : BigEndianShifts[MaskOneElt & 0x7];
10169 InsertAtByte = IsLE ? BytesInVector - (i + 1) * 2 : i * 2;
10170 Swap = MaskOneElt < NumHalfWords;
10171 FoundCandidate = true;
10172 break;
10173 }
10174 }
10175 }
10176
10177 if (!FoundCandidate)
10178 return SDValue();
10179
10180 // Candidate found, construct the proper SDAG sequence with VINSERTH,
10181 // optionally with VECSHL if shift is required.
10182 if (Swap)
10183 std::swap(a&: V1, b&: V2);
10184 if (V2.isUndef())
10185 V2 = V1;
10186 SDValue Conv1 = DAG.getNode(Opcode: ISD::BITCAST, DL: dl, VT: MVT::v8i16, Operand: V1);
10187 if (ShiftElts) {
10188 // Double ShiftElts because we're left shifting on v16i8 type.
10189 SDValue Shl = DAG.getNode(Opcode: PPCISD::VECSHL, DL: dl, VT: MVT::v16i8, N1: V2, N2: V2,
10190 N3: DAG.getConstant(Val: 2 * ShiftElts, DL: dl, VT: MVT::i32));
10191 SDValue Conv2 = DAG.getNode(Opcode: ISD::BITCAST, DL: dl, VT: MVT::v8i16, Operand: Shl);
10192 SDValue Ins = DAG.getNode(Opcode: PPCISD::VECINSERT, DL: dl, VT: MVT::v8i16, N1: Conv1, N2: Conv2,
10193 N3: DAG.getConstant(Val: InsertAtByte, DL: dl, VT: MVT::i32));
10194 return DAG.getNode(Opcode: ISD::BITCAST, DL: dl, VT: MVT::v16i8, Operand: Ins);
10195 }
10196 SDValue Conv2 = DAG.getNode(Opcode: ISD::BITCAST, DL: dl, VT: MVT::v8i16, Operand: V2);
10197 SDValue Ins = DAG.getNode(Opcode: PPCISD::VECINSERT, DL: dl, VT: MVT::v8i16, N1: Conv1, N2: Conv2,
10198 N3: DAG.getConstant(Val: InsertAtByte, DL: dl, VT: MVT::i32));
10199 return DAG.getNode(Opcode: ISD::BITCAST, DL: dl, VT: MVT::v16i8, Operand: Ins);
10200}
10201
10202/// lowerToXXSPLTI32DX - Return the SDValue if this VECTOR_SHUFFLE can be
10203/// handled by the XXSPLTI32DX instruction introduced in ISA 3.1, otherwise
10204/// return the default SDValue.
10205SDValue PPCTargetLowering::lowerToXXSPLTI32DX(ShuffleVectorSDNode *SVN,
10206 SelectionDAG &DAG) const {
10207 // The LHS and RHS may be bitcasts to v16i8 as we canonicalize shuffles
10208 // to v16i8. Peek through the bitcasts to get the actual operands.
10209 SDValue LHS = peekThroughBitcasts(V: SVN->getOperand(Num: 0));
10210 SDValue RHS = peekThroughBitcasts(V: SVN->getOperand(Num: 1));
10211
10212 auto ShuffleMask = SVN->getMask();
10213 SDValue VecShuffle(SVN, 0);
10214 SDLoc DL(SVN);
10215
10216 // Check that we have a four byte shuffle.
10217 if (!isNByteElemShuffleMask(N: SVN, Width: 4, StepLen: 1))
10218 return SDValue();
10219
10220 // Canonicalize the RHS to be the BUILD_VECTOR when lowering to xxsplti32dx.
10221 if (RHS->getOpcode() != ISD::BUILD_VECTOR) {
10222 std::swap(a&: LHS, b&: RHS);
10223 VecShuffle = peekThroughBitcasts(V: DAG.getCommutedVectorShuffle(SV: *SVN));
10224 ShuffleVectorSDNode *CommutedSV = dyn_cast<ShuffleVectorSDNode>(Val&: VecShuffle);
10225 if (!CommutedSV)
10226 return SDValue();
10227 ShuffleMask = CommutedSV->getMask();
10228 }
10229
10230 // Ensure that the RHS is a vector of constants.
10231 BuildVectorSDNode *BVN = dyn_cast<BuildVectorSDNode>(Val: RHS.getNode());
10232 if (!BVN)
10233 return SDValue();
10234
10235 // Check if RHS is a splat of 4-bytes (or smaller).
10236 APInt APSplatValue, APSplatUndef;
10237 unsigned SplatBitSize;
10238 bool HasAnyUndefs;
10239 if (!BVN->isConstantSplat(SplatValue&: APSplatValue, SplatUndef&: APSplatUndef, SplatBitSize,
10240 HasAnyUndefs, MinSplatBits: 0, isBigEndian: !Subtarget.isLittleEndian()) ||
10241 SplatBitSize > 32)
10242 return SDValue();
10243
10244 // Check that the shuffle mask matches the semantics of XXSPLTI32DX.
10245 // The instruction splats a constant C into two words of the source vector
10246 // producing { C, Unchanged, C, Unchanged } or { Unchanged, C, Unchanged, C }.
10247 // Thus we check that the shuffle mask is the equivalent of
10248 // <0, [4-7], 2, [4-7]> or <[4-7], 1, [4-7], 3> respectively.
10249 // Note: the check above of isNByteElemShuffleMask() ensures that the bytes
10250 // within each word are consecutive, so we only need to check the first byte.
10251 SDValue Index;
10252 bool IsLE = Subtarget.isLittleEndian();
10253 if ((ShuffleMask[0] == 0 && ShuffleMask[8] == 8) &&
10254 (ShuffleMask[4] % 4 == 0 && ShuffleMask[12] % 4 == 0 &&
10255 ShuffleMask[4] > 15 && ShuffleMask[12] > 15))
10256 Index = DAG.getTargetConstant(Val: IsLE ? 0 : 1, DL, VT: MVT::i32);
10257 else if ((ShuffleMask[4] == 4 && ShuffleMask[12] == 12) &&
10258 (ShuffleMask[0] % 4 == 0 && ShuffleMask[8] % 4 == 0 &&
10259 ShuffleMask[0] > 15 && ShuffleMask[8] > 15))
10260 Index = DAG.getTargetConstant(Val: IsLE ? 1 : 0, DL, VT: MVT::i32);
10261 else
10262 return SDValue();
10263
10264 // If the splat is narrower than 32 bits, widen it to the 32-bit value
10265 // required by XXSPLTI32DX.
10266 unsigned SplatVal = APSplatValue.getZExtValue();
10267 for (; SplatBitSize < 32; SplatBitSize <<= 1)
10268 SplatVal |= (SplatVal << SplatBitSize);
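// Illustrative example: a byte splat of 0xAB (SplatBitSize == 8) is widened
// by the loop above in two steps, 0xAB -> 0xABAB -> 0xABABABAB, which is the
// 32-bit immediate handed to XXSPLTI32DX below.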
10269
10270 SDValue SplatNode = DAG.getNode(
10271 Opcode: PPCISD::XXSPLTI32DX, DL, VT: MVT::v2i64, N1: DAG.getBitcast(VT: MVT::v2i64, V: LHS),
10272 N2: Index, N3: DAG.getTargetConstant(Val: SplatVal, DL, VT: MVT::i32));
10273 return DAG.getNode(Opcode: ISD::BITCAST, DL, VT: MVT::v16i8, Operand: SplatNode);
10274}
10275
10276 /// LowerROTL - Custom lowering for ROTL(v1i128) to vector_shuffle(v16i8).
10277 /// We lower ROTL(v1i128) to vector_shuffle(v16i8) only if the shift amount is
10278 /// a multiple of 8. Otherwise we convert it to a scalar i128 rotation,
10279 /// i.e. (or (shl x, C1), (srl x, 128-C1)).
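/// For example, a rotate-left of a v1i128 by 16 bits is lowered here to a
/// v16i8 shuffle whose mask is <2, 3, ..., 15, 0, 1>, i.e. the identity byte
/// order rotated by 16 / 8 == 2 positions.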
10280SDValue PPCTargetLowering::LowerROTL(SDValue Op, SelectionDAG &DAG) const {
10281 assert(Op.getOpcode() == ISD::ROTL && "Should only be called for ISD::ROTL");
10282 assert(Op.getValueType() == MVT::v1i128 &&
10283 "Only set v1i128 as custom, other type shouldn't reach here!");
10284 SDLoc dl(Op);
10285 SDValue N0 = peekThroughBitcasts(V: Op.getOperand(i: 0));
10286 SDValue N1 = peekThroughBitcasts(V: Op.getOperand(i: 1));
10287 unsigned SHLAmt = N1.getConstantOperandVal(i: 0);
10288 if (SHLAmt % 8 == 0) {
10289 std::array<int, 16> Mask;
10290 std::iota(first: Mask.begin(), last: Mask.end(), value: 0);
10291 std::rotate(first: Mask.begin(), middle: Mask.begin() + SHLAmt / 8, last: Mask.end());
10292 if (SDValue Shuffle =
10293 DAG.getVectorShuffle(VT: MVT::v16i8, dl, N1: DAG.getBitcast(VT: MVT::v16i8, V: N0),
10294 N2: DAG.getUNDEF(VT: MVT::v16i8), Mask))
10295 return DAG.getNode(Opcode: ISD::BITCAST, DL: dl, VT: MVT::v1i128, Operand: Shuffle);
10296 }
10297 SDValue ArgVal = DAG.getBitcast(VT: MVT::i128, V: N0);
10298 SDValue SHLOp = DAG.getNode(Opcode: ISD::SHL, DL: dl, VT: MVT::i128, N1: ArgVal,
10299 N2: DAG.getConstant(Val: SHLAmt, DL: dl, VT: MVT::i32));
10300 SDValue SRLOp = DAG.getNode(Opcode: ISD::SRL, DL: dl, VT: MVT::i128, N1: ArgVal,
10301 N2: DAG.getConstant(Val: 128 - SHLAmt, DL: dl, VT: MVT::i32));
10302 SDValue OROp = DAG.getNode(Opcode: ISD::OR, DL: dl, VT: MVT::i128, N1: SHLOp, N2: SRLOp);
10303 return DAG.getNode(Opcode: ISD::BITCAST, DL: dl, VT: MVT::v1i128, Operand: OROp);
10304}
10305
10306/// LowerVECTOR_SHUFFLE - Return the code we lower for VECTOR_SHUFFLE. If this
10307/// is a shuffle we can handle in a single instruction, return it. Otherwise,
10308/// return the code it can be lowered into. Worst case, it can always be
10309/// lowered into a vperm.
10310SDValue PPCTargetLowering::LowerVECTOR_SHUFFLE(SDValue Op,
10311 SelectionDAG &DAG) const {
10312 SDLoc dl(Op);
10313 SDValue V1 = Op.getOperand(i: 0);
10314 SDValue V2 = Op.getOperand(i: 1);
10315 ShuffleVectorSDNode *SVOp = cast<ShuffleVectorSDNode>(Val&: Op);
10316
10317 // Any nodes that were combined in the target-independent combiner prior
10318 // to vector legalization will not be sent to the target-specific combine.
10319 // Try to combine them here.
10320 if (SDValue NewShuffle = combineVectorShuffle(SVN: SVOp, DAG)) {
10321 if (!isa<ShuffleVectorSDNode>(Val: NewShuffle))
10322 return NewShuffle;
10323 Op = NewShuffle;
10324 SVOp = cast<ShuffleVectorSDNode>(Val&: Op);
10325 V1 = Op.getOperand(i: 0);
10326 V2 = Op.getOperand(i: 1);
10327 }
10328 EVT VT = Op.getValueType();
10329 bool isLittleEndian = Subtarget.isLittleEndian();
10330
10331 unsigned ShiftElts, InsertAtByte;
10332 bool Swap = false;
10333
10334 // If this is a load-and-splat, we can do that with a single instruction
10335 // in some cases. However, if the load has multiple uses, we don't want to
10336 // combine it because that will just produce multiple loads.
10337 bool IsPermutedLoad = false;
10338 const SDValue *InputLoad = getNormalLoadInput(Op: V1, IsPermuted&: IsPermutedLoad);
10339 if (InputLoad && Subtarget.hasVSX() && V2.isUndef() &&
10340 (PPC::isSplatShuffleMask(N: SVOp, EltSize: 4) || PPC::isSplatShuffleMask(N: SVOp, EltSize: 8)) &&
10341 InputLoad->hasOneUse()) {
10342 bool IsFourByte = PPC::isSplatShuffleMask(N: SVOp, EltSize: 4);
10343 int SplatIdx =
10344 PPC::getSplatIdxForPPCMnemonics(N: SVOp, EltSize: IsFourByte ? 4 : 8, DAG);
10345
10346 // The splat index for permuted loads will be in the left half of the vector,
10347 // which is strictly wider than the loaded value by 8 bytes, so we need to
10348 // adjust the splat index to point to the correct address in memory.
10349 if (IsPermutedLoad) {
10350 assert((isLittleEndian || IsFourByte) &&
10351 "Unexpected size for permuted load on big endian target");
10352 SplatIdx += IsFourByte ? 2 : 1;
10353 assert((SplatIdx < (IsFourByte ? 4 : 2)) &&
10354 "Splat of a value outside of the loaded memory");
10355 }
10356
10357 LoadSDNode *LD = cast<LoadSDNode>(Val: *InputLoad);
10358 // For 4-byte load-and-splat, we need Power9.
10359 if ((IsFourByte && Subtarget.hasP9Vector()) || !IsFourByte) {
10360 uint64_t Offset = 0;
10361 if (IsFourByte)
10362 Offset = isLittleEndian ? (3 - SplatIdx) * 4 : SplatIdx * 4;
10363 else
10364 Offset = isLittleEndian ? (1 - SplatIdx) * 8 : SplatIdx * 8;
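// Illustrative example: for a little-endian 4-byte load-and-splat of
// element 1, the offset is (3 - 1) * 4 == 8 bytes from the base pointer, so
// the load emitted below reads exactly the word being splatted.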
10365
10366 // If the width of the load is the same as the width of the splat,
10367 // loading with an offset would load the wrong memory.
10368 if (LD->getValueType(ResNo: 0).getSizeInBits() == (IsFourByte ? 32 : 64))
10369 Offset = 0;
10370
10371 SDValue BasePtr = LD->getBasePtr();
10372 if (Offset != 0)
10373 BasePtr = DAG.getNode(Opcode: ISD::ADD, DL: dl, VT: getPointerTy(DL: DAG.getDataLayout()),
10374 N1: BasePtr, N2: DAG.getIntPtrConstant(Val: Offset, DL: dl));
10375 SDValue Ops[] = {
10376 LD->getChain(), // Chain
10377 BasePtr, // BasePtr
10378 DAG.getValueType(Op.getValueType()) // VT
10379 };
10380 SDVTList VTL =
10381 DAG.getVTList(VT1: IsFourByte ? MVT::v4i32 : MVT::v2i64, VT2: MVT::Other);
10382 SDValue LdSplt =
10383 DAG.getMemIntrinsicNode(Opcode: PPCISD::LD_SPLAT, dl, VTList: VTL,
10384 Ops, MemVT: LD->getMemoryVT(), MMO: LD->getMemOperand());
10385 DAG.ReplaceAllUsesOfValueWith(From: InputLoad->getValue(R: 1), To: LdSplt.getValue(R: 1));
10386 if (LdSplt.getValueType() != SVOp->getValueType(ResNo: 0))
10387 LdSplt = DAG.getBitcast(VT: SVOp->getValueType(ResNo: 0), V: LdSplt);
10388 return LdSplt;
10389 }
10390 }
10391
10392 // All v2i64 and v2f64 shuffles are legal
10393 if (VT == MVT::v2i64 || VT == MVT::v2f64)
10394 return Op;
10395
10396 if (Subtarget.hasP9Vector() &&
10397 PPC::isXXINSERTWMask(N: SVOp, ShiftElts, InsertAtByte, Swap,
10398 IsLE: isLittleEndian)) {
10399 if (V2.isUndef())
10400 V2 = V1;
10401 else if (Swap)
10402 std::swap(a&: V1, b&: V2);
10403 SDValue Conv1 = DAG.getNode(Opcode: ISD::BITCAST, DL: dl, VT: MVT::v4i32, Operand: V1);
10404 SDValue Conv2 = DAG.getNode(Opcode: ISD::BITCAST, DL: dl, VT: MVT::v4i32, Operand: V2);
10405 if (ShiftElts) {
10406 SDValue Shl = DAG.getNode(Opcode: PPCISD::VECSHL, DL: dl, VT: MVT::v4i32, N1: Conv2, N2: Conv2,
10407 N3: DAG.getConstant(Val: ShiftElts, DL: dl, VT: MVT::i32));
10408 SDValue Ins = DAG.getNode(Opcode: PPCISD::VECINSERT, DL: dl, VT: MVT::v4i32, N1: Conv1, N2: Shl,
10409 N3: DAG.getConstant(Val: InsertAtByte, DL: dl, VT: MVT::i32));
10410 return DAG.getNode(Opcode: ISD::BITCAST, DL: dl, VT: MVT::v16i8, Operand: Ins);
10411 }
10412 SDValue Ins = DAG.getNode(Opcode: PPCISD::VECINSERT, DL: dl, VT: MVT::v4i32, N1: Conv1, N2: Conv2,
10413 N3: DAG.getConstant(Val: InsertAtByte, DL: dl, VT: MVT::i32));
10414 return DAG.getNode(Opcode: ISD::BITCAST, DL: dl, VT: MVT::v16i8, Operand: Ins);
10415 }
10416
10417 if (Subtarget.hasPrefixInstrs() && Subtarget.hasP10Vector()) {
10418 SDValue SplatInsertNode;
10419 if ((SplatInsertNode = lowerToXXSPLTI32DX(SVN: SVOp, DAG)))
10420 return SplatInsertNode;
10421 }
10422
10423 if (Subtarget.hasP9Altivec()) {
10424 SDValue NewISDNode;
10425 if ((NewISDNode = lowerToVINSERTH(N: SVOp, DAG)))
10426 return NewISDNode;
10427
10428 if ((NewISDNode = lowerToVINSERTB(N: SVOp, DAG)))
10429 return NewISDNode;
10430 }
10431
10432 if (Subtarget.hasVSX() &&
10433 PPC::isXXSLDWIShuffleMask(N: SVOp, ShiftElts, Swap, IsLE: isLittleEndian)) {
10434 if (Swap)
10435 std::swap(a&: V1, b&: V2);
10436 SDValue Conv1 = DAG.getNode(Opcode: ISD::BITCAST, DL: dl, VT: MVT::v4i32, Operand: V1);
10437 SDValue Conv2 =
10438 DAG.getNode(Opcode: ISD::BITCAST, DL: dl, VT: MVT::v4i32, Operand: V2.isUndef() ? V1 : V2);
10439
10440 SDValue Shl = DAG.getNode(Opcode: PPCISD::VECSHL, DL: dl, VT: MVT::v4i32, N1: Conv1, N2: Conv2,
10441 N3: DAG.getConstant(Val: ShiftElts, DL: dl, VT: MVT::i32));
10442 return DAG.getNode(Opcode: ISD::BITCAST, DL: dl, VT: MVT::v16i8, Operand: Shl);
10443 }
10444
10445 if (Subtarget.hasVSX() &&
10446 PPC::isXXPERMDIShuffleMask(N: SVOp, DM&: ShiftElts, Swap, IsLE: isLittleEndian)) {
10447 if (Swap)
10448 std::swap(a&: V1, b&: V2);
10449 SDValue Conv1 = DAG.getNode(Opcode: ISD::BITCAST, DL: dl, VT: MVT::v2i64, Operand: V1);
10450 SDValue Conv2 =
10451 DAG.getNode(Opcode: ISD::BITCAST, DL: dl, VT: MVT::v2i64, Operand: V2.isUndef() ? V1 : V2);
10452
10453 SDValue PermDI = DAG.getNode(Opcode: PPCISD::XXPERMDI, DL: dl, VT: MVT::v2i64, N1: Conv1, N2: Conv2,
10454 N3: DAG.getConstant(Val: ShiftElts, DL: dl, VT: MVT::i32));
10455 return DAG.getNode(Opcode: ISD::BITCAST, DL: dl, VT: MVT::v16i8, Operand: PermDI);
10456 }
10457
10458 if (Subtarget.hasP9Vector()) {
10459 if (PPC::isXXBRHShuffleMask(N: SVOp)) {
10460 SDValue Conv = DAG.getNode(Opcode: ISD::BITCAST, DL: dl, VT: MVT::v8i16, Operand: V1);
10461 SDValue ReveHWord = DAG.getNode(Opcode: ISD::BSWAP, DL: dl, VT: MVT::v8i16, Operand: Conv);
10462 return DAG.getNode(Opcode: ISD::BITCAST, DL: dl, VT: MVT::v16i8, Operand: ReveHWord);
10463 } else if (PPC::isXXBRWShuffleMask(N: SVOp)) {
10464 SDValue Conv = DAG.getNode(Opcode: ISD::BITCAST, DL: dl, VT: MVT::v4i32, Operand: V1);
10465 SDValue ReveWord = DAG.getNode(Opcode: ISD::BSWAP, DL: dl, VT: MVT::v4i32, Operand: Conv);
10466 return DAG.getNode(Opcode: ISD::BITCAST, DL: dl, VT: MVT::v16i8, Operand: ReveWord);
10467 } else if (PPC::isXXBRDShuffleMask(N: SVOp)) {
10468 SDValue Conv = DAG.getNode(Opcode: ISD::BITCAST, DL: dl, VT: MVT::v2i64, Operand: V1);
10469 SDValue ReveDWord = DAG.getNode(Opcode: ISD::BSWAP, DL: dl, VT: MVT::v2i64, Operand: Conv);
10470 return DAG.getNode(Opcode: ISD::BITCAST, DL: dl, VT: MVT::v16i8, Operand: ReveDWord);
10471 } else if (PPC::isXXBRQShuffleMask(N: SVOp)) {
10472 SDValue Conv = DAG.getNode(Opcode: ISD::BITCAST, DL: dl, VT: MVT::v1i128, Operand: V1);
10473 SDValue ReveQWord = DAG.getNode(Opcode: ISD::BSWAP, DL: dl, VT: MVT::v1i128, Operand: Conv);
10474 return DAG.getNode(Opcode: ISD::BITCAST, DL: dl, VT: MVT::v16i8, Operand: ReveQWord);
10475 }
10476 }
10477
10478 if (Subtarget.hasVSX()) {
10479 if (V2.isUndef() && PPC::isSplatShuffleMask(N: SVOp, EltSize: 4)) {
10480 int SplatIdx = PPC::getSplatIdxForPPCMnemonics(N: SVOp, EltSize: 4, DAG);
10481
10482 SDValue Conv = DAG.getNode(Opcode: ISD::BITCAST, DL: dl, VT: MVT::v4i32, Operand: V1);
10483 SDValue Splat = DAG.getNode(Opcode: PPCISD::XXSPLT, DL: dl, VT: MVT::v4i32, N1: Conv,
10484 N2: DAG.getConstant(Val: SplatIdx, DL: dl, VT: MVT::i32));
10485 return DAG.getNode(Opcode: ISD::BITCAST, DL: dl, VT: MVT::v16i8, Operand: Splat);
10486 }
10487
10488 // Left shifts of 8 bytes are actually swaps. Convert accordingly.
10489 if (V2.isUndef() && PPC::isVSLDOIShuffleMask(N: SVOp, ShuffleKind: 1, DAG) == 8) {
10490 SDValue Conv = DAG.getNode(Opcode: ISD::BITCAST, DL: dl, VT: MVT::v2f64, Operand: V1);
10491 SDValue Swap = DAG.getNode(Opcode: PPCISD::SWAP_NO_CHAIN, DL: dl, VT: MVT::v2f64, Operand: Conv);
10492 return DAG.getNode(Opcode: ISD::BITCAST, DL: dl, VT: MVT::v16i8, Operand: Swap);
10493 }
10494 }
10495
10496 // Cases that are handled by instructions that take permute immediates
10497 // (such as vsplt*) should be left as VECTOR_SHUFFLE nodes so they can be
10498 // selected by the instruction selector.
10499 if (V2.isUndef()) {
10500 if (PPC::isSplatShuffleMask(N: SVOp, EltSize: 1) ||
10501 PPC::isSplatShuffleMask(N: SVOp, EltSize: 2) ||
10502 PPC::isSplatShuffleMask(N: SVOp, EltSize: 4) ||
10503 PPC::isVPKUWUMShuffleMask(N: SVOp, ShuffleKind: 1, DAG) ||
10504 PPC::isVPKUHUMShuffleMask(N: SVOp, ShuffleKind: 1, DAG) ||
10505 PPC::isVSLDOIShuffleMask(N: SVOp, ShuffleKind: 1, DAG) != -1 ||
10506 PPC::isVMRGLShuffleMask(N: SVOp, UnitSize: 1, ShuffleKind: 1, DAG) ||
10507 PPC::isVMRGLShuffleMask(N: SVOp, UnitSize: 2, ShuffleKind: 1, DAG) ||
10508 PPC::isVMRGLShuffleMask(N: SVOp, UnitSize: 4, ShuffleKind: 1, DAG) ||
10509 PPC::isVMRGHShuffleMask(N: SVOp, UnitSize: 1, ShuffleKind: 1, DAG) ||
10510 PPC::isVMRGHShuffleMask(N: SVOp, UnitSize: 2, ShuffleKind: 1, DAG) ||
10511 PPC::isVMRGHShuffleMask(N: SVOp, UnitSize: 4, ShuffleKind: 1, DAG) ||
10512 (Subtarget.hasP8Altivec() && (
10513 PPC::isVPKUDUMShuffleMask(N: SVOp, ShuffleKind: 1, DAG) ||
10514 PPC::isVMRGEOShuffleMask(N: SVOp, CheckEven: true, ShuffleKind: 1, DAG) ||
10515 PPC::isVMRGEOShuffleMask(N: SVOp, CheckEven: false, ShuffleKind: 1, DAG)))) {
10516 return Op;
10517 }
10518 }
10519
10520 // Altivec has a variety of "shuffle immediates" that take two vector inputs
10521 // and produce a fixed permutation. If any of these match, do not lower to
10522 // VPERM.
10523 unsigned int ShuffleKind = isLittleEndian ? 2 : 0;
10524 if (PPC::isVPKUWUMShuffleMask(N: SVOp, ShuffleKind, DAG) ||
10525 PPC::isVPKUHUMShuffleMask(N: SVOp, ShuffleKind, DAG) ||
10526 PPC::isVSLDOIShuffleMask(N: SVOp, ShuffleKind, DAG) != -1 ||
10527 PPC::isVMRGLShuffleMask(N: SVOp, UnitSize: 1, ShuffleKind, DAG) ||
10528 PPC::isVMRGLShuffleMask(N: SVOp, UnitSize: 2, ShuffleKind, DAG) ||
10529 PPC::isVMRGLShuffleMask(N: SVOp, UnitSize: 4, ShuffleKind, DAG) ||
10530 PPC::isVMRGHShuffleMask(N: SVOp, UnitSize: 1, ShuffleKind, DAG) ||
10531 PPC::isVMRGHShuffleMask(N: SVOp, UnitSize: 2, ShuffleKind, DAG) ||
10532 PPC::isVMRGHShuffleMask(N: SVOp, UnitSize: 4, ShuffleKind, DAG) ||
10533 (Subtarget.hasP8Altivec() && (
10534 PPC::isVPKUDUMShuffleMask(N: SVOp, ShuffleKind, DAG) ||
10535 PPC::isVMRGEOShuffleMask(N: SVOp, CheckEven: true, ShuffleKind, DAG) ||
10536 PPC::isVMRGEOShuffleMask(N: SVOp, CheckEven: false, ShuffleKind, DAG))))
10537 return Op;
10538
10539 // Check to see if this is a shuffle of 4-byte values. If so, we can use our
10540 // perfect shuffle table to emit an optimal matching sequence.
10541 ArrayRef<int> PermMask = SVOp->getMask();
10542
10543 if (!DisablePerfectShuffle && !isLittleEndian) {
10544 unsigned PFIndexes[4];
10545 bool isFourElementShuffle = true;
10546 for (unsigned i = 0; i != 4 && isFourElementShuffle;
10547 ++i) { // Element number
10548 unsigned EltNo = 8; // Start out undef.
10549 for (unsigned j = 0; j != 4; ++j) { // Intra-element byte.
10550 if (PermMask[i * 4 + j] < 0)
10551 continue; // Undef, ignore it.
10552
10553 unsigned ByteSource = PermMask[i * 4 + j];
10554 if ((ByteSource & 3) != j) {
10555 isFourElementShuffle = false;
10556 break;
10557 }
10558
10559 if (EltNo == 8) {
10560 EltNo = ByteSource / 4;
10561 } else if (EltNo != ByteSource / 4) {
10562 isFourElementShuffle = false;
10563 break;
10564 }
10565 }
10566 PFIndexes[i] = EltNo;
10567 }
10568
10569 // If this shuffle can be expressed as a shuffle of 4-byte elements, use the
10570 // perfect shuffle table to determine if it is cost-effective to do this as
10571 // discrete instructions, or whether we should use a vperm.
10572 // For now, we skip this for little endian until we have a little-endian
10573 // perfect shuffle table.
10574 if (isFourElementShuffle) {
10575 // Compute the index in the perfect shuffle table.
10576 unsigned PFTableIndex = PFIndexes[0] * 9 * 9 * 9 + PFIndexes[1] * 9 * 9 +
10577 PFIndexes[2] * 9 + PFIndexes[3];
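// The four source-element indices (0-7, or 8 for undef) are encoded as a
// base-9 number. Illustrative example: PFIndexes {1, 1, 3, 3} give
// 1*729 + 1*81 + 3*9 + 3 == 840.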
10578
10579 unsigned PFEntry = PerfectShuffleTable[PFTableIndex];
10580 unsigned Cost = (PFEntry >> 30);
10581
10582 // Determining when to avoid vperm is tricky. Many things affect the cost
10583 // of vperm, particularly how many times the perm mask needs to be
10584 // computed. For example, if the perm mask can be hoisted out of a loop or
10585 // is already used (perhaps because there are multiple permutes with the
10586 // same shuffle mask?) the vperm has a cost of 1. OTOH, hoisting the
10587 // permute mask out of the loop requires an extra register.
10588 //
10589 // As a compromise, we only emit discrete instructions if the shuffle can
10590 // be generated in 3 or fewer operations. When we have loop information
10591 // available, if this block is within a loop, we should avoid using vperm
10592 // for 3-operation perms and use a constant pool load instead.
10593 if (Cost < 3)
10594 return GeneratePerfectShuffle(PFEntry, LHS: V1, RHS: V2, DAG, dl);
10595 }
10596 }
10597
10598 // Lower this to a VPERM(V1, V2, V3) expression, where V3 is a constant
10599 // vector that will get spilled to the constant pool.
10600 if (V2.isUndef()) V2 = V1;
10601
10602 return LowerVPERM(Op, DAG, PermMask, VT, V1, V2);
10603}
10604
10605SDValue PPCTargetLowering::LowerVPERM(SDValue Op, SelectionDAG &DAG,
10606 ArrayRef<int> PermMask, EVT VT,
10607 SDValue V1, SDValue V2) const {
10608 unsigned Opcode = PPCISD::VPERM;
10609 EVT ValType = V1.getValueType();
10610 SDLoc dl(Op);
10611 bool NeedSwap = false;
10612 bool isLittleEndian = Subtarget.isLittleEndian();
10613 bool isPPC64 = Subtarget.isPPC64();
10614
10615 if (Subtarget.hasVSX() && Subtarget.hasP9Vector() &&
10616 (V1->hasOneUse() || V2->hasOneUse())) {
10617 LLVM_DEBUG(dbgs() << "At least one of the two input vectors is dead - "
10618 "using XXPERM instead\n");
10619 Opcode = PPCISD::XXPERM;
10620
10621 // The second input to XXPERM is also an output, so if the second input has
10622 // multiple uses then a copy is necessary. As a result, we want the
10623 // single-use operand to be used as the second input to prevent copying.
10624 if ((!isLittleEndian && !V2->hasOneUse() && V1->hasOneUse()) ||
10625 (isLittleEndian && !V1->hasOneUse() && V2->hasOneUse())) {
10626 std::swap(a&: V1, b&: V2);
10627 NeedSwap = !NeedSwap;
10628 }
10629 }
10630
10631 // The SHUFFLE_VECTOR mask is almost exactly what we want for vperm, except
10632 // that it is in input element units, not in bytes. Convert now.
10633
10634 // For little endian, the order of the input vectors is reversed, and
10635 // the permutation mask is complemented with respect to 31. This is
10636 // necessary to produce proper semantics with the big-endian-based vperm
10637 // instruction.
10638 EVT EltVT = V1.getValueType().getVectorElementType();
10639 unsigned BytesPerElement = EltVT.getSizeInBits() / 8;
10640
10641 bool V1HasXXSWAPD = V1->getOperand(Num: 0)->getOpcode() == PPCISD::XXSWAPD;
10642 bool V2HasXXSWAPD = V2->getOperand(Num: 0)->getOpcode() == PPCISD::XXSWAPD;
10643
10644 /*
10645 Vectors will be appended like so: [ V1 | V2 ]
10646 XXSWAPD on V1:
10647 [ A | B | C | D ] -> [ C | D | A | B ]
10648 0-3 4-7 8-11 12-15 0-3 4-7 8-11 12-15
10649 i.e. index of A, B += 8, and index of C, D -= 8.
10650 XXSWAPD on V2:
10651 [ E | F | G | H ] -> [ G | H | E | F ]
10652 16-19 20-23 24-27 28-31 16-19 20-23 24-27 28-31
10653 i.e. index of E, F += 8, index of G, H -= 8
10654 Swap V1 and V2:
10655 [ V1 | V2 ] -> [ V2 | V1 ]
10656 0-15 16-31 0-15 16-31
10657 i.e. index of V1 += 16, index of V2 -= 16
10658 */
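// Illustrative example of the adjustment below: with V1HasXXSWAPD, a mask
// entry referring to byte 3 of V1 (within A/B above) is remapped to 11, while
// byte 11 (within C/D) is remapped to 3.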
10659
10660 SmallVector<SDValue, 16> ResultMask;
10661 for (unsigned i = 0, e = VT.getVectorNumElements(); i != e; ++i) {
10662 unsigned SrcElt = PermMask[i] < 0 ? 0 : PermMask[i];
10663
10664 if (V1HasXXSWAPD) {
10665 if (SrcElt < 8)
10666 SrcElt += 8;
10667 else if (SrcElt < 16)
10668 SrcElt -= 8;
10669 }
10670 if (V2HasXXSWAPD) {
10671 if (SrcElt > 23)
10672 SrcElt -= 8;
10673 else if (SrcElt > 15)
10674 SrcElt += 8;
10675 }
10676 if (NeedSwap) {
10677 if (SrcElt < 16)
10678 SrcElt += 16;
10679 else
10680 SrcElt -= 16;
10681 }
10682 for (unsigned j = 0; j != BytesPerElement; ++j)
10683 if (isLittleEndian)
10684 ResultMask.push_back(
10685 Elt: DAG.getConstant(Val: 31 - (SrcElt * BytesPerElement + j), DL: dl, VT: MVT::i32));
10686 else
10687 ResultMask.push_back(
10688 Elt: DAG.getConstant(Val: SrcElt * BytesPerElement + j, DL: dl, VT: MVT::i32));
10689 }
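// Illustrative example: on little endian with 4-byte elements, source
// element 0 expands to control-vector bytes 31, 30, 29, 28 (i.e.
// 31 - (0 * 4 + j) for j = 0..3), matching the complemented byte order that
// the big-endian-based vperm needs for LE.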
10690
10691 if (V1HasXXSWAPD) {
10692 dl = SDLoc(V1->getOperand(Num: 0));
10693 V1 = V1->getOperand(Num: 0)->getOperand(Num: 1);
10694 }
10695 if (V2HasXXSWAPD) {
10696 dl = SDLoc(V2->getOperand(Num: 0));
10697 V2 = V2->getOperand(Num: 0)->getOperand(Num: 1);
10698 }
10699
10700 if (isPPC64 && (V1HasXXSWAPD || V2HasXXSWAPD)) {
10701 if (ValType != MVT::v2f64)
10702 V1 = DAG.getBitcast(VT: MVT::v2f64, V: V1);
10703 if (V2.getValueType() != MVT::v2f64)
10704 V2 = DAG.getBitcast(VT: MVT::v2f64, V: V2);
10705 }
10706
10707 ShufflesHandledWithVPERM++;
10708 SDValue VPermMask = DAG.getBuildVector(VT: MVT::v16i8, DL: dl, Ops: ResultMask);
10709 LLVM_DEBUG({
10710 ShuffleVectorSDNode *SVOp = cast<ShuffleVectorSDNode>(Op);
10711 if (Opcode == PPCISD::XXPERM) {
10712 dbgs() << "Emitting a XXPERM for the following shuffle:\n";
10713 } else {
10714 dbgs() << "Emitting a VPERM for the following shuffle:\n";
10715 }
10716 SVOp->dump();
10717 dbgs() << "With the following permute control vector:\n";
10718 VPermMask.dump();
10719 });
10720
10721 if (Opcode == PPCISD::XXPERM)
10722 VPermMask = DAG.getBitcast(VT: MVT::v4i32, V: VPermMask);
10723
10724 // On little endian we only need to swap the two inputs here; the permute
10725 // mask was already computed to account for the reversed order.
10726 if (isLittleEndian)
10727 std::swap(a&: V1, b&: V2);
10728
10729 SDValue VPERMNode =
10730 DAG.getNode(Opcode, DL: dl, VT: V1.getValueType(), N1: V1, N2: V2, N3: VPermMask);
10731
10732 VPERMNode = DAG.getBitcast(VT: ValType, V: VPERMNode);
10733 return VPERMNode;
10734}
10735
10736/// getVectorCompareInfo - Given an intrinsic, return false if it is not a
10737 /// vector comparison. If it is, return true and fill in CompareOpc/isDot with
10738/// information about the intrinsic.
10739static bool getVectorCompareInfo(SDValue Intrin, int &CompareOpc,
10740 bool &isDot, const PPCSubtarget &Subtarget) {
10741 unsigned IntrinsicID = Intrin.getConstantOperandVal(i: 0);
10742 CompareOpc = -1;
10743 isDot = false;
10744 switch (IntrinsicID) {
10745 default:
10746 return false;
10747 // Comparison predicates.
10748 case Intrinsic::ppc_altivec_vcmpbfp_p:
10749 CompareOpc = 966;
10750 isDot = true;
10751 break;
10752 case Intrinsic::ppc_altivec_vcmpeqfp_p:
10753 CompareOpc = 198;
10754 isDot = true;
10755 break;
10756 case Intrinsic::ppc_altivec_vcmpequb_p:
10757 CompareOpc = 6;
10758 isDot = true;
10759 break;
10760 case Intrinsic::ppc_altivec_vcmpequh_p:
10761 CompareOpc = 70;
10762 isDot = true;
10763 break;
10764 case Intrinsic::ppc_altivec_vcmpequw_p:
10765 CompareOpc = 134;
10766 isDot = true;
10767 break;
10768 case Intrinsic::ppc_altivec_vcmpequd_p:
10769 if (Subtarget.hasVSX() || Subtarget.hasP8Altivec()) {
10770 CompareOpc = 199;
10771 isDot = true;
10772 } else
10773 return false;
10774 break;
10775 case Intrinsic::ppc_altivec_vcmpneb_p:
10776 case Intrinsic::ppc_altivec_vcmpneh_p:
10777 case Intrinsic::ppc_altivec_vcmpnew_p:
10778 case Intrinsic::ppc_altivec_vcmpnezb_p:
10779 case Intrinsic::ppc_altivec_vcmpnezh_p:
10780 case Intrinsic::ppc_altivec_vcmpnezw_p:
10781 if (Subtarget.hasP9Altivec()) {
10782 switch (IntrinsicID) {
10783 default:
10784 llvm_unreachable("Unknown comparison intrinsic.");
10785 case Intrinsic::ppc_altivec_vcmpneb_p:
10786 CompareOpc = 7;
10787 break;
10788 case Intrinsic::ppc_altivec_vcmpneh_p:
10789 CompareOpc = 71;
10790 break;
10791 case Intrinsic::ppc_altivec_vcmpnew_p:
10792 CompareOpc = 135;
10793 break;
10794 case Intrinsic::ppc_altivec_vcmpnezb_p:
10795 CompareOpc = 263;
10796 break;
10797 case Intrinsic::ppc_altivec_vcmpnezh_p:
10798 CompareOpc = 327;
10799 break;
10800 case Intrinsic::ppc_altivec_vcmpnezw_p:
10801 CompareOpc = 391;
10802 break;
10803 }
10804 isDot = true;
10805 } else
10806 return false;
10807 break;
10808 case Intrinsic::ppc_altivec_vcmpgefp_p:
10809 CompareOpc = 454;
10810 isDot = true;
10811 break;
10812 case Intrinsic::ppc_altivec_vcmpgtfp_p:
10813 CompareOpc = 710;
10814 isDot = true;
10815 break;
10816 case Intrinsic::ppc_altivec_vcmpgtsb_p:
10817 CompareOpc = 774;
10818 isDot = true;
10819 break;
10820 case Intrinsic::ppc_altivec_vcmpgtsh_p:
10821 CompareOpc = 838;
10822 isDot = true;
10823 break;
10824 case Intrinsic::ppc_altivec_vcmpgtsw_p:
10825 CompareOpc = 902;
10826 isDot = true;
10827 break;
10828 case Intrinsic::ppc_altivec_vcmpgtsd_p:
10829 if (Subtarget.hasVSX() || Subtarget.hasP8Altivec()) {
10830 CompareOpc = 967;
10831 isDot = true;
10832 } else
10833 return false;
10834 break;
10835 case Intrinsic::ppc_altivec_vcmpgtub_p:
10836 CompareOpc = 518;
10837 isDot = true;
10838 break;
10839 case Intrinsic::ppc_altivec_vcmpgtuh_p:
10840 CompareOpc = 582;
10841 isDot = true;
10842 break;
10843 case Intrinsic::ppc_altivec_vcmpgtuw_p:
10844 CompareOpc = 646;
10845 isDot = true;
10846 break;
10847 case Intrinsic::ppc_altivec_vcmpgtud_p:
10848 if (Subtarget.hasVSX() || Subtarget.hasP8Altivec()) {
10849 CompareOpc = 711;
10850 isDot = true;
10851 } else
10852 return false;
10853 break;
10854
10855 case Intrinsic::ppc_altivec_vcmpequq:
10856 case Intrinsic::ppc_altivec_vcmpgtsq:
10857 case Intrinsic::ppc_altivec_vcmpgtuq:
10858 if (!Subtarget.isISA3_1())
10859 return false;
10860 switch (IntrinsicID) {
10861 default:
10862 llvm_unreachable("Unknown comparison intrinsic.");
10863 case Intrinsic::ppc_altivec_vcmpequq:
10864 CompareOpc = 455;
10865 break;
10866 case Intrinsic::ppc_altivec_vcmpgtsq:
10867 CompareOpc = 903;
10868 break;
10869 case Intrinsic::ppc_altivec_vcmpgtuq:
10870 CompareOpc = 647;
10871 break;
10872 }
10873 break;
10874
10875 // VSX predicate comparisons use the same infrastructure
10876 case Intrinsic::ppc_vsx_xvcmpeqdp_p:
10877 case Intrinsic::ppc_vsx_xvcmpgedp_p:
10878 case Intrinsic::ppc_vsx_xvcmpgtdp_p:
10879 case Intrinsic::ppc_vsx_xvcmpeqsp_p:
10880 case Intrinsic::ppc_vsx_xvcmpgesp_p:
10881 case Intrinsic::ppc_vsx_xvcmpgtsp_p:
10882 if (Subtarget.hasVSX()) {
10883 switch (IntrinsicID) {
10884 case Intrinsic::ppc_vsx_xvcmpeqdp_p:
10885 CompareOpc = 99;
10886 break;
10887 case Intrinsic::ppc_vsx_xvcmpgedp_p:
10888 CompareOpc = 115;
10889 break;
10890 case Intrinsic::ppc_vsx_xvcmpgtdp_p:
10891 CompareOpc = 107;
10892 break;
10893 case Intrinsic::ppc_vsx_xvcmpeqsp_p:
10894 CompareOpc = 67;
10895 break;
10896 case Intrinsic::ppc_vsx_xvcmpgesp_p:
10897 CompareOpc = 83;
10898 break;
10899 case Intrinsic::ppc_vsx_xvcmpgtsp_p:
10900 CompareOpc = 75;
10901 break;
10902 }
10903 isDot = true;
10904 } else
10905 return false;
10906 break;
10907
10908 // Normal Comparisons.
10909 case Intrinsic::ppc_altivec_vcmpbfp:
10910 CompareOpc = 966;
10911 break;
10912 case Intrinsic::ppc_altivec_vcmpeqfp:
10913 CompareOpc = 198;
10914 break;
10915 case Intrinsic::ppc_altivec_vcmpequb:
10916 CompareOpc = 6;
10917 break;
10918 case Intrinsic::ppc_altivec_vcmpequh:
10919 CompareOpc = 70;
10920 break;
10921 case Intrinsic::ppc_altivec_vcmpequw:
10922 CompareOpc = 134;
10923 break;
10924 case Intrinsic::ppc_altivec_vcmpequd:
10925 if (Subtarget.hasP8Altivec())
10926 CompareOpc = 199;
10927 else
10928 return false;
10929 break;
10930 case Intrinsic::ppc_altivec_vcmpneb:
10931 case Intrinsic::ppc_altivec_vcmpneh:
10932 case Intrinsic::ppc_altivec_vcmpnew:
10933 case Intrinsic::ppc_altivec_vcmpnezb:
10934 case Intrinsic::ppc_altivec_vcmpnezh:
10935 case Intrinsic::ppc_altivec_vcmpnezw:
10936 if (Subtarget.hasP9Altivec())
10937 switch (IntrinsicID) {
10938 default:
10939 llvm_unreachable("Unknown comparison intrinsic.");
10940 case Intrinsic::ppc_altivec_vcmpneb:
10941 CompareOpc = 7;
10942 break;
10943 case Intrinsic::ppc_altivec_vcmpneh:
10944 CompareOpc = 71;
10945 break;
10946 case Intrinsic::ppc_altivec_vcmpnew:
10947 CompareOpc = 135;
10948 break;
10949 case Intrinsic::ppc_altivec_vcmpnezb:
10950 CompareOpc = 263;
10951 break;
10952 case Intrinsic::ppc_altivec_vcmpnezh:
10953 CompareOpc = 327;
10954 break;
10955 case Intrinsic::ppc_altivec_vcmpnezw:
10956 CompareOpc = 391;
10957 break;
10958 }
10959 else
10960 return false;
10961 break;
10962 case Intrinsic::ppc_altivec_vcmpgefp:
10963 CompareOpc = 454;
10964 break;
10965 case Intrinsic::ppc_altivec_vcmpgtfp:
10966 CompareOpc = 710;
10967 break;
10968 case Intrinsic::ppc_altivec_vcmpgtsb:
10969 CompareOpc = 774;
10970 break;
10971 case Intrinsic::ppc_altivec_vcmpgtsh:
10972 CompareOpc = 838;
10973 break;
10974 case Intrinsic::ppc_altivec_vcmpgtsw:
10975 CompareOpc = 902;
10976 break;
10977 case Intrinsic::ppc_altivec_vcmpgtsd:
10978 if (Subtarget.hasP8Altivec())
10979 CompareOpc = 967;
10980 else
10981 return false;
10982 break;
10983 case Intrinsic::ppc_altivec_vcmpgtub:
10984 CompareOpc = 518;
10985 break;
10986 case Intrinsic::ppc_altivec_vcmpgtuh:
10987 CompareOpc = 582;
10988 break;
10989 case Intrinsic::ppc_altivec_vcmpgtuw:
10990 CompareOpc = 646;
10991 break;
10992 case Intrinsic::ppc_altivec_vcmpgtud:
10993 if (Subtarget.hasP8Altivec())
10994 CompareOpc = 711;
10995 else
10996 return false;
10997 break;
10998 case Intrinsic::ppc_altivec_vcmpequq_p:
10999 case Intrinsic::ppc_altivec_vcmpgtsq_p:
11000 case Intrinsic::ppc_altivec_vcmpgtuq_p:
11001 if (!Subtarget.isISA3_1())
11002 return false;
11003 switch (IntrinsicID) {
11004 default:
11005 llvm_unreachable("Unknown comparison intrinsic.");
11006 case Intrinsic::ppc_altivec_vcmpequq_p:
11007 CompareOpc = 455;
11008 break;
11009 case Intrinsic::ppc_altivec_vcmpgtsq_p:
11010 CompareOpc = 903;
11011 break;
11012 case Intrinsic::ppc_altivec_vcmpgtuq_p:
11013 CompareOpc = 647;
11014 break;
11015 }
11016 isDot = true;
11017 break;
11018 }
11019 return true;
11020}
11021
11022/// LowerINTRINSIC_WO_CHAIN - If this is an intrinsic that we want to custom
11023/// lower, do it, otherwise return null.
11024SDValue PPCTargetLowering::LowerINTRINSIC_WO_CHAIN(SDValue Op,
11025 SelectionDAG &DAG) const {
11026 unsigned IntrinsicID = Op.getConstantOperandVal(i: 0);
11027
11028 SDLoc dl(Op);
11029 // Note: BCD instructions expect the immediate operand in vector form (v4i32),
11030 // but the builtin provides it as a scalar. To satisfy the instruction
11031 // encoding, we splat the scalar across all lanes using SPLAT_VECTOR.
11032 auto MapNodeWithSplatVector =
11033 [&](unsigned Opcode,
11034 std::initializer_list<SDValue> ExtraOps = {}) -> SDValue {
11035 SDValue SplatVal =
11036 DAG.getNode(Opcode: ISD::SPLAT_VECTOR, DL: dl, VT: MVT::v4i32, Operand: Op.getOperand(i: 2));
11037
11038 SmallVector<SDValue, 4> Ops{SplatVal, Op.getOperand(i: 1)};
11039 Ops.append(in_start: ExtraOps.begin(), in_end: ExtraOps.end());
11040 return DAG.getNode(Opcode, DL: dl, VT: MVT::v16i8, Ops);
11041 };
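// Illustrative example: for Intrinsic::ppc_bcdshift the helper above produces
// PPCISD::BCDSHIFT(splat(Op2), Op1, Op3) on v16i8, i.e. operand 2 (the scalar
// immediate) is splatted into a v4i32 and operand 1 (the BCD vector) is
// passed through unchanged.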
11042
11043 switch (IntrinsicID) {
11044 case Intrinsic::thread_pointer:
11045 // Reads the thread pointer register, used for __builtin_thread_pointer.
11046 if (Subtarget.isPPC64())
11047 return DAG.getRegister(Reg: PPC::X13, VT: MVT::i64);
11048 return DAG.getRegister(Reg: PPC::R2, VT: MVT::i32);
11049
11050 case Intrinsic::ppc_rldimi: {
11051 assert(Subtarget.isPPC64() && "rldimi is only available in 64-bit!");
11052 SDValue Src = Op.getOperand(i: 1);
11053 APInt Mask = Op.getConstantOperandAPInt(i: 4);
11054 if (Mask.isZero())
11055 return Op.getOperand(i: 2);
11056 if (Mask.isAllOnes())
11057 return DAG.getNode(Opcode: ISD::ROTL, DL: dl, VT: MVT::i64, N1: Src, N2: Op.getOperand(i: 3));
11058 uint64_t SH = Op.getConstantOperandVal(i: 3);
11059 unsigned MB = 0, ME = 0;
11060 if (!isRunOfOnes64(Val: Mask.getZExtValue(), MB, ME))
11061 report_fatal_error(reason: "invalid rldimi mask!");
11062 // rldimi requires ME=63-SH, otherwise rotation is needed before rldimi.
11063 if (ME < 63 - SH) {
11064 Src = DAG.getNode(Opcode: ISD::ROTL, DL: dl, VT: MVT::i64, N1: Src,
11065 N2: DAG.getConstant(Val: ME + SH + 1, DL: dl, VT: MVT::i32));
11066 } else if (ME > 63 - SH) {
11067 Src = DAG.getNode(Opcode: ISD::ROTL, DL: dl, VT: MVT::i64, N1: Src,
11068 N2: DAG.getConstant(Val: ME + SH - 63, DL: dl, VT: MVT::i32));
11069 }
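// Illustrative example: with SH == 4 and ME == 39 (so ME < 63 - SH), Src is
// pre-rotated left by ME + SH + 1 == 44; the RLDIMI emitted below then
// rotates by 63 - ME == 24, for a total of 68 == 4 (mod 64), i.e. the
// requested shift amount.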
11070 return SDValue(
11071 DAG.getMachineNode(Opcode: PPC::RLDIMI, dl, VT: MVT::i64,
11072 Ops: {Op.getOperand(i: 2), Src,
11073 DAG.getTargetConstant(Val: 63 - ME, DL: dl, VT: MVT::i32),
11074 DAG.getTargetConstant(Val: MB, DL: dl, VT: MVT::i32)}),
11075 0);
11076 }
11077
11078 case Intrinsic::ppc_rlwimi: {
11079 APInt Mask = Op.getConstantOperandAPInt(i: 4);
11080 if (Mask.isZero())
11081 return Op.getOperand(i: 2);
11082 if (Mask.isAllOnes())
11083 return DAG.getNode(Opcode: ISD::ROTL, DL: dl, VT: MVT::i32, N1: Op.getOperand(i: 1),
11084 N2: Op.getOperand(i: 3));
11085 unsigned MB = 0, ME = 0;
11086 if (!isRunOfOnes(Val: Mask.getZExtValue(), MB, ME))
11087 report_fatal_error(reason: "invalid rlwimi mask!");
11088 return SDValue(DAG.getMachineNode(
11089 Opcode: PPC::RLWIMI, dl, VT: MVT::i32,
11090 Ops: {Op.getOperand(i: 2), Op.getOperand(i: 1), Op.getOperand(i: 3),
11091 DAG.getTargetConstant(Val: MB, DL: dl, VT: MVT::i32),
11092 DAG.getTargetConstant(Val: ME, DL: dl, VT: MVT::i32)}),
11093 0);
11094 }
11095
11096 case Intrinsic::ppc_bcdshift:
11097 return MapNodeWithSplatVector(PPCISD::BCDSHIFT, {Op.getOperand(i: 3)});
11098 case Intrinsic::ppc_bcdshiftround:
11099 return MapNodeWithSplatVector(PPCISD::BCDSHIFTROUND, {Op.getOperand(i: 3)});
11100 case Intrinsic::ppc_bcdtruncate:
11101 return MapNodeWithSplatVector(PPCISD::BCDTRUNC, {Op.getOperand(i: 3)});
11102 case Intrinsic::ppc_bcdunsignedtruncate:
11103 return MapNodeWithSplatVector(PPCISD::BCDUTRUNC);
11104 case Intrinsic::ppc_bcdunsignedshift:
11105 return MapNodeWithSplatVector(PPCISD::BCDUSHIFT);
11106
11107 case Intrinsic::ppc_rlwnm: {
11108 if (Op.getConstantOperandVal(i: 3) == 0)
11109 return DAG.getConstant(Val: 0, DL: dl, VT: MVT::i32);
11110 unsigned MB = 0, ME = 0;
11111 if (!isRunOfOnes(Val: Op.getConstantOperandVal(i: 3), MB, ME))
11112 report_fatal_error(reason: "invalid rlwnm mask!");
11113 return SDValue(
11114 DAG.getMachineNode(Opcode: PPC::RLWNM, dl, VT: MVT::i32,
11115 Ops: {Op.getOperand(i: 1), Op.getOperand(i: 2),
11116 DAG.getTargetConstant(Val: MB, DL: dl, VT: MVT::i32),
11117 DAG.getTargetConstant(Val: ME, DL: dl, VT: MVT::i32)}),
11118 0);
11119 }
11120
11121 case Intrinsic::ppc_mma_disassemble_acc: {
11122 if (Subtarget.isISAFuture()) {
11123 EVT ReturnTypes[] = {MVT::v256i1, MVT::v256i1};
11124 SDValue WideVec =
11125 SDValue(DAG.getMachineNode(Opcode: PPC::DMXXEXTFDMR512, dl, ResultTys: ReturnTypes,
11126 Ops: Op.getOperand(i: 1)),
11127 0);
11128 SmallVector<SDValue, 4> RetOps;
11129 SDValue Value = SDValue(WideVec.getNode(), 0);
11130 SDValue Value2 = SDValue(WideVec.getNode(), 1);
11131
11132 SDValue Extract;
11133 Extract = DAG.getNode(
11134 Opcode: PPCISD::EXTRACT_VSX_REG, DL: dl, VT: MVT::v16i8,
11135 N1: Subtarget.isLittleEndian() ? Value2 : Value,
11136 N2: DAG.getConstant(Val: Subtarget.isLittleEndian() ? 1 : 0,
11137 DL: dl, VT: getPointerTy(DL: DAG.getDataLayout())));
11138 RetOps.push_back(Elt: Extract);
11139 Extract = DAG.getNode(
11140 Opcode: PPCISD::EXTRACT_VSX_REG, DL: dl, VT: MVT::v16i8,
11141 N1: Subtarget.isLittleEndian() ? Value2 : Value,
11142 N2: DAG.getConstant(Val: Subtarget.isLittleEndian() ? 0 : 1,
11143 DL: dl, VT: getPointerTy(DL: DAG.getDataLayout())));
11144 RetOps.push_back(Elt: Extract);
11145 Extract = DAG.getNode(
11146 Opcode: PPCISD::EXTRACT_VSX_REG, DL: dl, VT: MVT::v16i8,
11147 N1: Subtarget.isLittleEndian() ? Value : Value2,
11148 N2: DAG.getConstant(Val: Subtarget.isLittleEndian() ? 1 : 0,
11149 DL: dl, VT: getPointerTy(DL: DAG.getDataLayout())));
11150 RetOps.push_back(Elt: Extract);
11151 Extract = DAG.getNode(
11152 Opcode: PPCISD::EXTRACT_VSX_REG, DL: dl, VT: MVT::v16i8,
11153 N1: Subtarget.isLittleEndian() ? Value : Value2,
11154 N2: DAG.getConstant(Val: Subtarget.isLittleEndian() ? 0 : 1,
11155 DL: dl, VT: getPointerTy(DL: DAG.getDataLayout())));
11156 RetOps.push_back(Elt: Extract);
11157 return DAG.getMergeValues(Ops: RetOps, dl);
11158 }
11159 [[fallthrough]];
11160 }
11161 case Intrinsic::ppc_vsx_disassemble_pair: {
11162 int NumVecs = 2;
11163 SDValue WideVec = Op.getOperand(i: 1);
11164 if (IntrinsicID == Intrinsic::ppc_mma_disassemble_acc) {
11165 NumVecs = 4;
11166 WideVec = DAG.getNode(Opcode: PPCISD::XXMFACC, DL: dl, VT: MVT::v512i1, Operand: WideVec);
11167 }
11168 SmallVector<SDValue, 4> RetOps;
11169 for (int VecNo = 0; VecNo < NumVecs; VecNo++) {
11170 SDValue Extract = DAG.getNode(
11171 Opcode: PPCISD::EXTRACT_VSX_REG, DL: dl, VT: MVT::v16i8, N1: WideVec,
11172 N2: DAG.getConstant(Val: Subtarget.isLittleEndian() ? NumVecs - 1 - VecNo
11173 : VecNo,
11174 DL: dl, VT: getPointerTy(DL: DAG.getDataLayout())));
11175 RetOps.push_back(Elt: Extract);
11176 }
11177 return DAG.getMergeValues(Ops: RetOps, dl);
11178 }
11179
11180 case Intrinsic::ppc_mma_build_dmr: {
11181 SmallVector<SDValue, 8> Pairs;
11182 SmallVector<SDValue, 8> Chains;
11183 for (int i = 1; i < 9; i += 2) {
11184 SDValue Hi = Op.getOperand(i);
11185 SDValue Lo = Op.getOperand(i: i + 1);
11186 if (Hi->getOpcode() == ISD::LOAD)
11187 Chains.push_back(Elt: Hi.getValue(R: 1));
11188 if (Lo->getOpcode() == ISD::LOAD)
11189 Chains.push_back(Elt: Lo.getValue(R: 1));
11190 Pairs.push_back(
11191 Elt: DAG.getNode(Opcode: PPCISD::PAIR_BUILD, DL: dl, VT: MVT::v256i1, Ops: {Hi, Lo}));
11192 }
11193 SDValue TF = DAG.getNode(Opcode: ISD::TokenFactor, DL: dl, VT: MVT::Other, Ops: Chains);
11194 SDValue Value = DMFInsert1024(Pairs, dl: SDLoc(Op), DAG);
11195 return DAG.getMergeValues(Ops: {Value, TF}, dl);
11196 }
11197
11198 case Intrinsic::ppc_mma_dmxxextfdmr512: {
11199 assert(Subtarget.isISAFuture() && "dmxxextfdmr512 requires ISA Future");
11200 auto *Idx = dyn_cast<ConstantSDNode>(Val: Op.getOperand(i: 2));
11201 assert(Idx && (Idx->getSExtValue() == 0 || Idx->getSExtValue() == 1) &&
11202 "Specify P of 0 or 1 for lower or upper 512 bytes");
11203 unsigned HiLo = Idx->getSExtValue();
11204 unsigned Opcode;
11205 unsigned Subx;
11206 if (HiLo == 0) {
11207 Opcode = PPC::DMXXEXTFDMR512;
11208 Subx = PPC::sub_wacc_lo;
11209 } else {
11210 Opcode = PPC::DMXXEXTFDMR512_HI;
11211 Subx = PPC::sub_wacc_hi;
11212 }
11213 SDValue Subreg(
11214 DAG.getMachineNode(Opcode: TargetOpcode::EXTRACT_SUBREG, dl, VT: MVT::v512i1,
11215 Op1: Op.getOperand(i: 1),
11216 Op2: DAG.getTargetConstant(Val: Subx, DL: dl, VT: MVT::i32)),
11217 0);
11218 EVT ReturnTypes[] = {MVT::v256i1, MVT::v256i1};
11219 return SDValue(DAG.getMachineNode(Opcode, dl, ResultTys: ReturnTypes, Ops: Subreg), 0);
11220 }
11221
11222 case Intrinsic::ppc_mma_dmxxextfdmr256: {
11223 assert(Subtarget.isISAFuture() && "dmxxextfdmr256 requires ISA Future");
11224 auto *Idx = dyn_cast<ConstantSDNode>(Val: Op.getOperand(i: 2));
11225 assert(Idx && (Idx->getSExtValue() >= 0 || Idx->getSExtValue() <= 3) &&
11226 "Specify a dmr row pair 0-3");
11227 unsigned IdxVal = Idx->getSExtValue();
11228 unsigned Subx;
11229 switch (IdxVal) {
11230 case 0:
11231 Subx = PPC::sub_dmrrowp0;
11232 break;
11233 case 1:
11234 Subx = PPC::sub_dmrrowp1;
11235 break;
11236 case 2:
11237 Subx = PPC::sub_wacc_hi_then_sub_dmrrowp0;
11238 break;
11239 case 3:
11240 Subx = PPC::sub_wacc_hi_then_sub_dmrrowp1;
11241 break;
11242 }
11243 SDValue Subreg(
11244 DAG.getMachineNode(Opcode: TargetOpcode::EXTRACT_SUBREG, dl, VT: MVT::v256i1,
11245 Op1: Op.getOperand(i: 1),
11246 Op2: DAG.getTargetConstant(Val: Subx, DL: dl, VT: MVT::i32)),
11247 0);
11248 SDValue P = DAG.getTargetConstant(Val: IdxVal, DL: dl, VT: MVT::i32);
11249 return SDValue(
11250 DAG.getMachineNode(Opcode: PPC::DMXXEXTFDMR256, dl, VT: MVT::v256i1, Ops: {Subreg, P}),
11251 0);
11252 }
11253
11254 case Intrinsic::ppc_mma_dmxxinstdmr512: {
11255 assert(Subtarget.isISAFuture() && "dmxxinstdmr512 requires ISA Future");
11256 auto *Idx = dyn_cast<ConstantSDNode>(Val: Op.getOperand(i: 4));
11257 assert(Idx && (Idx->getSExtValue() == 0 || Idx->getSExtValue() == 1) &&
11258 "Specify P of 0 or 1 for lower or upper 512 bytes");
11259 unsigned HiLo = Idx->getSExtValue();
11260 unsigned Opcode;
11261 unsigned Subx;
11262 if (HiLo == 0) {
11263 Opcode = PPCISD::INST512;
11264 Subx = PPC::sub_wacc_lo;
11265 } else {
11266 Opcode = PPCISD::INST512HI;
11267 Subx = PPC::sub_wacc_hi;
11268 }
11269 SDValue Wacc = DAG.getNode(Opcode, DL: dl, VT: MVT::v512i1, N1: Op.getOperand(i: 2),
11270 N2: Op.getOperand(i: 3));
11271 SDValue SubReg = DAG.getTargetConstant(Val: Subx, DL: dl, VT: MVT::i32);
11272 return SDValue(DAG.getMachineNode(Opcode: PPC::INSERT_SUBREG, dl, VT: MVT::v1024i1,
11273 Op1: Op.getOperand(i: 1), Op2: Wacc, Op3: SubReg),
11274 0);
11275 }
11276
11277 case Intrinsic::ppc_mma_dmxxinstdmr256: {
11278 assert(Subtarget.isISAFuture() && "dmxxinstdmr256 requires ISA Future");
11279 auto *Idx = dyn_cast<ConstantSDNode>(Val: Op.getOperand(i: 3));
11280 assert(Idx && (Idx->getSExtValue() >= 0 || Idx->getSExtValue() <= 3) &&
11281 "Specify a dmr row pair 0-3");
11282 unsigned IdxVal = Idx->getSExtValue();
11283 unsigned Subx;
11284 switch (IdxVal) {
11285 case 0:
11286 Subx = PPC::sub_dmrrowp0;
11287 break;
11288 case 1:
11289 Subx = PPC::sub_dmrrowp1;
11290 break;
11291 case 2:
11292 Subx = PPC::sub_wacc_hi_then_sub_dmrrowp0;
11293 break;
11294 case 3:
11295 Subx = PPC::sub_wacc_hi_then_sub_dmrrowp1;
11296 break;
11297 }
11298 SDValue SubReg = DAG.getTargetConstant(Val: Subx, DL: dl, VT: MVT::i32);
11299 SDValue P = DAG.getTargetConstant(Val: IdxVal, DL: dl, VT: MVT::i32);
11300 SDValue DMRRowp =
11301 DAG.getNode(Opcode: PPCISD::INST256, DL: dl, VT: MVT::v256i1, N1: Op.getOperand(i: 2), N2: P);
11302 return SDValue(DAG.getMachineNode(Opcode: PPC::INSERT_SUBREG, dl, VT: MVT::v1024i1,
11303 Op1: Op.getOperand(i: 1), Op2: DMRRowp, Op3: SubReg),
11304 0);
11305 }
11306
11307 case Intrinsic::ppc_mma_xxmfacc:
11308 case Intrinsic::ppc_mma_xxmtacc: {
11309 // Allow pre-isa-future subtargets to lower as normal.
11310 if (!Subtarget.isISAFuture())
11311 return SDValue();
11312 // The intrinsics for xxmtacc and xxmfacc take one argument of type
11313 // v512i1. For future CPUs the corresponding wacc instruction
11314 // dmxx[inst|extf]dmr512 is always generated for type v512i1, which
11315 // removes the need to produce the xxm[t|f]acc.
11316 SDValue WideVec = Op.getOperand(i: 1);
11317 DAG.ReplaceAllUsesWith(From: Op, To: WideVec);
11318 return SDValue();
11319 }
11320
11321 case Intrinsic::ppc_unpack_longdouble: {
11322 auto *Idx = dyn_cast<ConstantSDNode>(Val: Op.getOperand(i: 2));
11323 assert(Idx && (Idx->getSExtValue() == 0 || Idx->getSExtValue() == 1) &&
11324 "Argument of long double unpack must be 0 or 1!");
11325 return DAG.getNode(Opcode: ISD::EXTRACT_ELEMENT, DL: dl, VT: MVT::f64, N1: Op.getOperand(i: 1),
11326 N2: DAG.getConstant(Val: !!(Idx->getSExtValue()), DL: dl,
11327 VT: Idx->getValueType(ResNo: 0)));
11328 }
11329
11330 case Intrinsic::ppc_compare_exp_lt:
11331 case Intrinsic::ppc_compare_exp_gt:
11332 case Intrinsic::ppc_compare_exp_eq:
11333 case Intrinsic::ppc_compare_exp_uo: {
11334 unsigned Pred;
11335 switch (IntrinsicID) {
11336 case Intrinsic::ppc_compare_exp_lt:
11337 Pred = PPC::PRED_LT;
11338 break;
11339 case Intrinsic::ppc_compare_exp_gt:
11340 Pred = PPC::PRED_GT;
11341 break;
11342 case Intrinsic::ppc_compare_exp_eq:
11343 Pred = PPC::PRED_EQ;
11344 break;
11345 case Intrinsic::ppc_compare_exp_uo:
11346 Pred = PPC::PRED_UN;
11347 break;
11348 }
11349 return SDValue(
11350 DAG.getMachineNode(
11351 Opcode: PPC::SELECT_CC_I4, dl, VT: MVT::i32,
11352 Ops: {SDValue(DAG.getMachineNode(Opcode: PPC::XSCMPEXPDP, dl, VT: MVT::i32,
11353 Op1: Op.getOperand(i: 1), Op2: Op.getOperand(i: 2)),
11354 0),
11355 DAG.getConstant(Val: 1, DL: dl, VT: MVT::i32), DAG.getConstant(Val: 0, DL: dl, VT: MVT::i32),
11356 DAG.getTargetConstant(Val: Pred, DL: dl, VT: MVT::i32)}),
11357 0);
11358 }
11359 case Intrinsic::ppc_test_data_class: {
11360 EVT OpVT = Op.getOperand(i: 1).getValueType();
11361 unsigned CmprOpc = OpVT == MVT::f128 ? PPC::XSTSTDCQP
11362 : (OpVT == MVT::f64 ? PPC::XSTSTDCDP
11363 : PPC::XSTSTDCSP);
11364 return SDValue(
11365 DAG.getMachineNode(
11366 Opcode: PPC::SELECT_CC_I4, dl, VT: MVT::i32,
11367 Ops: {SDValue(DAG.getMachineNode(Opcode: CmprOpc, dl, VT: MVT::i32, Op1: Op.getOperand(i: 2),
11368 Op2: Op.getOperand(i: 1)),
11369 0),
11370 DAG.getConstant(Val: 1, DL: dl, VT: MVT::i32), DAG.getConstant(Val: 0, DL: dl, VT: MVT::i32),
11371 DAG.getTargetConstant(Val: PPC::PRED_EQ, DL: dl, VT: MVT::i32)}),
11372 0);
11373 }
11374 case Intrinsic::ppc_fnmsub: {
11375 EVT VT = Op.getOperand(i: 1).getValueType();
11376 if (!Subtarget.hasVSX() || (!Subtarget.hasFloat128() && VT == MVT::f128))
11377 return DAG.getNode(
11378 Opcode: ISD::FNEG, DL: dl, VT,
11379 Operand: DAG.getNode(Opcode: ISD::FMA, DL: dl, VT, N1: Op.getOperand(i: 1), N2: Op.getOperand(i: 2),
11380 N3: DAG.getNode(Opcode: ISD::FNEG, DL: dl, VT, Operand: Op.getOperand(i: 3))));
11381 return DAG.getNode(Opcode: PPCISD::FNMSUB, DL: dl, VT, N1: Op.getOperand(i: 1),
11382 N2: Op.getOperand(i: 2), N3: Op.getOperand(i: 3));
11383 }
11384 case Intrinsic::ppc_convert_f128_to_ppcf128:
11385 case Intrinsic::ppc_convert_ppcf128_to_f128: {
11386 RTLIB::Libcall LC = IntrinsicID == Intrinsic::ppc_convert_ppcf128_to_f128
11387 ? RTLIB::CONVERT_PPCF128_F128
11388 : RTLIB::CONVERT_F128_PPCF128;
11389 MakeLibCallOptions CallOptions;
11390 std::pair<SDValue, SDValue> Result =
11391 makeLibCall(DAG, LC, RetVT: Op.getValueType(), Ops: Op.getOperand(i: 1), CallOptions,
11392 dl, Chain: SDValue());
11393 return Result.first;
11394 }
11395 case Intrinsic::ppc_maxfe:
11396 case Intrinsic::ppc_maxfl:
11397 case Intrinsic::ppc_maxfs:
11398 case Intrinsic::ppc_minfe:
11399 case Intrinsic::ppc_minfl:
11400 case Intrinsic::ppc_minfs: {
11401 EVT VT = Op.getValueType();
11402 assert(
11403 all_of(Op->ops().drop_front(4),
11404 [VT](const SDUse &Use) { return Use.getValueType() == VT; }) &&
11405 "ppc_[max|min]f[e|l|s] must have uniform type arguments");
11406 (void)VT;
11407 ISD::CondCode CC = ISD::SETGT;
11408 if (IntrinsicID == Intrinsic::ppc_minfe ||
11409 IntrinsicID == Intrinsic::ppc_minfl ||
11410 IntrinsicID == Intrinsic::ppc_minfs)
11411 CC = ISD::SETLT;
11412 unsigned I = Op.getNumOperands() - 2, Cnt = I;
11413 SDValue Res = Op.getOperand(i: I);
11414 for (--I; Cnt != 0; --Cnt, I = (--I == 0 ? (Op.getNumOperands() - 1) : I)) {
11415 Res =
11416 DAG.getSelectCC(DL: dl, LHS: Res, RHS: Op.getOperand(i: I), True: Res, False: Op.getOperand(i: I), Cond: CC);
11417 }
11418 return Res;
11419 }
11420 }
11421
11422 // If this is a lowered altivec predicate compare, CompareOpc is set to the
11423 // opcode number of the comparison.
11424 int CompareOpc;
11425 bool isDot;
11426 if (!getVectorCompareInfo(Intrin: Op, CompareOpc, isDot, Subtarget))
11427 return SDValue(); // Don't custom lower most intrinsics.
11428
11429 // If this is a non-dot comparison, make the VCMP node and we are done.
11430 if (!isDot) {
11431 SDValue Tmp = DAG.getNode(Opcode: PPCISD::VCMP, DL: dl, VT: Op.getOperand(i: 2).getValueType(),
11432 N1: Op.getOperand(i: 1), N2: Op.getOperand(i: 2),
11433 N3: DAG.getConstant(Val: CompareOpc, DL: dl, VT: MVT::i32));
11434 return DAG.getNode(Opcode: ISD::BITCAST, DL: dl, VT: Op.getValueType(), Operand: Tmp);
11435 }
11436
11437 // Create the PPCISD altivec 'dot' comparison node.
11438 SDValue Ops[] = {
11439 Op.getOperand(i: 2), // LHS
11440 Op.getOperand(i: 3), // RHS
11441 DAG.getConstant(Val: CompareOpc, DL: dl, VT: MVT::i32)
11442 };
11443 EVT VTs[] = { Op.getOperand(i: 2).getValueType(), MVT::Glue };
11444 SDValue CompNode = DAG.getNode(Opcode: PPCISD::VCMP_rec, DL: dl, ResultTys: VTs, Ops);
11445
11446 // Unpack the result based on how the target uses it.
11447 unsigned BitNo; // Bit # of CR6.
11448 bool InvertBit; // Invert result?
11449 unsigned Bitx;
11450 unsigned SetOp;
11451 switch (Op.getConstantOperandVal(i: 1)) {
11452 default: // Can't happen, don't crash on invalid number though.
11453 case 0: // Return the value of the EQ bit of CR6.
11454 BitNo = 0;
11455 InvertBit = false;
11456 Bitx = PPC::sub_eq;
11457 SetOp = PPCISD::SETBC;
11458 break;
11459 case 1: // Return the inverted value of the EQ bit of CR6.
11460 BitNo = 0;
11461 InvertBit = true;
11462 Bitx = PPC::sub_eq;
11463 SetOp = PPCISD::SETBCR;
11464 break;
11465 case 2: // Return the value of the LT bit of CR6.
11466 BitNo = 2;
11467 InvertBit = false;
11468 Bitx = PPC::sub_lt;
11469 SetOp = PPCISD::SETBC;
11470 break;
11471 case 3: // Return the inverted value of the LT bit of CR6.
11472 BitNo = 2;
11473 InvertBit = true;
11474 Bitx = PPC::sub_lt;
11475 SetOp = PPCISD::SETBCR;
11476 break;
11477 }
11478
11479 SDValue GlueOp = CompNode.getValue(R: 1);
11480 if (Subtarget.isISA3_1()) {
11481 SDValue SubRegIdx = DAG.getTargetConstant(Val: Bitx, DL: dl, VT: MVT::i32);
11482 SDValue CR6Reg = DAG.getRegister(Reg: PPC::CR6, VT: MVT::i32);
11483 SDValue CRBit =
11484 SDValue(DAG.getMachineNode(Opcode: TargetOpcode::EXTRACT_SUBREG, dl, VT: MVT::i1,
11485 Op1: CR6Reg, Op2: SubRegIdx, Op3: GlueOp),
11486 0);
11487 return DAG.getNode(Opcode: SetOp, DL: dl, VT: MVT::i32, Operand: CRBit);
11488 }
11489
11490 // Now that we have the comparison, emit a copy from the CR to a GPR.
11491 // This is flagged to the above dot comparison.
11492 SDValue Flags = DAG.getNode(Opcode: PPCISD::MFOCRF, DL: dl, VT: MVT::i32,
11493 N1: DAG.getRegister(Reg: PPC::CR6, VT: MVT::i32), N2: GlueOp);
11494
11495 // Shift the bit into the low position.
11496 Flags = DAG.getNode(Opcode: ISD::SRL, DL: dl, VT: MVT::i32, N1: Flags,
11497 N2: DAG.getConstant(Val: 8 - (3 - BitNo), DL: dl, VT: MVT::i32));
11498 // Isolate the bit.
11499 Flags = DAG.getNode(Opcode: ISD::AND, DL: dl, VT: MVT::i32, N1: Flags,
11500 N2: DAG.getConstant(Val: 1, DL: dl, VT: MVT::i32));
11501
11502 // If we are supposed to, toggle the bit.
11503 if (InvertBit)
11504 Flags = DAG.getNode(Opcode: ISD::XOR, DL: dl, VT: MVT::i32, N1: Flags,
11505 N2: DAG.getConstant(Val: 1, DL: dl, VT: MVT::i32));
11506 return Flags;
11507}
11508
11509SDValue PPCTargetLowering::LowerINTRINSIC_VOID(SDValue Op,
11510 SelectionDAG &DAG) const {
11511 // SelectionDAGBuilder::visitTargetIntrinsic may insert one extra chain at
11512 // the beginning of the argument list.
11513 int ArgStart = isa<ConstantSDNode>(Val: Op.getOperand(i: 0)) ? 0 : 1;
11514 SDLoc DL(Op);
11515 switch (Op.getConstantOperandVal(i: ArgStart)) {
11516 case Intrinsic::ppc_cfence: {
11517 assert(ArgStart == 1 && "llvm.ppc.cfence must carry a chain argument.");
11518 SDValue Val = Op.getOperand(i: ArgStart + 1);
11519 EVT Ty = Val.getValueType();
11520 if (Ty == MVT::i128) {
11521 // FIXME: Testing one of two paired registers is sufficient to guarantee
11522 // ordering?
11523 Val = DAG.getNode(Opcode: ISD::TRUNCATE, DL, VT: MVT::i64, Operand: Val);
11524 }
11525 unsigned Opcode = Subtarget.isPPC64() ? PPC::CFENCE8 : PPC::CFENCE;
11526 return SDValue(
11527 DAG.getMachineNode(
11528 Opcode, dl: DL, VT: MVT::Other,
11529 Op1: DAG.getNode(Opcode: ISD::ANY_EXTEND, DL, VT: Subtarget.getScalarIntVT(), Operand: Val),
11530 Op2: Op.getOperand(i: 0)),
11531 0);
11532 }
11533 case Intrinsic::ppc_mma_disassemble_dmr: {
11534 return DAG.getStore(Chain: DAG.getEntryNode(), dl: DL, Val: Op.getOperand(i: ArgStart + 2),
11535 Ptr: Op.getOperand(i: ArgStart + 1), PtrInfo: MachinePointerInfo());
11536 }
11537 case Intrinsic::ppc_amo_stwat:
11538 case Intrinsic::ppc_amo_stdat: {
11539 SDLoc dl(Op);
11540 SDValue Chain = Op.getOperand(i: 0);
11541 SDValue Ptr = Op.getOperand(i: ArgStart + 1);
11542 SDValue Val = Op.getOperand(i: ArgStart + 2);
11543 SDValue FC = Op.getOperand(i: ArgStart + 3);
11544
11545 return DAG.getNode(Opcode: PPCISD::STAT, DL: dl, VT: MVT::Other, N1: Chain, N2: Val, N3: Ptr, N4: FC);
11546 }
11547 default:
11548 break;
11549 }
11550 return SDValue();
11551}
11552
11553// Lower scalar BSWAP64 to xxbrd.
11554SDValue PPCTargetLowering::LowerBSWAP(SDValue Op, SelectionDAG &DAG) const {
11555 SDLoc dl(Op);
11556 if (!Subtarget.isPPC64())
11557 return Op;
11558 // MTVSRDD
11559 Op = DAG.getNode(Opcode: ISD::BUILD_VECTOR, DL: dl, VT: MVT::v2i64, N1: Op.getOperand(i: 0),
11560 N2: Op.getOperand(i: 0));
11561 // XXBRD
11562 Op = DAG.getNode(Opcode: ISD::BSWAP, DL: dl, VT: MVT::v2i64, Operand: Op);
11563 // MFVSRD
11564 int VectorIndex = 0;
11565 if (Subtarget.isLittleEndian())
11566 VectorIndex = 1;
11567 Op = DAG.getNode(Opcode: ISD::EXTRACT_VECTOR_ELT, DL: dl, VT: MVT::i64, N1: Op,
11568 N2: DAG.getTargetConstant(Val: VectorIndex, DL: dl, VT: MVT::i32));
11569 return Op;
11570}
11571
11572// ATOMIC_CMP_SWAP for i8/i16 needs to zero-extend its input since it will be
11573// compared to a value that is atomically loaded (atomic loads zero-extend).
11574SDValue PPCTargetLowering::LowerATOMIC_CMP_SWAP(SDValue Op,
11575 SelectionDAG &DAG) const {
11576 assert(Op.getOpcode() == ISD::ATOMIC_CMP_SWAP &&
11577 "Expecting an atomic compare-and-swap here.");
11578 SDLoc dl(Op);
11579 auto *AtomicNode = cast<AtomicSDNode>(Val: Op.getNode());
11580 EVT MemVT = AtomicNode->getMemoryVT();
11581 if (MemVT.getSizeInBits() >= 32)
11582 return Op;
11583
11584 SDValue CmpOp = Op.getOperand(i: 2);
11585 // If this is already correctly zero-extended, leave it alone.
11586 auto HighBits = APInt::getHighBitsSet(numBits: 32, hiBitsSet: 32 - MemVT.getSizeInBits());
11587 if (DAG.MaskedValueIsZero(Op: CmpOp, Mask: HighBits))
11588 return Op;
11589
11590 // Clear the high bits of the compare operand.
11591 unsigned MaskVal = (1 << MemVT.getSizeInBits()) - 1;
11592 SDValue NewCmpOp =
11593 DAG.getNode(Opcode: ISD::AND, DL: dl, VT: MVT::i32, N1: CmpOp,
11594 N2: DAG.getConstant(Val: MaskVal, DL: dl, VT: MVT::i32));
11595
11596 // Replace the existing compare operand with the properly zero-extended one.
11597 SmallVector<SDValue, 4> Ops;
11598 for (int i = 0, e = AtomicNode->getNumOperands(); i < e; i++)
11599 Ops.push_back(Elt: AtomicNode->getOperand(Num: i));
11600 Ops[2] = NewCmpOp;
11601 MachineMemOperand *MMO = AtomicNode->getMemOperand();
11602 SDVTList Tys = DAG.getVTList(VT1: MVT::i32, VT2: MVT::Other);
11603 auto NodeTy =
11604 (MemVT == MVT::i8) ? PPCISD::ATOMIC_CMP_SWAP_8 : PPCISD::ATOMIC_CMP_SWAP_16;
11605 return DAG.getMemIntrinsicNode(Opcode: NodeTy, dl, VTList: Tys, Ops, MemVT, MMO);
11606}
11607
11608SDValue PPCTargetLowering::LowerATOMIC_LOAD_STORE(SDValue Op,
11609 SelectionDAG &DAG) const {
11610 AtomicSDNode *N = cast<AtomicSDNode>(Val: Op.getNode());
11611 EVT MemVT = N->getMemoryVT();
11612 assert(MemVT.getSimpleVT() == MVT::i128 &&
11613 "Expect quadword atomic operations");
11614 SDLoc dl(N);
11615 unsigned Opc = N->getOpcode();
11616 switch (Opc) {
11617 case ISD::ATOMIC_LOAD: {
11618    // Lower a quadword atomic load to int_ppc_atomic_load_i128, which will be
11619    // lowered to PPC instructions by the pattern-matching instruction selector.
11620 SDVTList Tys = DAG.getVTList(VT1: MVT::i64, VT2: MVT::i64, VT3: MVT::Other);
11621 SmallVector<SDValue, 4> Ops{
11622 N->getOperand(Num: 0),
11623 DAG.getConstant(Val: Intrinsic::ppc_atomic_load_i128, DL: dl, VT: MVT::i32)};
11624 for (int I = 1, E = N->getNumOperands(); I < E; ++I)
11625 Ops.push_back(Elt: N->getOperand(Num: I));
11626 SDValue LoadedVal = DAG.getMemIntrinsicNode(Opcode: ISD::INTRINSIC_W_CHAIN, dl, VTList: Tys,
11627 Ops, MemVT, MMO: N->getMemOperand());
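    // Reassemble the two 64-bit halves returned by the intrinsic into an i128:
    // zero-extend both, shift the high half left by 64 and OR them together.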
11628 SDValue ValLo = DAG.getNode(Opcode: ISD::ZERO_EXTEND, DL: dl, VT: MVT::i128, Operand: LoadedVal);
11629 SDValue ValHi =
11630 DAG.getNode(Opcode: ISD::ZERO_EXTEND, DL: dl, VT: MVT::i128, Operand: LoadedVal.getValue(R: 1));
11631 ValHi = DAG.getNode(Opcode: ISD::SHL, DL: dl, VT: MVT::i128, N1: ValHi,
11632 N2: DAG.getConstant(Val: 64, DL: dl, VT: MVT::i32));
11633 SDValue Val =
11634 DAG.getNode(Opcode: ISD::OR, DL: dl, ResultTys: {MVT::i128, MVT::Other}, Ops: {ValLo, ValHi});
11635 return DAG.getNode(Opcode: ISD::MERGE_VALUES, DL: dl, ResultTys: {MVT::i128, MVT::Other},
11636 Ops: {Val, LoadedVal.getValue(R: 2)});
11637 }
11638 case ISD::ATOMIC_STORE: {
11639    // Lower a quadword atomic store to int_ppc_atomic_store_i128, which will be
11640    // lowered to PPC instructions by the pattern-matching instruction selector.
11641 SDVTList Tys = DAG.getVTList(VT: MVT::Other);
11642 SmallVector<SDValue, 4> Ops{
11643 N->getOperand(Num: 0),
11644 DAG.getConstant(Val: Intrinsic::ppc_atomic_store_i128, DL: dl, VT: MVT::i32)};
11645 SDValue Val = N->getOperand(Num: 1);
11646 SDValue ValLo = DAG.getNode(Opcode: ISD::TRUNCATE, DL: dl, VT: MVT::i64, Operand: Val);
11647 SDValue ValHi = DAG.getNode(Opcode: ISD::SRL, DL: dl, VT: MVT::i128, N1: Val,
11648 N2: DAG.getConstant(Val: 64, DL: dl, VT: MVT::i32));
11649 ValHi = DAG.getNode(Opcode: ISD::TRUNCATE, DL: dl, VT: MVT::i64, Operand: ValHi);
11650 Ops.push_back(Elt: ValLo);
11651 Ops.push_back(Elt: ValHi);
11652 Ops.push_back(Elt: N->getOperand(Num: 2));
11653 return DAG.getMemIntrinsicNode(Opcode: ISD::INTRINSIC_VOID, dl, VTList: Tys, Ops, MemVT,
11654 MMO: N->getMemOperand());
11655 }
11656 default:
11657 llvm_unreachable("Unexpected atomic opcode");
11658 }
11659}
11660
11661static SDValue getDataClassTest(SDValue Op, FPClassTest Mask, const SDLoc &Dl,
11662 SelectionDAG &DAG,
11663 const PPCSubtarget &Subtarget) {
11664 assert(Mask <= fcAllFlags && "Invalid fp_class flags!");
11665
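  // These values mirror the DCMX (data class mask) bits consumed by the
  // xststdc[sp|dp|qp] test-data-class instructions.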
11666 enum DataClassMask {
11667 DC_NAN = 1 << 6,
11668 DC_NEG_INF = 1 << 4,
11669 DC_POS_INF = 1 << 5,
11670 DC_NEG_ZERO = 1 << 2,
11671 DC_POS_ZERO = 1 << 3,
11672 DC_NEG_SUBNORM = 1,
11673 DC_POS_SUBNORM = 1 << 1,
11674 };
11675
11676 EVT VT = Op.getValueType();
11677
11678 unsigned TestOp = VT == MVT::f128 ? PPC::XSTSTDCQP
11679 : VT == MVT::f64 ? PPC::XSTSTDCDP
11680 : PPC::XSTSTDCSP;
11681
11682 if (Mask == fcAllFlags)
11683 return DAG.getBoolConstant(V: true, DL: Dl, VT: MVT::i1, OpVT: VT);
11684 if (Mask == 0)
11685 return DAG.getBoolConstant(V: false, DL: Dl, VT: MVT::i1, OpVT: VT);
11686
11687  // If it's cheaper or necessary, test the inverted flags and negate.
11688 if ((Mask & fcNormal) == fcNormal || Mask == ~fcQNan || Mask == ~fcSNan) {
11689 SDValue Rev = getDataClassTest(Op, Mask: ~Mask, Dl, DAG, Subtarget);
11690 return DAG.getNOT(DL: Dl, Val: Rev, VT: MVT::i1);
11691 }
11692
11693  // PowerPC can't test whether a value is 'normal' directly. Test everything
11694  // else first, then check for 'not not-normal' with the expected sign.
11695 if (Mask & fcNormal) {
11696 SDValue Rev(DAG.getMachineNode(
11697 Opcode: TestOp, dl: Dl, VT: MVT::i32,
11698 Op1: DAG.getTargetConstant(Val: DC_NAN | DC_NEG_INF | DC_POS_INF |
11699 DC_NEG_ZERO | DC_POS_ZERO |
11700 DC_NEG_SUBNORM | DC_POS_SUBNORM,
11701 DL: Dl, VT: MVT::i32),
11702 Op2: Op),
11703 0);
11704    // The sign is stored in CR bit 0 (LT); the result is in CR bit 2 (EQ).
11705 SDValue Sign(
11706 DAG.getMachineNode(Opcode: TargetOpcode::EXTRACT_SUBREG, dl: Dl, VT: MVT::i1, Op1: Rev,
11707 Op2: DAG.getTargetConstant(Val: PPC::sub_lt, DL: Dl, VT: MVT::i32)),
11708 0);
11709 SDValue Normal(DAG.getNOT(
11710 DL: Dl,
11711 Val: SDValue(DAG.getMachineNode(
11712 Opcode: TargetOpcode::EXTRACT_SUBREG, dl: Dl, VT: MVT::i1, Op1: Rev,
11713 Op2: DAG.getTargetConstant(Val: PPC::sub_eq, DL: Dl, VT: MVT::i32)),
11714 0),
11715 VT: MVT::i1));
11716 if (Mask & fcPosNormal)
11717 Sign = DAG.getNOT(DL: Dl, Val: Sign, VT: MVT::i1);
11718 SDValue Result = DAG.getNode(Opcode: ISD::AND, DL: Dl, VT: MVT::i1, N1: Sign, N2: Normal);
11719 if (Mask == fcPosNormal || Mask == fcNegNormal)
11720 return Result;
11721
11722 return DAG.getNode(
11723 Opcode: ISD::OR, DL: Dl, VT: MVT::i1,
11724 N1: getDataClassTest(Op, Mask: Mask & ~fcNormal, Dl, DAG, Subtarget), N2: Result);
11725 }
11726
11727  // The instruction doesn't distinguish signaling from quiet NaNs. Test the
11728  // rest first, then check whether it 'is NaN and is signaling/quiet'.
11729 if ((Mask & fcNan) == fcQNan || (Mask & fcNan) == fcSNan) {
11730 bool IsQuiet = Mask & fcQNan;
11731 SDValue NanCheck = getDataClassTest(Op, Mask: fcNan, Dl, DAG, Subtarget);
11732
11733    // Quietness is determined by the first bit of the fraction field.
11734 uint64_t QuietMask = 0;
11735 SDValue HighWord;
11736 if (VT == MVT::f128) {
11737 HighWord = DAG.getNode(
11738 Opcode: ISD::EXTRACT_VECTOR_ELT, DL: Dl, VT: MVT::i32, N1: DAG.getBitcast(VT: MVT::v4i32, V: Op),
11739 N2: DAG.getVectorIdxConstant(Val: Subtarget.isLittleEndian() ? 3 : 0, DL: Dl));
11740 QuietMask = 0x8000;
11741 } else if (VT == MVT::f64) {
11742 if (Subtarget.isPPC64()) {
11743 HighWord = DAG.getNode(Opcode: ISD::EXTRACT_ELEMENT, DL: Dl, VT: MVT::i32,
11744 N1: DAG.getBitcast(VT: MVT::i64, V: Op),
11745 N2: DAG.getConstant(Val: 1, DL: Dl, VT: MVT::i32));
11746 } else {
11747 SDValue Vec = DAG.getBitcast(
11748 VT: MVT::v4i32, V: DAG.getNode(Opcode: ISD::SCALAR_TO_VECTOR, DL: Dl, VT: MVT::v2f64, Operand: Op));
11749 HighWord = DAG.getNode(
11750 Opcode: ISD::EXTRACT_VECTOR_ELT, DL: Dl, VT: MVT::i32, N1: Vec,
11751 N2: DAG.getVectorIdxConstant(Val: Subtarget.isLittleEndian() ? 1 : 0, DL: Dl));
11752 }
11753 QuietMask = 0x80000;
11754 } else if (VT == MVT::f32) {
11755 HighWord = DAG.getBitcast(VT: MVT::i32, V: Op);
11756 QuietMask = 0x400000;
11757 }
11758 SDValue NanRes = DAG.getSetCC(
11759 DL: Dl, VT: MVT::i1,
11760 LHS: DAG.getNode(Opcode: ISD::AND, DL: Dl, VT: MVT::i32, N1: HighWord,
11761 N2: DAG.getConstant(Val: QuietMask, DL: Dl, VT: MVT::i32)),
11762 RHS: DAG.getConstant(Val: 0, DL: Dl, VT: MVT::i32), Cond: IsQuiet ? ISD::SETNE : ISD::SETEQ);
11763 NanRes = DAG.getNode(Opcode: ISD::AND, DL: Dl, VT: MVT::i1, N1: NanCheck, N2: NanRes);
11764 if (Mask == fcQNan || Mask == fcSNan)
11765 return NanRes;
11766
11767 return DAG.getNode(Opcode: ISD::OR, DL: Dl, VT: MVT::i1,
11768 N1: getDataClassTest(Op, Mask: Mask & ~fcNan, Dl, DAG, Subtarget),
11769 N2: NanRes);
11770 }
11771
11772 unsigned NativeMask = 0;
11773 if ((Mask & fcNan) == fcNan)
11774 NativeMask |= DC_NAN;
11775 if (Mask & fcNegInf)
11776 NativeMask |= DC_NEG_INF;
11777 if (Mask & fcPosInf)
11778 NativeMask |= DC_POS_INF;
11779 if (Mask & fcNegZero)
11780 NativeMask |= DC_NEG_ZERO;
11781 if (Mask & fcPosZero)
11782 NativeMask |= DC_POS_ZERO;
11783 if (Mask & fcNegSubnormal)
11784 NativeMask |= DC_NEG_SUBNORM;
11785 if (Mask & fcPosSubnormal)
11786 NativeMask |= DC_POS_SUBNORM;
11787 return SDValue(
11788 DAG.getMachineNode(
11789 Opcode: TargetOpcode::EXTRACT_SUBREG, dl: Dl, VT: MVT::i1,
11790 Op1: SDValue(DAG.getMachineNode(
11791 Opcode: TestOp, dl: Dl, VT: MVT::i32,
11792 Op1: DAG.getTargetConstant(Val: NativeMask, DL: Dl, VT: MVT::i32), Op2: Op),
11793 0),
11794 Op2: DAG.getTargetConstant(Val: PPC::sub_eq, DL: Dl, VT: MVT::i32)),
11795 0);
11796}
11797
11798SDValue PPCTargetLowering::LowerIS_FPCLASS(SDValue Op,
11799 SelectionDAG &DAG) const {
11800 assert(Subtarget.hasP9Vector() && "Test data class requires Power9");
11801 SDValue LHS = Op.getOperand(i: 0);
11802 uint64_t RHSC = Op.getConstantOperandVal(i: 1);
11803 SDLoc Dl(Op);
11804 FPClassTest Category = static_cast<FPClassTest>(RHSC);
11805 if (LHS.getValueType() == MVT::ppcf128) {
11806 // The higher part determines the value class.
11807 LHS = DAG.getNode(Opcode: ISD::EXTRACT_ELEMENT, DL: Dl, VT: MVT::f64, N1: LHS,
11808 N2: DAG.getConstant(Val: 1, DL: Dl, VT: MVT::i32));
11809 }
11810
11811 return getDataClassTest(Op: LHS, Mask: Category, Dl, DAG, Subtarget);
11812}
11813
11814// Adjust the length value for a load/store with length to account for
11815// instructions that require a left-justified length, and for non-byte element
11816// types that require scaling by the element size.
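// For example, a v8i16 vp.load lowered to lxvl with an i64 element count:
//   TypeAdj = log2(16 / 8) = 1   (scale elements to bytes)
//   LeftAdj = 64 - 8 = 56        (lxvl expects the byte count in the top byte)
// so the length operand is shifted left by 57.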
11817static SDValue AdjustLength(SDValue Val, unsigned Bits, bool Left,
11818 SelectionDAG &DAG) {
11819 SDLoc dl(Val);
11820 EVT VT = Val->getValueType(ResNo: 0);
11821 unsigned LeftAdj = Left ? VT.getSizeInBits() - 8 : 0;
11822 unsigned TypeAdj = llvm::countr_zero<uint32_t>(Val: Bits / 8);
11823 SDValue SHLAmt = DAG.getConstant(Val: LeftAdj + TypeAdj, DL: dl, VT);
11824 return DAG.getNode(Opcode: ISD::SHL, DL: dl, VT, N1: Val, N2: SHLAmt);
11825}
11826
11827SDValue PPCTargetLowering::LowerVP_LOAD(SDValue Op, SelectionDAG &DAG) const {
11828 auto VPLD = cast<VPLoadSDNode>(Val&: Op);
11829 bool Future = Subtarget.isISAFuture();
11830 SDLoc dl(Op);
11831 assert(ISD::isConstantSplatVectorAllOnes(Op->getOperand(3).getNode(), true) &&
11832 "Mask predication not supported");
11833 EVT PtrVT = getPointerTy(DL: DAG.getDataLayout());
11834 SDValue Len = DAG.getNode(Opcode: ISD::ANY_EXTEND, DL: dl, VT: PtrVT, Operand: VPLD->getOperand(Num: 4));
11835 unsigned IID = Future ? Intrinsic::ppc_vsx_lxvrl : Intrinsic::ppc_vsx_lxvl;
11836 unsigned EltBits = Op->getValueType(ResNo: 0).getScalarType().getSizeInBits();
11837 Len = AdjustLength(Val: Len, Bits: EltBits, Left: !Future, DAG);
11838 SDValue Ops[] = {VPLD->getChain(), DAG.getConstant(Val: IID, DL: dl, VT: MVT::i32),
11839 VPLD->getOperand(Num: 1), Len};
11840 SDVTList Tys = DAG.getVTList(VT1: Op->getValueType(ResNo: 0), VT2: MVT::Other);
11841 SDValue VPL =
11842 DAG.getMemIntrinsicNode(Opcode: ISD::INTRINSIC_W_CHAIN, dl, VTList: Tys, Ops,
11843 MemVT: VPLD->getMemoryVT(), MMO: VPLD->getMemOperand());
11844 return VPL;
11845}
11846
11847SDValue PPCTargetLowering::LowerVP_STORE(SDValue Op, SelectionDAG &DAG) const {
11848 auto VPST = cast<VPStoreSDNode>(Val&: Op);
11849 assert(ISD::isConstantSplatVectorAllOnes(Op->getOperand(4).getNode(), true) &&
11850 "Mask predication not supported");
11851 EVT PtrVT = getPointerTy(DL: DAG.getDataLayout());
11852 SDLoc dl(Op);
11853 SDValue Len = DAG.getNode(Opcode: ISD::ANY_EXTEND, DL: dl, VT: PtrVT, Operand: VPST->getOperand(Num: 5));
11854 unsigned EltBits =
11855 Op->getOperand(Num: 1).getValueType().getScalarType().getSizeInBits();
11856 bool Future = Subtarget.isISAFuture();
11857 unsigned IID = Future ? Intrinsic::ppc_vsx_stxvrl : Intrinsic::ppc_vsx_stxvl;
11858 Len = AdjustLength(Val: Len, Bits: EltBits, Left: !Future, DAG);
11859 SDValue Ops[] = {
11860 VPST->getChain(), DAG.getConstant(Val: IID, DL: dl, VT: MVT::i32),
11861 DAG.getNode(Opcode: ISD::BITCAST, DL: dl, VT: MVT::v4i32, Operand: VPST->getOperand(Num: 1)),
11862 VPST->getOperand(Num: 2), Len};
11863 SDVTList Tys = DAG.getVTList(VT: MVT::Other);
11864 SDValue VPS =
11865 DAG.getMemIntrinsicNode(Opcode: ISD::INTRINSIC_VOID, dl, VTList: Tys, Ops,
11866 MemVT: VPST->getMemoryVT(), MMO: VPST->getMemOperand());
11867 return VPS;
11868}
11869
11870SDValue PPCTargetLowering::LowerSCALAR_TO_VECTOR(SDValue Op,
11871 SelectionDAG &DAG) const {
11872 SDLoc dl(Op);
11873
11874 MachineFunction &MF = DAG.getMachineFunction();
11875 SDValue Op0 = Op.getOperand(i: 0);
11876 EVT ValVT = Op0.getValueType();
11877 unsigned EltSize = Op.getValueType().getScalarSizeInBits();
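  // Constants that fit in a signed 5-bit immediate can be materialized directly
  // with a canonical vector splat (vspltis[bhw]).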
11878 if (isa<ConstantSDNode>(Val: Op0) && EltSize <= 32) {
11879 int64_t IntVal = Op.getConstantOperandVal(i: 0);
11880 if (IntVal >= -16 && IntVal <= 15)
11881 return getCanonicalConstSplat(Val: IntVal, SplatSize: EltSize / 8, VT: Op.getValueType(), DAG,
11882 dl);
11883 }
11884
11885 ReuseLoadInfo RLI;
11886 if (Subtarget.hasLFIWAX() && Subtarget.hasVSX() &&
11887 Op.getValueType() == MVT::v4i32 && Op0.getOpcode() == ISD::LOAD &&
11888 Op0.getValueType() == MVT::i32 && Op0.hasOneUse() &&
11889 canReuseLoadAddress(Op: Op0, MemVT: MVT::i32, RLI, DAG, ET: ISD::NON_EXTLOAD)) {
11890
11891 MachineMemOperand *MMO =
11892 MF.getMachineMemOperand(PtrInfo: RLI.MPI, F: MachineMemOperand::MOLoad, Size: 4,
11893 BaseAlignment: RLI.Alignment, AAInfo: RLI.AAInfo, Ranges: RLI.Ranges);
11894 SDValue Ops[] = {RLI.Chain, RLI.Ptr, DAG.getValueType(Op.getValueType())};
11895 SDValue Bits = DAG.getMemIntrinsicNode(
11896 Opcode: PPCISD::LD_SPLAT, dl, VTList: DAG.getVTList(VT1: MVT::v4i32, VT2: MVT::Other), Ops,
11897 MemVT: MVT::i32, MMO);
11898 if (RLI.ResChain)
11899 DAG.makeEquivalentMemoryOrdering(OldChain: RLI.ResChain, NewMemOpChain: Bits.getValue(R: 1));
11900 return Bits.getValue(R: 0);
11901 }
11902
11903 // Create a stack slot that is 16-byte aligned.
11904 MachineFrameInfo &MFI = MF.getFrameInfo();
11905 int FrameIdx = MFI.CreateStackObject(Size: 16, Alignment: Align(16), isSpillSlot: false);
11906 EVT PtrVT = getPointerTy(DL: DAG.getDataLayout());
11907 SDValue FIdx = DAG.getFrameIndex(FI: FrameIdx, VT: PtrVT);
11908
11909 SDValue Val = Op0;
11910 // P10 hardware store forwarding requires that a single store contains all
11911 // the data for the load. P10 is able to merge a pair of adjacent stores. Try
11912 // to avoid load hit store on P10 when running binaries compiled for older
11913 // processors by generating two mergeable scalar stores to forward with the
11914 // vector load.
11915 if (!DisableP10StoreForward && Subtarget.isPPC64() &&
11916 !Subtarget.isLittleEndian() && ValVT.isInteger() &&
11917 ValVT.getSizeInBits() <= 64) {
11918 Val = DAG.getNode(Opcode: ISD::ANY_EXTEND, DL: dl, VT: MVT::i64, Operand: Val);
11919 EVT ShiftAmountTy = getShiftAmountTy(LHSTy: MVT::i64, DL: DAG.getDataLayout());
11920 SDValue ShiftBy = DAG.getConstant(
11921 Val: 64 - Op.getValueType().getScalarSizeInBits(), DL: dl, VT: ShiftAmountTy);
11922 Val = DAG.getNode(Opcode: ISD::SHL, DL: dl, VT: MVT::i64, N1: Val, N2: ShiftBy);
11923 SDValue Plus8 =
11924 DAG.getNode(Opcode: ISD::ADD, DL: dl, VT: PtrVT, N1: FIdx, N2: DAG.getConstant(Val: 8, DL: dl, VT: PtrVT));
11925 SDValue Store2 =
11926 DAG.getStore(Chain: DAG.getEntryNode(), dl, Val, Ptr: Plus8, PtrInfo: MachinePointerInfo());
11927 SDValue Store = DAG.getStore(Chain: Store2, dl, Val, Ptr: FIdx, PtrInfo: MachinePointerInfo());
11928 return DAG.getLoad(VT: Op.getValueType(), dl, Chain: Store, Ptr: FIdx,
11929 PtrInfo: MachinePointerInfo());
11930 }
11931
11932 // Store the input value into Value#0 of the stack slot.
11933 SDValue Store =
11934 DAG.getStore(Chain: DAG.getEntryNode(), dl, Val, Ptr: FIdx, PtrInfo: MachinePointerInfo());
11935 // Load it out.
11936 return DAG.getLoad(VT: Op.getValueType(), dl, Chain: Store, Ptr: FIdx, PtrInfo: MachinePointerInfo());
11937}
11938
11939SDValue PPCTargetLowering::LowerINSERT_VECTOR_ELT(SDValue Op,
11940 SelectionDAG &DAG) const {
11941 assert(Op.getOpcode() == ISD::INSERT_VECTOR_ELT &&
11942 "Should only be called for ISD::INSERT_VECTOR_ELT");
11943
11944 ConstantSDNode *C = dyn_cast<ConstantSDNode>(Val: Op.getOperand(i: 2));
11945
11946 EVT VT = Op.getValueType();
11947 SDLoc dl(Op);
11948 SDValue V1 = Op.getOperand(i: 0);
11949 SDValue V2 = Op.getOperand(i: 1);
11950
11951 if (VT == MVT::v2f64 && C)
11952 return Op;
11953
11954 if (Subtarget.hasP9Vector()) {
11955    // An f32 load feeding into a v4f32 insert_vector_elt is handled in this way
11956 // because on P10, it allows this specific insert_vector_elt load pattern to
11957 // utilize the refactored load and store infrastructure in order to exploit
11958 // prefixed loads.
11959 // On targets with inexpensive direct moves (Power9 and up), a
11960 // (insert_vector_elt v4f32:$vec, (f32 load)) is always better as an integer
11961 // load since a single precision load will involve conversion to double
11962 // precision on the load followed by another conversion to single precision.
11963 if ((VT == MVT::v4f32) && (V2.getValueType() == MVT::f32) &&
11964 (isa<LoadSDNode>(Val: V2))) {
11965 SDValue BitcastVector = DAG.getBitcast(VT: MVT::v4i32, V: V1);
11966 SDValue BitcastLoad = DAG.getBitcast(VT: MVT::i32, V: V2);
11967 SDValue InsVecElt =
11968 DAG.getNode(Opcode: ISD::INSERT_VECTOR_ELT, DL: dl, VT: MVT::v4i32, N1: BitcastVector,
11969 N2: BitcastLoad, N3: Op.getOperand(i: 2));
11970 return DAG.getBitcast(VT: MVT::v4f32, V: InsVecElt);
11971 }
11972 }
11973
11974 if (Subtarget.isISA3_1()) {
11975 if ((VT == MVT::v2i64 || VT == MVT::v2f64) && !Subtarget.isPPC64())
11976 return SDValue();
11977 // On P10, we have legal lowering for constant and variable indices for
11978 // all vectors.
11979 if (VT == MVT::v16i8 || VT == MVT::v8i16 || VT == MVT::v4i32 ||
11980 VT == MVT::v2i64 || VT == MVT::v4f32 || VT == MVT::v2f64)
11981 return Op;
11982 }
11983
11984 // Before P10, we have legal lowering for constant indices but not for
11985 // variable ones.
11986 if (!C)
11987 return SDValue();
11988
11989 // We can use MTVSRZ + VECINSERT for v8i16 and v16i8 types.
11990 if (VT == MVT::v8i16 || VT == MVT::v16i8) {
11991 SDValue Mtvsrz = DAG.getNode(Opcode: PPCISD::MTVSRZ, DL: dl, VT, Operand: V2);
11992 unsigned BytesInEachElement = VT.getVectorElementType().getSizeInBits() / 8;
11993 unsigned InsertAtElement = C->getZExtValue();
11994 unsigned InsertAtByte = InsertAtElement * BytesInEachElement;
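    // VECINSERT takes a byte index in big-endian element order; mirror the
    // index when targeting little-endian.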
11995 if (Subtarget.isLittleEndian()) {
11996 InsertAtByte = (16 - BytesInEachElement) - InsertAtByte;
11997 }
11998 return DAG.getNode(Opcode: PPCISD::VECINSERT, DL: dl, VT, N1: V1, N2: Mtvsrz,
11999 N3: DAG.getConstant(Val: InsertAtByte, DL: dl, VT: MVT::i32));
12000 }
12001 return Op;
12002}
12003
12004SDValue PPCTargetLowering::LowerDMFVectorLoad(SDValue Op,
12005 SelectionDAG &DAG) const {
12006 SDLoc dl(Op);
12007 LoadSDNode *LN = cast<LoadSDNode>(Val: Op.getNode());
12008 SDValue LoadChain = LN->getChain();
12009 SDValue BasePtr = LN->getBasePtr();
12010 EVT VT = Op.getValueType();
12011 bool IsV1024i1 = VT == MVT::v1024i1;
12012 bool IsV2048i1 = VT == MVT::v2048i1;
12013
12014 // The types v1024i1 and v2048i1 are used for Dense Math dmr registers and
12015 // Dense Math dmr pair registers, respectively.
12016 assert((IsV1024i1 || IsV2048i1) && "Unsupported type.");
12017 (void)IsV2048i1;
12018 assert((Subtarget.hasMMA() && Subtarget.isISAFuture()) &&
12019 "Dense Math support required.");
12020 assert(Subtarget.pairedVectorMemops() && "Vector pair support required.");
12021
12022 SmallVector<SDValue, 8> Loads;
12023 SmallVector<SDValue, 8> LoadChains;
12024
12025 SDValue IntrinID = DAG.getConstant(Val: Intrinsic::ppc_vsx_lxvp, DL: dl, VT: MVT::i32);
12026 SDValue LoadOps[] = {LoadChain, IntrinID, BasePtr};
12027 MachineMemOperand *MMO = LN->getMemOperand();
12028 unsigned NumVecs = VT.getSizeInBits() / 256;
12029 for (unsigned Idx = 0; Idx < NumVecs; ++Idx) {
12030 MachineMemOperand *NewMMO =
12031 DAG.getMachineFunction().getMachineMemOperand(MMO, Offset: Idx * 32, Size: 32);
12032 if (Idx > 0) {
12033 BasePtr = DAG.getNode(Opcode: ISD::ADD, DL: dl, VT: BasePtr.getValueType(), N1: BasePtr,
12034 N2: DAG.getConstant(Val: 32, DL: dl, VT: BasePtr.getValueType()));
12035 LoadOps[2] = BasePtr;
12036 }
12037 SDValue Ld = DAG.getMemIntrinsicNode(Opcode: ISD::INTRINSIC_W_CHAIN, dl,
12038 VTList: DAG.getVTList(VT1: MVT::v256i1, VT2: MVT::Other),
12039 Ops: LoadOps, MemVT: MVT::v256i1, MMO: NewMMO);
12040 LoadChains.push_back(Elt: Ld.getValue(R: 1));
12041 Loads.push_back(Elt: Ld);
12042 }
12043
12044 if (Subtarget.isLittleEndian()) {
12045 std::reverse(first: Loads.begin(), last: Loads.end());
12046 std::reverse(first: LoadChains.begin(), last: LoadChains.end());
12047 }
12048
12049 SDValue TF = DAG.getNode(Opcode: ISD::TokenFactor, DL: dl, VT: MVT::Other, Ops: LoadChains);
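  // Combine the four 256-bit pair loads into the low and high 512-bit halves
  // of the dmr and assemble the full dmr with a REG_SEQUENCE.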
12050 SDValue Lo =
12051 DAG.getNode(Opcode: PPCISD::INST512, DL: dl, VT: MVT::v512i1, N1: Loads[0], N2: Loads[1]);
12052 SDValue LoSub = DAG.getTargetConstant(Val: PPC::sub_wacc_lo, DL: dl, VT: MVT::i32);
12053 SDValue Hi =
12054 DAG.getNode(Opcode: PPCISD::INST512HI, DL: dl, VT: MVT::v512i1, N1: Loads[2], N2: Loads[3]);
12055 SDValue HiSub = DAG.getTargetConstant(Val: PPC::sub_wacc_hi, DL: dl, VT: MVT::i32);
12056 SDValue RC = DAG.getTargetConstant(Val: PPC::DMRRCRegClassID, DL: dl, VT: MVT::i32);
12057 const SDValue Ops[] = {RC, Lo, LoSub, Hi, HiSub};
12058
12059 SDValue Value =
12060 SDValue(DAG.getMachineNode(Opcode: PPC::REG_SEQUENCE, dl, VT: MVT::v1024i1, Ops), 0);
12061
12062 if (IsV1024i1) {
12063 return DAG.getMergeValues(Ops: {Value, TF}, dl);
12064 }
12065
12066 // Handle Loads for V2048i1 which represents a dmr pair.
12067 SDValue DmrPValue;
12068 SDValue Dmr1Lo =
12069 DAG.getNode(Opcode: PPCISD::INST512, DL: dl, VT: MVT::v512i1, N1: Loads[4], N2: Loads[5]);
12070 SDValue Dmr1Hi =
12071 DAG.getNode(Opcode: PPCISD::INST512HI, DL: dl, VT: MVT::v512i1, N1: Loads[6], N2: Loads[7]);
12072 const SDValue Dmr1Ops[] = {RC, Dmr1Lo, LoSub, Dmr1Hi, HiSub};
12073 SDValue Dmr1Value = SDValue(
12074 DAG.getMachineNode(Opcode: PPC::REG_SEQUENCE, dl, VT: MVT::v1024i1, Ops: Dmr1Ops), 0);
12075
12076 SDValue Dmr0Sub = DAG.getTargetConstant(Val: PPC::sub_dmr0, DL: dl, VT: MVT::i32);
12077 SDValue Dmr1Sub = DAG.getTargetConstant(Val: PPC::sub_dmr1, DL: dl, VT: MVT::i32);
12078
12079 SDValue DmrPRC = DAG.getTargetConstant(Val: PPC::DMRpRCRegClassID, DL: dl, VT: MVT::i32);
12080 const SDValue DmrPOps[] = {DmrPRC, Value, Dmr0Sub, Dmr1Value, Dmr1Sub};
12081
12082 DmrPValue = SDValue(
12083 DAG.getMachineNode(Opcode: PPC::REG_SEQUENCE, dl, VT: MVT::v2048i1, Ops: DmrPOps), 0);
12084
12085 return DAG.getMergeValues(Ops: {DmrPValue, TF}, dl);
12086}
12087
12088SDValue PPCTargetLowering::DMFInsert1024(const SmallVectorImpl<SDValue> &Pairs,
12089 const SDLoc &dl,
12090 SelectionDAG &DAG) const {
12091 SDValue Lo =
12092 DAG.getNode(Opcode: PPCISD::INST512, DL: dl, VT: MVT::v512i1, N1: Pairs[0], N2: Pairs[1]);
12093 SDValue LoSub = DAG.getTargetConstant(Val: PPC::sub_wacc_lo, DL: dl, VT: MVT::i32);
12094 SDValue Hi =
12095 DAG.getNode(Opcode: PPCISD::INST512HI, DL: dl, VT: MVT::v512i1, N1: Pairs[2], N2: Pairs[3]);
12096 SDValue HiSub = DAG.getTargetConstant(Val: PPC::sub_wacc_hi, DL: dl, VT: MVT::i32);
12097 SDValue RC = DAG.getTargetConstant(Val: PPC::DMRRCRegClassID, DL: dl, VT: MVT::i32);
12098
12099 return SDValue(DAG.getMachineNode(Opcode: PPC::REG_SEQUENCE, dl, VT: MVT::v1024i1,
12100 Ops: {RC, Lo, LoSub, Hi, HiSub}),
12101 0);
12102}
12103
12104SDValue PPCTargetLowering::LowerVectorLoad(SDValue Op,
12105 SelectionDAG &DAG) const {
12106 SDLoc dl(Op);
12107 LoadSDNode *LN = cast<LoadSDNode>(Val: Op.getNode());
12108 SDValue LoadChain = LN->getChain();
12109 SDValue BasePtr = LN->getBasePtr();
12110 EVT VT = Op.getValueType();
12111
12112 if (VT == MVT::v1024i1 || VT == MVT::v2048i1)
12113 return LowerDMFVectorLoad(Op, DAG);
12114
12115 if (VT != MVT::v256i1 && VT != MVT::v512i1)
12116 return Op;
12117
12118 // Type v256i1 is used for pairs and v512i1 is used for accumulators.
12119 // Here we create 2 or 4 v16i8 loads to load the pair or accumulator value in
12120 // 2 or 4 vsx registers.
12121 assert((VT != MVT::v512i1 || Subtarget.hasMMA()) &&
12122 "Type unsupported without MMA");
12123 assert((VT != MVT::v256i1 || Subtarget.pairedVectorMemops()) &&
12124 "Type unsupported without paired vector support");
12125 Align Alignment = LN->getAlign();
12126 SmallVector<SDValue, 4> Loads;
12127 SmallVector<SDValue, 4> LoadChains;
12128 unsigned NumVecs = VT.getSizeInBits() / 128;
12129 for (unsigned Idx = 0; Idx < NumVecs; ++Idx) {
12130 SDValue Load =
12131 DAG.getLoad(VT: MVT::v16i8, dl, Chain: LoadChain, Ptr: BasePtr,
12132 PtrInfo: LN->getPointerInfo().getWithOffset(O: Idx * 16),
12133 Alignment: commonAlignment(A: Alignment, Offset: Idx * 16),
12134 MMOFlags: LN->getMemOperand()->getFlags(), AAInfo: LN->getAAInfo());
12135 BasePtr = DAG.getNode(Opcode: ISD::ADD, DL: dl, VT: BasePtr.getValueType(), N1: BasePtr,
12136 N2: DAG.getConstant(Val: 16, DL: dl, VT: BasePtr.getValueType()));
12137 Loads.push_back(Elt: Load);
12138 LoadChains.push_back(Elt: Load.getValue(R: 1));
12139 }
12140 if (Subtarget.isLittleEndian()) {
12141 std::reverse(first: Loads.begin(), last: Loads.end());
12142 std::reverse(first: LoadChains.begin(), last: LoadChains.end());
12143 }
12144 SDValue TF = DAG.getNode(Opcode: ISD::TokenFactor, DL: dl, VT: MVT::Other, Ops: LoadChains);
12145 SDValue Value =
12146 DAG.getNode(Opcode: VT == MVT::v512i1 ? PPCISD::ACC_BUILD : PPCISD::PAIR_BUILD,
12147 DL: dl, VT, Ops: Loads);
12148 SDValue RetOps[] = {Value, TF};
12149 return DAG.getMergeValues(Ops: RetOps, dl);
12150}
12151
12152SDValue PPCTargetLowering::LowerDMFVectorStore(SDValue Op,
12153 SelectionDAG &DAG) const {
12154
12155 SDLoc dl(Op);
12156 StoreSDNode *SN = cast<StoreSDNode>(Val: Op.getNode());
12157 SDValue StoreChain = SN->getChain();
12158 SDValue BasePtr = SN->getBasePtr();
12159 SmallVector<SDValue, 8> Values;
12160 SmallVector<SDValue, 8> Stores;
12161 EVT VT = SN->getValue().getValueType();
12162 bool IsV1024i1 = VT == MVT::v1024i1;
12163 bool IsV2048i1 = VT == MVT::v2048i1;
12164
12165 // The types v1024i1 and v2048i1 are used for Dense Math dmr registers and
12166 // Dense Math dmr pair registers, respectively.
12167 assert((IsV1024i1 || IsV2048i1) && "Unsupported type.");
12168 (void)IsV2048i1;
12169 assert((Subtarget.hasMMA() && Subtarget.isISAFuture()) &&
12170 "Dense Math support required.");
12171 assert(Subtarget.pairedVectorMemops() && "Vector pair support required.");
12172
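  // Split the dmr (or each dmr of a pair) into its wacc_lo/wacc_hi halves and
  // extract each 512-bit half into two v256i1 values that can be stored below.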
12173 EVT ReturnTypes[] = {MVT::v256i1, MVT::v256i1};
12174 if (IsV1024i1) {
12175 SDValue Lo(DAG.getMachineNode(
12176 Opcode: TargetOpcode::EXTRACT_SUBREG, dl, VT: MVT::v512i1,
12177 Op1: Op.getOperand(i: 1),
12178 Op2: DAG.getTargetConstant(Val: PPC::sub_wacc_lo, DL: dl, VT: MVT::i32)),
12179 0);
12180 SDValue Hi(DAG.getMachineNode(
12181 Opcode: TargetOpcode::EXTRACT_SUBREG, dl, VT: MVT::v512i1,
12182 Op1: Op.getOperand(i: 1),
12183 Op2: DAG.getTargetConstant(Val: PPC::sub_wacc_hi, DL: dl, VT: MVT::i32)),
12184 0);
12185 MachineSDNode *ExtNode =
12186 DAG.getMachineNode(Opcode: PPC::DMXXEXTFDMR512, dl, ResultTys: ReturnTypes, Ops: Lo);
12187 Values.push_back(Elt: SDValue(ExtNode, 0));
12188 Values.push_back(Elt: SDValue(ExtNode, 1));
12189 ExtNode = DAG.getMachineNode(Opcode: PPC::DMXXEXTFDMR512_HI, dl, ResultTys: ReturnTypes, Ops: Hi);
12190 Values.push_back(Elt: SDValue(ExtNode, 0));
12191 Values.push_back(Elt: SDValue(ExtNode, 1));
12192 } else {
12193 // This corresponds to v2048i1 which represents a dmr pair.
12194 SDValue Dmr0(
12195 DAG.getMachineNode(Opcode: TargetOpcode::EXTRACT_SUBREG, dl, VT: MVT::v1024i1,
12196 Op1: Op.getOperand(i: 1),
12197 Op2: DAG.getTargetConstant(Val: PPC::sub_dmr0, DL: dl, VT: MVT::i32)),
12198 0);
12199
12200 SDValue Dmr1(
12201 DAG.getMachineNode(Opcode: TargetOpcode::EXTRACT_SUBREG, dl, VT: MVT::v1024i1,
12202 Op1: Op.getOperand(i: 1),
12203 Op2: DAG.getTargetConstant(Val: PPC::sub_dmr1, DL: dl, VT: MVT::i32)),
12204 0);
12205
12206 SDValue Dmr0Lo(DAG.getMachineNode(
12207 Opcode: TargetOpcode::EXTRACT_SUBREG, dl, VT: MVT::v512i1, Op1: Dmr0,
12208 Op2: DAG.getTargetConstant(Val: PPC::sub_wacc_lo, DL: dl, VT: MVT::i32)),
12209 0);
12210
12211 SDValue Dmr0Hi(DAG.getMachineNode(
12212 Opcode: TargetOpcode::EXTRACT_SUBREG, dl, VT: MVT::v512i1, Op1: Dmr0,
12213 Op2: DAG.getTargetConstant(Val: PPC::sub_wacc_hi, DL: dl, VT: MVT::i32)),
12214 0);
12215
12216 SDValue Dmr1Lo(DAG.getMachineNode(
12217 Opcode: TargetOpcode::EXTRACT_SUBREG, dl, VT: MVT::v512i1, Op1: Dmr1,
12218 Op2: DAG.getTargetConstant(Val: PPC::sub_wacc_lo, DL: dl, VT: MVT::i32)),
12219 0);
12220
12221 SDValue Dmr1Hi(DAG.getMachineNode(
12222 Opcode: TargetOpcode::EXTRACT_SUBREG, dl, VT: MVT::v512i1, Op1: Dmr1,
12223 Op2: DAG.getTargetConstant(Val: PPC::sub_wacc_hi, DL: dl, VT: MVT::i32)),
12224 0);
12225
12226 MachineSDNode *ExtNode =
12227 DAG.getMachineNode(Opcode: PPC::DMXXEXTFDMR512, dl, ResultTys: ReturnTypes, Ops: Dmr0Lo);
12228 Values.push_back(Elt: SDValue(ExtNode, 0));
12229 Values.push_back(Elt: SDValue(ExtNode, 1));
12230 ExtNode =
12231 DAG.getMachineNode(Opcode: PPC::DMXXEXTFDMR512_HI, dl, ResultTys: ReturnTypes, Ops: Dmr0Hi);
12232 Values.push_back(Elt: SDValue(ExtNode, 0));
12233 Values.push_back(Elt: SDValue(ExtNode, 1));
12234 ExtNode = DAG.getMachineNode(Opcode: PPC::DMXXEXTFDMR512, dl, ResultTys: ReturnTypes, Ops: Dmr1Lo);
12235 Values.push_back(Elt: SDValue(ExtNode, 0));
12236 Values.push_back(Elt: SDValue(ExtNode, 1));
12237 ExtNode =
12238 DAG.getMachineNode(Opcode: PPC::DMXXEXTFDMR512_HI, dl, ResultTys: ReturnTypes, Ops: Dmr1Hi);
12239 Values.push_back(Elt: SDValue(ExtNode, 0));
12240 Values.push_back(Elt: SDValue(ExtNode, 1));
12241 }
12242
12243 if (Subtarget.isLittleEndian())
12244 std::reverse(first: Values.begin(), last: Values.end());
12245
12246 SDVTList Tys = DAG.getVTList(VT: MVT::Other);
12247 SmallVector<SDValue, 4> Ops{
12248 StoreChain, DAG.getConstant(Val: Intrinsic::ppc_vsx_stxvp, DL: dl, VT: MVT::i32),
12249 Values[0], BasePtr};
12250 MachineMemOperand *MMO = SN->getMemOperand();
12251 unsigned NumVecs = VT.getSizeInBits() / 256;
12252 for (unsigned Idx = 0; Idx < NumVecs; ++Idx) {
12253 MachineMemOperand *NewMMO =
12254 DAG.getMachineFunction().getMachineMemOperand(MMO, Offset: Idx * 32, Size: 32);
12255 if (Idx > 0) {
12256 BasePtr = DAG.getNode(Opcode: ISD::ADD, DL: dl, VT: BasePtr.getValueType(), N1: BasePtr,
12257 N2: DAG.getConstant(Val: 32, DL: dl, VT: BasePtr.getValueType()));
12258 Ops[3] = BasePtr;
12259 }
12260 Ops[2] = Values[Idx];
12261 SDValue St = DAG.getMemIntrinsicNode(Opcode: ISD::INTRINSIC_VOID, dl, VTList: Tys, Ops,
12262 MemVT: MVT::v256i1, MMO: NewMMO);
12263 Stores.push_back(Elt: St);
12264 }
12265
12266 SDValue TF = DAG.getTokenFactor(DL: dl, Vals&: Stores);
12267 return TF;
12268}
12269
12270SDValue PPCTargetLowering::LowerVectorStore(SDValue Op,
12271 SelectionDAG &DAG) const {
12272 SDLoc dl(Op);
12273 StoreSDNode *SN = cast<StoreSDNode>(Val: Op.getNode());
12274 SDValue StoreChain = SN->getChain();
12275 SDValue BasePtr = SN->getBasePtr();
12276 SDValue Value = SN->getValue();
12277 SDValue Value2 = SN->getValue();
12278 EVT StoreVT = Value.getValueType();
12279
12280 if (StoreVT == MVT::v1024i1 || StoreVT == MVT::v2048i1)
12281 return LowerDMFVectorStore(Op, DAG);
12282
12283 if (StoreVT != MVT::v256i1 && StoreVT != MVT::v512i1)
12284 return Op;
12285
12286 // Type v256i1 is used for pairs and v512i1 is used for accumulators.
12287  // Here we create 2 or 4 v16i8 stores to store the pair's or accumulator's
12288  // underlying registers individually.
12289 assert((StoreVT != MVT::v512i1 || Subtarget.hasMMA()) &&
12290 "Type unsupported without MMA");
12291 assert((StoreVT != MVT::v256i1 || Subtarget.pairedVectorMemops()) &&
12292 "Type unsupported without paired vector support");
12293 Align Alignment = SN->getAlign();
12294 SmallVector<SDValue, 4> Stores;
12295 unsigned NumVecs = 2;
12296 if (StoreVT == MVT::v512i1) {
12297 if (Subtarget.isISAFuture()) {
12298 EVT ReturnTypes[] = {MVT::v256i1, MVT::v256i1};
12299 MachineSDNode *ExtNode = DAG.getMachineNode(
12300 Opcode: PPC::DMXXEXTFDMR512, dl, ResultTys: ReturnTypes, Ops: Op.getOperand(i: 1));
12301
12302 Value = SDValue(ExtNode, 0);
12303 Value2 = SDValue(ExtNode, 1);
12304 } else
12305 Value = DAG.getNode(Opcode: PPCISD::XXMFACC, DL: dl, VT: MVT::v512i1, Operand: Value);
12306 NumVecs = 4;
12307 }
12308 for (unsigned Idx = 0; Idx < NumVecs; ++Idx) {
12309 unsigned VecNum = Subtarget.isLittleEndian() ? NumVecs - 1 - Idx : Idx;
12310 SDValue Elt;
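    // On ISA Future the accumulator was already split into two v256i1 halves
    // (Value/Value2): Idx > 1 selects the half, Idx % 2 the register within it,
    // mirrored for little-endian.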
12311 if (Subtarget.isISAFuture()) {
12312 VecNum = Subtarget.isLittleEndian() ? 1 - (Idx % 2) : (Idx % 2);
12313 Elt = DAG.getNode(Opcode: PPCISD::EXTRACT_VSX_REG, DL: dl, VT: MVT::v16i8,
12314 N1: Idx > 1 ? Value2 : Value,
12315 N2: DAG.getConstant(Val: VecNum, DL: dl, VT: getPointerTy(DL: DAG.getDataLayout())));
12316 } else
12317 Elt = DAG.getNode(Opcode: PPCISD::EXTRACT_VSX_REG, DL: dl, VT: MVT::v16i8, N1: Value,
12318 N2: DAG.getConstant(Val: VecNum, DL: dl, VT: getPointerTy(DL: DAG.getDataLayout())));
12319
12320 SDValue Store =
12321 DAG.getStore(Chain: StoreChain, dl, Val: Elt, Ptr: BasePtr,
12322 PtrInfo: SN->getPointerInfo().getWithOffset(O: Idx * 16),
12323 Alignment: commonAlignment(A: Alignment, Offset: Idx * 16),
12324 MMOFlags: SN->getMemOperand()->getFlags(), AAInfo: SN->getAAInfo());
12325 BasePtr = DAG.getNode(Opcode: ISD::ADD, DL: dl, VT: BasePtr.getValueType(), N1: BasePtr,
12326 N2: DAG.getConstant(Val: 16, DL: dl, VT: BasePtr.getValueType()));
12327 Stores.push_back(Elt: Store);
12328 }
12329 SDValue TF = DAG.getTokenFactor(DL: dl, Vals&: Stores);
12330 return TF;
12331}
12332
12333SDValue PPCTargetLowering::LowerMUL(SDValue Op, SelectionDAG &DAG) const {
12334 SDLoc dl(Op);
12335 if (Op.getValueType() == MVT::v4i32) {
12336 SDValue LHS = Op.getOperand(i: 0), RHS = Op.getOperand(i: 1);
12337
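    // Decompose the 32-bit multiply into 16-bit pieces:
    //   LoProd = lo(LHS) * lo(RHS)                        (vmulouh)
    //   HiProd = lo(LHS)*hi(RHS) + hi(LHS)*lo(RHS)        (vmsumuhm, rotated RHS)
    //   Result = LoProd + (HiProd << 16)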
12338 SDValue Zero = getCanonicalConstSplat(Val: 0, SplatSize: 1, VT: MVT::v4i32, DAG, dl);
12339    // Splat of -16; only the low 5 bits are used, so it acts as a +16 amount.
12340 SDValue Neg16 = getCanonicalConstSplat(Val: -16, SplatSize: 4, VT: MVT::v4i32, DAG, dl);
12341 SDValue RHSSwap = // = vrlw RHS, 16
12342 BuildIntrinsicOp(IID: Intrinsic::ppc_altivec_vrlw, LHS: RHS, RHS: Neg16, DAG, dl);
12343
12344 // Shrinkify inputs to v8i16.
12345 LHS = DAG.getNode(Opcode: ISD::BITCAST, DL: dl, VT: MVT::v8i16, Operand: LHS);
12346 RHS = DAG.getNode(Opcode: ISD::BITCAST, DL: dl, VT: MVT::v8i16, Operand: RHS);
12347 RHSSwap = DAG.getNode(Opcode: ISD::BITCAST, DL: dl, VT: MVT::v8i16, Operand: RHSSwap);
12348
12349 // Low parts multiplied together, generating 32-bit results (we ignore the
12350 // top parts).
12351 SDValue LoProd = BuildIntrinsicOp(IID: Intrinsic::ppc_altivec_vmulouh,
12352 LHS, RHS, DAG, dl, DestVT: MVT::v4i32);
12353
12354 SDValue HiProd = BuildIntrinsicOp(IID: Intrinsic::ppc_altivec_vmsumuhm,
12355 Op0: LHS, Op1: RHSSwap, Op2: Zero, DAG, dl, DestVT: MVT::v4i32);
12356 // Shift the high parts up 16 bits.
12357 HiProd = BuildIntrinsicOp(IID: Intrinsic::ppc_altivec_vslw, LHS: HiProd,
12358 RHS: Neg16, DAG, dl);
12359 return DAG.getNode(Opcode: ISD::ADD, DL: dl, VT: MVT::v4i32, N1: LoProd, N2: HiProd);
12360 } else if (Op.getValueType() == MVT::v16i8) {
12361 SDValue LHS = Op.getOperand(i: 0), RHS = Op.getOperand(i: 1);
12362 bool isLittleEndian = Subtarget.isLittleEndian();
12363
12364 // Multiply the even 8-bit parts, producing 16-bit sums.
12365 SDValue EvenParts = BuildIntrinsicOp(IID: Intrinsic::ppc_altivec_vmuleub,
12366 LHS, RHS, DAG, dl, DestVT: MVT::v8i16);
12367 EvenParts = DAG.getNode(Opcode: ISD::BITCAST, DL: dl, VT: MVT::v16i8, Operand: EvenParts);
12368
12369 // Multiply the odd 8-bit parts, producing 16-bit sums.
12370 SDValue OddParts = BuildIntrinsicOp(IID: Intrinsic::ppc_altivec_vmuloub,
12371 LHS, RHS, DAG, dl, DestVT: MVT::v8i16);
12372 OddParts = DAG.getNode(Opcode: ISD::BITCAST, DL: dl, VT: MVT::v16i8, Operand: OddParts);
12373
12374 // Merge the results together. Because vmuleub and vmuloub are
12375 // instructions with a big-endian bias, we must reverse the
12376 // element numbering and reverse the meaning of "odd" and "even"
12377 // when generating little endian code.
12378 int Ops[16];
12379 for (unsigned i = 0; i != 8; ++i) {
12380 if (isLittleEndian) {
12381 Ops[i*2 ] = 2*i;
12382 Ops[i*2+1] = 2*i+16;
12383 } else {
12384 Ops[i*2 ] = 2*i+1;
12385 Ops[i*2+1] = 2*i+1+16;
12386 }
12387 }
12388 if (isLittleEndian)
12389 return DAG.getVectorShuffle(VT: MVT::v16i8, dl, N1: OddParts, N2: EvenParts, Mask: Ops);
12390 else
12391 return DAG.getVectorShuffle(VT: MVT::v16i8, dl, N1: EvenParts, N2: OddParts, Mask: Ops);
12392 } else {
12393 llvm_unreachable("Unknown mul to lower!");
12394 }
12395}
12396
12397SDValue PPCTargetLowering::LowerFP_ROUND(SDValue Op, SelectionDAG &DAG) const {
12398 bool IsStrict = Op->isStrictFPOpcode();
12399 if (Op.getOperand(i: IsStrict ? 1 : 0).getValueType() == MVT::f128 &&
12400 !Subtarget.hasP9Vector())
12401 return SDValue();
12402
12403 return Op;
12404}
12405
12406// Custom lowering for fpext v2f32 to v2f64
12407SDValue PPCTargetLowering::LowerFP_EXTEND(SDValue Op, SelectionDAG &DAG) const {
12408
12409 assert(Op.getOpcode() == ISD::FP_EXTEND &&
12410 "Should only be called for ISD::FP_EXTEND");
12411
12412 // FIXME: handle extends from half precision float vectors on P9.
12413 // We only want to custom lower an extend from v2f32 to v2f64.
12414 if (Op.getValueType() != MVT::v2f64 ||
12415 Op.getOperand(i: 0).getValueType() != MVT::v2f32)
12416 return SDValue();
12417
12418 SDLoc dl(Op);
12419 SDValue Op0 = Op.getOperand(i: 0);
12420
12421 switch (Op0.getOpcode()) {
12422 default:
12423 return SDValue();
12424 case ISD::EXTRACT_SUBVECTOR: {
12425 assert(Op0.getNumOperands() == 2 &&
12426 isa<ConstantSDNode>(Op0->getOperand(1)) &&
12427 "Node should have 2 operands with second one being a constant!");
12428
12429 if (Op0.getOperand(i: 0).getValueType() != MVT::v4f32)
12430 return SDValue();
12431
12432    // Custom lowering is only done for the high or low doubleword.
12433 int Idx = Op0.getConstantOperandVal(i: 1);
12434 if (Idx % 2 != 0)
12435 return SDValue();
12436
12437 // Since input is v4f32, at this point Idx is either 0 or 2.
12438 // Shift to get the doubleword position we want.
12439 int DWord = Idx >> 1;
12440
12441 // High and low word positions are different on little endian.
12442 if (Subtarget.isLittleEndian())
12443 DWord ^= 0x1;
12444
12445 return DAG.getNode(Opcode: PPCISD::FP_EXTEND_HALF, DL: dl, VT: MVT::v2f64,
12446 N1: Op0.getOperand(i: 0), N2: DAG.getConstant(Val: DWord, DL: dl, VT: MVT::i32));
12447 }
12448 case ISD::FADD:
12449 case ISD::FMUL:
12450 case ISD::FSUB: {
12451 SDValue NewLoad[2];
12452 for (unsigned i = 0, ie = Op0.getNumOperands(); i != ie; ++i) {
12453      // Ensure both inputs are loads.
12454 SDValue LdOp = Op0.getOperand(i);
12455 if (LdOp.getOpcode() != ISD::LOAD)
12456 return SDValue();
12457 // Generate new load node.
12458 LoadSDNode *LD = cast<LoadSDNode>(Val&: LdOp);
12459 SDValue LoadOps[] = {LD->getChain(), LD->getBasePtr()};
12460 NewLoad[i] = DAG.getMemIntrinsicNode(
12461 Opcode: PPCISD::LD_VSX_LH, dl, VTList: DAG.getVTList(VT1: MVT::v4f32, VT2: MVT::Other), Ops: LoadOps,
12462 MemVT: LD->getMemoryVT(), MMO: LD->getMemOperand());
12463 }
12464 SDValue NewOp =
12465 DAG.getNode(Opcode: Op0.getOpcode(), DL: SDLoc(Op0), VT: MVT::v4f32, N1: NewLoad[0],
12466 N2: NewLoad[1], Flags: Op0.getNode()->getFlags());
12467 return DAG.getNode(Opcode: PPCISD::FP_EXTEND_HALF, DL: dl, VT: MVT::v2f64, N1: NewOp,
12468 N2: DAG.getConstant(Val: 0, DL: dl, VT: MVT::i32));
12469 }
12470 case ISD::LOAD: {
12471 LoadSDNode *LD = cast<LoadSDNode>(Val&: Op0);
12472 SDValue LoadOps[] = {LD->getChain(), LD->getBasePtr()};
12473 SDValue NewLd = DAG.getMemIntrinsicNode(
12474 Opcode: PPCISD::LD_VSX_LH, dl, VTList: DAG.getVTList(VT1: MVT::v4f32, VT2: MVT::Other), Ops: LoadOps,
12475 MemVT: LD->getMemoryVT(), MMO: LD->getMemOperand());
12476 return DAG.getNode(Opcode: PPCISD::FP_EXTEND_HALF, DL: dl, VT: MVT::v2f64, N1: NewLd,
12477 N2: DAG.getConstant(Val: 0, DL: dl, VT: MVT::i32));
12478 }
12479 }
12480  llvm_unreachable("ERROR: Should return for all cases within switch.");
12481}
12482
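// Materialize a boolean carry value into the CA flag by adding it to -1: the
// addition produces a carry-out exactly when the value is non-zero.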
12483static SDValue ConvertCarryValueToCarryFlag(EVT SumType, SDValue Value,
12484 SelectionDAG &DAG,
12485 const PPCSubtarget &STI) {
12486 SDLoc DL(Value);
12487 if (STI.useCRBits())
12488 Value = DAG.getNode(Opcode: ISD::SELECT, DL, VT: SumType, N1: Value,
12489 N2: DAG.getConstant(Val: 1, DL, VT: SumType),
12490 N3: DAG.getConstant(Val: 0, DL, VT: SumType));
12491 else
12492 Value = DAG.getZExtOrTrunc(Op: Value, DL, VT: SumType);
12493 SDValue Sum = DAG.getNode(Opcode: PPCISD::ADDC, DL, VTList: DAG.getVTList(VT1: SumType, VT2: MVT::i32),
12494 N1: Value, N2: DAG.getAllOnesConstant(DL, VT: SumType));
12495 return Sum.getValue(R: 1);
12496}
12497
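// Recover a boolean carry value from the CA flag by computing 0 + 0 + CA with
// ADDE; the sum is 1 exactly when CA was set.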
12498static SDValue ConvertCarryFlagToCarryValue(EVT SumType, SDValue Flag,
12499 EVT CarryType, SelectionDAG &DAG,
12500 const PPCSubtarget &STI) {
12501 SDLoc DL(Flag);
12502 SDValue Zero = DAG.getConstant(Val: 0, DL, VT: SumType);
12503 SDValue Carry = DAG.getNode(
12504 Opcode: PPCISD::ADDE, DL, VTList: DAG.getVTList(VT1: SumType, VT2: MVT::i32), N1: Zero, N2: Zero, N3: Flag);
12505 if (STI.useCRBits())
12506 return DAG.getSetCC(DL, VT: CarryType, LHS: Carry, RHS: Zero, Cond: ISD::SETNE);
12507 return DAG.getZExtOrTrunc(Op: Carry, DL, VT: CarryType);
12508}
12509
12510SDValue PPCTargetLowering::LowerADDSUBO(SDValue Op, SelectionDAG &DAG) const {
12511
12512 SDLoc DL(Op);
12513 SDNode *N = Op.getNode();
12514 EVT VT = N->getValueType(ResNo: 0);
12515 EVT CarryType = N->getValueType(ResNo: 1);
12516 unsigned Opc = N->getOpcode();
12517 bool IsAdd = Opc == ISD::UADDO;
12518 Opc = IsAdd ? PPCISD::ADDC : PPCISD::SUBC;
12519 SDValue Sum = DAG.getNode(Opcode: Opc, DL, VTList: DAG.getVTList(VT1: VT, VT2: MVT::i32),
12520 N1: N->getOperand(Num: 0), N2: N->getOperand(Num: 1));
12521 SDValue Carry = ConvertCarryFlagToCarryValue(SumType: VT, Flag: Sum.getValue(R: 1), CarryType,
12522 DAG, STI: Subtarget);
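  // For subtraction, PPC's CA flag means 'no borrow', so invert it to get the
  // borrow-out value that USUBO expects.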
12523 if (!IsAdd)
12524 Carry = DAG.getNode(Opcode: ISD::XOR, DL, VT: CarryType, N1: Carry,
12525 N2: DAG.getConstant(Val: 1UL, DL, VT: CarryType));
12526 return DAG.getNode(Opcode: ISD::MERGE_VALUES, DL, VTList: N->getVTList(), N1: Sum, N2: Carry);
12527}
12528
12529SDValue PPCTargetLowering::LowerADDSUBO_CARRY(SDValue Op,
12530 SelectionDAG &DAG) const {
12531 SDLoc DL(Op);
12532 SDNode *N = Op.getNode();
12533 unsigned Opc = N->getOpcode();
12534 EVT VT = N->getValueType(ResNo: 0);
12535 EVT CarryType = N->getValueType(ResNo: 1);
12536 SDValue CarryOp = N->getOperand(Num: 2);
12537 bool IsAdd = Opc == ISD::UADDO_CARRY;
12538 Opc = IsAdd ? PPCISD::ADDE : PPCISD::SUBE;
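  // For subtraction, invert the incoming and outgoing borrow values to match
  // PPC's CA 'no borrow' convention.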
12539 if (!IsAdd)
12540 CarryOp = DAG.getNode(Opcode: ISD::XOR, DL, VT: CarryOp.getValueType(), N1: CarryOp,
12541 N2: DAG.getConstant(Val: 1UL, DL, VT: CarryOp.getValueType()));
12542 CarryOp = ConvertCarryValueToCarryFlag(SumType: VT, Value: CarryOp, DAG, STI: Subtarget);
12543 SDValue Sum = DAG.getNode(Opcode: Opc, DL, VTList: DAG.getVTList(VT1: VT, VT2: MVT::i32),
12544 N1: Op.getOperand(i: 0), N2: Op.getOperand(i: 1), N3: CarryOp);
12545 CarryOp = ConvertCarryFlagToCarryValue(SumType: VT, Flag: Sum.getValue(R: 1), CarryType, DAG,
12546 STI: Subtarget);
12547 if (!IsAdd)
12548 CarryOp = DAG.getNode(Opcode: ISD::XOR, DL, VT: CarryOp.getValueType(), N1: CarryOp,
12549 N2: DAG.getConstant(Val: 1UL, DL, VT: CarryOp.getValueType()));
12550 return DAG.getNode(Opcode: ISD::MERGE_VALUES, DL, VTList: N->getVTList(), N1: Sum, N2: CarryOp);
12551}
12552
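/// Implements signed subtract with overflow detection using the rule:
/// (x xor y) & ((x - y) xor x); the overflow bit is taken from the sign bit.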
12553SDValue PPCTargetLowering::LowerSSUBO(SDValue Op, SelectionDAG &DAG) const {
12554
12555 SDLoc dl(Op);
12556 SDValue LHS = Op.getOperand(i: 0);
12557 SDValue RHS = Op.getOperand(i: 1);
12558 EVT VT = Op.getNode()->getValueType(ResNo: 0);
12559
12560 SDValue Sub = DAG.getNode(Opcode: ISD::SUB, DL: dl, VT, N1: LHS, N2: RHS);
12561
12562 SDValue Xor1 = DAG.getNode(Opcode: ISD::XOR, DL: dl, VT, N1: RHS, N2: LHS);
12563 SDValue Xor2 = DAG.getNode(Opcode: ISD::XOR, DL: dl, VT, N1: Sub, N2: LHS);
12564
12565 SDValue And = DAG.getNode(Opcode: ISD::AND, DL: dl, VT, N1: Xor1, N2: Xor2);
12566
12567 SDValue Overflow =
12568 DAG.getNode(Opcode: ISD::SRL, DL: dl, VT, N1: And,
12569 N2: DAG.getConstant(Val: VT.getSizeInBits() - 1, DL: dl, VT: MVT::i32));
12570
12571 SDValue OverflowTrunc =
12572 DAG.getNode(Opcode: ISD::TRUNCATE, DL: dl, VT: Op.getNode()->getValueType(ResNo: 1), Operand: Overflow);
12573
12574 return DAG.getMergeValues(Ops: {Sub, OverflowTrunc}, dl);
12575}
12576
12577/// Implements signed add with overflow detection using the rule:
12578/// (x eqv y) & (sum xor x); the overflow bit is taken from the sign bit.
12579SDValue PPCTargetLowering::LowerSADDO(SDValue Op, SelectionDAG &DAG) const {
12580
12581 SDLoc dl(Op);
12582 SDValue LHS = Op.getOperand(i: 0);
12583 SDValue RHS = Op.getOperand(i: 1);
12584 EVT VT = Op.getNode()->getValueType(ResNo: 0);
12585
12586 SDValue Sum = DAG.getNode(Opcode: ISD::ADD, DL: dl, VT, N1: LHS, N2: RHS);
12587
12588 // Compute ~(x xor y)
12589 SDValue XorXY = DAG.getNode(Opcode: ISD::XOR, DL: dl, VT, N1: LHS, N2: RHS);
12590 SDValue EqvXY = DAG.getNOT(DL: dl, Val: XorXY, VT);
12591 // Compute (s xor x)
12592 SDValue SumXorX = DAG.getNode(Opcode: ISD::XOR, DL: dl, VT, N1: Sum, N2: LHS);
12593
12594 // overflow = (x eqv y) & (s xor x)
12595 SDValue OverflowInSign = DAG.getNode(Opcode: ISD::AND, DL: dl, VT, N1: EqvXY, N2: SumXorX);
12596
12597 // Shift sign bit down to LSB
12598 SDValue Overflow =
12599 DAG.getNode(Opcode: ISD::SRL, DL: dl, VT, N1: OverflowInSign,
12600 N2: DAG.getConstant(Val: VT.getSizeInBits() - 1, DL: dl, VT: MVT::i32));
12601 // Truncate to the overflow type (i1)
12602 SDValue OverflowTrunc =
12603 DAG.getNode(Opcode: ISD::TRUNCATE, DL: dl, VT: Op.getNode()->getValueType(ResNo: 1), Operand: Overflow);
12604
12605 return DAG.getMergeValues(Ops: {Sum, OverflowTrunc}, dl);
12606}
12607
12608// Lower unsigned 3-way compare producing -1/0/1.
12609SDValue PPCTargetLowering::LowerUCMP(SDValue Op, SelectionDAG &DAG) const {
12610 SDLoc DL(Op);
12611 SDValue A = DAG.getFreeze(V: Op.getOperand(i: 0));
12612 SDValue B = DAG.getFreeze(V: Op.getOperand(i: 1));
12613 EVT OpVT = A.getValueType();
12614 EVT ResVT = Op.getValueType();
12615
12616 // On PPC64, i32 carries are affected by the upper 32 bits of the registers.
12617 // We must zero-extend to i64 to ensure the carry reflects the 32-bit unsigned
12618 // comparison.
12619 if (Subtarget.isPPC64() && OpVT == MVT::i32) {
12620 A = DAG.getNode(Opcode: ISD::ZERO_EXTEND, DL, VT: MVT::i64, Operand: A);
12621 B = DAG.getNode(Opcode: ISD::ZERO_EXTEND, DL, VT: MVT::i64, Operand: B);
12622 OpVT = MVT::i64;
12623 }
12624
12625 // First compute diff = A - B.
12626 SDValue Diff = DAG.getNode(Opcode: ISD::SUB, DL, VT: OpVT, N1: A, N2: B);
12627
12628 // Generate B - A using SUBC to capture carry.
12629 SDVTList VTs = DAG.getVTList(VT1: OpVT, VT2: MVT::i32);
12630 SDValue SubC = DAG.getNode(Opcode: PPCISD::SUBC, DL, VTList: VTs, N1: B, N2: A);
12631 SDValue CA0 = SubC.getValue(R: 1);
12632
12633 // t2 = A - B + CA0 using SUBE.
12634 SDValue SubE1 = DAG.getNode(Opcode: PPCISD::SUBE, DL, VTList: VTs, N1: A, N2: B, N3: CA0);
12635 SDValue CA1 = SubE1.getValue(R: 1);
12636
12637 // res = diff - t2 + CA1 using SUBE (produces desired -1/0/1).
12638 SDValue ResPair = DAG.getNode(Opcode: PPCISD::SUBE, DL, VTList: VTs, N1: Diff, N2: SubE1, N3: CA1);
12639
12640 // Extract the first result and truncate to result type if needed.
12641 return DAG.getSExtOrTrunc(Op: ResPair.getValue(R: 0), DL, VT: ResVT);
12642}
12643
12644/// LowerOperation - Provide custom lowering hooks for some operations.
12645///
12646SDValue PPCTargetLowering::LowerOperation(SDValue Op, SelectionDAG &DAG) const {
12647 switch (Op.getOpcode()) {
12648 default:
12649 llvm_unreachable("Wasn't expecting to be able to lower this!");
12650 case ISD::FPOW: return lowerPow(Op, DAG);
12651 case ISD::FSIN: return lowerSin(Op, DAG);
12652 case ISD::FCOS: return lowerCos(Op, DAG);
12653 case ISD::FLOG: return lowerLog(Op, DAG);
12654 case ISD::FLOG10: return lowerLog10(Op, DAG);
12655 case ISD::FEXP: return lowerExp(Op, DAG);
12656 case ISD::ConstantPool: return LowerConstantPool(Op, DAG);
12657 case ISD::BlockAddress: return LowerBlockAddress(Op, DAG);
12658 case ISD::GlobalAddress: return LowerGlobalAddress(Op, DAG);
12659 case ISD::GlobalTLSAddress: return LowerGlobalTLSAddress(Op, DAG);
12660 case ISD::JumpTable: return LowerJumpTable(Op, DAG);
12661 case ISD::STRICT_FSETCC:
12662 case ISD::STRICT_FSETCCS:
12663 case ISD::SETCC: return LowerSETCC(Op, DAG);
12664 case ISD::INIT_TRAMPOLINE: return LowerINIT_TRAMPOLINE(Op, DAG);
12665 case ISD::ADJUST_TRAMPOLINE: return LowerADJUST_TRAMPOLINE(Op, DAG);
12666 case ISD::SSUBO:
12667 return LowerSSUBO(Op, DAG);
12668 case ISD::SADDO:
12669 return LowerSADDO(Op, DAG);
12670
12671 case ISD::INLINEASM:
12672 case ISD::INLINEASM_BR: return LowerINLINEASM(Op, DAG);
12673 // Variable argument lowering.
12674 case ISD::VASTART: return LowerVASTART(Op, DAG);
12675 case ISD::VAARG: return LowerVAARG(Op, DAG);
12676 case ISD::VACOPY: return LowerVACOPY(Op, DAG);
12677
12678 case ISD::STACKRESTORE: return LowerSTACKRESTORE(Op, DAG);
12679 case ISD::DYNAMIC_STACKALLOC: return LowerDYNAMIC_STACKALLOC(Op, DAG);
12680 case ISD::GET_DYNAMIC_AREA_OFFSET:
12681 return LowerGET_DYNAMIC_AREA_OFFSET(Op, DAG);
12682
12683 // Exception handling lowering.
12684 case ISD::EH_DWARF_CFA: return LowerEH_DWARF_CFA(Op, DAG);
12685 case ISD::EH_SJLJ_SETJMP: return lowerEH_SJLJ_SETJMP(Op, DAG);
12686 case ISD::EH_SJLJ_LONGJMP: return lowerEH_SJLJ_LONGJMP(Op, DAG);
12687
12688 case ISD::LOAD: return LowerLOAD(Op, DAG);
12689 case ISD::STORE: return LowerSTORE(Op, DAG);
12690 case ISD::TRUNCATE: return LowerTRUNCATE(Op, DAG);
12691 case ISD::SELECT_CC: return LowerSELECT_CC(Op, DAG);
12692 case ISD::STRICT_FP_TO_UINT:
12693 case ISD::STRICT_FP_TO_SINT:
12694 case ISD::FP_TO_UINT:
12695 case ISD::FP_TO_SINT: return LowerFP_TO_INT(Op, DAG, dl: SDLoc(Op));
12696 case ISD::STRICT_UINT_TO_FP:
12697 case ISD::STRICT_SINT_TO_FP:
12698 case ISD::UINT_TO_FP:
12699 case ISD::SINT_TO_FP: return LowerINT_TO_FP(Op, DAG);
12700 case ISD::GET_ROUNDING: return LowerGET_ROUNDING(Op, DAG);
12701 case ISD::SET_ROUNDING:
12702 return LowerSET_ROUNDING(Op, DAG);
12703
12704 // Lower 64-bit shifts.
12705 case ISD::SHL_PARTS: return LowerSHL_PARTS(Op, DAG);
12706 case ISD::SRL_PARTS: return LowerSRL_PARTS(Op, DAG);
12707 case ISD::SRA_PARTS: return LowerSRA_PARTS(Op, DAG);
12708
12709 case ISD::FSHL: return LowerFunnelShift(Op, DAG);
12710 case ISD::FSHR: return LowerFunnelShift(Op, DAG);
12711
12712 // Vector-related lowering.
12713 case ISD::BUILD_VECTOR: return LowerBUILD_VECTOR(Op, DAG);
12714 case ISD::VECTOR_SHUFFLE: return LowerVECTOR_SHUFFLE(Op, DAG);
12715 case ISD::INTRINSIC_WO_CHAIN: return LowerINTRINSIC_WO_CHAIN(Op, DAG);
12716 case ISD::SCALAR_TO_VECTOR: return LowerSCALAR_TO_VECTOR(Op, DAG);
12717 case ISD::INSERT_VECTOR_ELT: return LowerINSERT_VECTOR_ELT(Op, DAG);
12718 case ISD::MUL: return LowerMUL(Op, DAG);
12719 case ISD::FP_EXTEND: return LowerFP_EXTEND(Op, DAG);
12720 case ISD::STRICT_FP_ROUND:
12721 case ISD::FP_ROUND:
12722 return LowerFP_ROUND(Op, DAG);
12723 case ISD::ROTL: return LowerROTL(Op, DAG);
12724
12725 // For counter-based loop handling.
12726 case ISD::INTRINSIC_W_CHAIN:
12727 return SDValue();
12728
12729 case ISD::BITCAST: return LowerBITCAST(Op, DAG);
12730
12731 // Frame & Return address.
12732 case ISD::RETURNADDR: return LowerRETURNADDR(Op, DAG);
12733 case ISD::FRAMEADDR: return LowerFRAMEADDR(Op, DAG);
12734
12735 case ISD::INTRINSIC_VOID:
12736 return LowerINTRINSIC_VOID(Op, DAG);
12737 case ISD::BSWAP:
12738 return LowerBSWAP(Op, DAG);
12739 case ISD::ATOMIC_CMP_SWAP:
12740 return LowerATOMIC_CMP_SWAP(Op, DAG);
12741 case ISD::ATOMIC_STORE:
12742 return LowerATOMIC_LOAD_STORE(Op, DAG);
12743 case ISD::IS_FPCLASS:
12744 return LowerIS_FPCLASS(Op, DAG);
12745 case ISD::UADDO:
12746 case ISD::USUBO:
12747 return LowerADDSUBO(Op, DAG);
12748 case ISD::UADDO_CARRY:
12749 case ISD::USUBO_CARRY:
12750 return LowerADDSUBO_CARRY(Op, DAG);
12751 case ISD::UCMP:
12752 return LowerUCMP(Op, DAG);
12753 case ISD::STRICT_LRINT:
12754 case ISD::STRICT_LLRINT:
12755 case ISD::STRICT_LROUND:
12756 case ISD::STRICT_LLROUND:
12757 case ISD::STRICT_FNEARBYINT:
12758 if (Op->getFlags().hasNoFPExcept())
12759 return Op;
12760 return SDValue();
12761 case ISD::VP_LOAD:
12762 return LowerVP_LOAD(Op, DAG);
12763 case ISD::VP_STORE:
12764 return LowerVP_STORE(Op, DAG);
12765 }
12766}
12767
12768void PPCTargetLowering::ReplaceNodeResults(SDNode *N,
12769 SmallVectorImpl<SDValue>&Results,
12770 SelectionDAG &DAG) const {
12771 SDLoc dl(N);
12772 switch (N->getOpcode()) {
12773 default:
12774 llvm_unreachable("Do not know how to custom type legalize this operation!");
12775 case ISD::ATOMIC_LOAD: {
12776 SDValue Res = LowerATOMIC_LOAD_STORE(Op: SDValue(N, 0), DAG);
12777 Results.push_back(Elt: Res);
12778 Results.push_back(Elt: Res.getValue(R: 1));
12779 break;
12780 }
12781 case ISD::READCYCLECOUNTER: {
12782 SDVTList VTs = DAG.getVTList(VT1: MVT::i32, VT2: MVT::i32, VT3: MVT::Other);
12783 SDValue RTB = DAG.getNode(Opcode: PPCISD::READ_TIME_BASE, DL: dl, VTList: VTs, N: N->getOperand(Num: 0));
12784
12785 Results.push_back(
12786 Elt: DAG.getNode(Opcode: ISD::BUILD_PAIR, DL: dl, VT: MVT::i64, N1: RTB, N2: RTB.getValue(R: 1)));
12787 Results.push_back(Elt: RTB.getValue(R: 2));
12788 break;
12789 }
12790 case ISD::INTRINSIC_W_CHAIN: {
12791 if (N->getConstantOperandVal(Num: 1) != Intrinsic::loop_decrement)
12792 break;
12793
12794 assert(N->getValueType(0) == MVT::i1 &&
12795 "Unexpected result type for CTR decrement intrinsic");
12796 EVT SVT = getSetCCResultType(DL: DAG.getDataLayout(), C&: *DAG.getContext(),
12797 VT: N->getValueType(ResNo: 0));
12798 SDVTList VTs = DAG.getVTList(VT1: SVT, VT2: MVT::Other);
12799 SDValue NewInt = DAG.getNode(Opcode: N->getOpcode(), DL: dl, VTList: VTs, N1: N->getOperand(Num: 0),
12800 N2: N->getOperand(Num: 1));
12801
12802 Results.push_back(Elt: DAG.getNode(Opcode: ISD::TRUNCATE, DL: dl, VT: MVT::i1, Operand: NewInt));
12803 Results.push_back(Elt: NewInt.getValue(R: 1));
12804 break;
12805 }
12806 case ISD::INTRINSIC_WO_CHAIN: {
12807 switch (N->getConstantOperandVal(Num: 0)) {
12808 case Intrinsic::ppc_pack_longdouble:
12809 Results.push_back(Elt: DAG.getNode(Opcode: ISD::BUILD_PAIR, DL: dl, VT: MVT::ppcf128,
12810 N1: N->getOperand(Num: 2), N2: N->getOperand(Num: 1)));
12811 break;
12812 case Intrinsic::ppc_maxfe:
12813 case Intrinsic::ppc_minfe:
12814 case Intrinsic::ppc_fnmsub:
12815 case Intrinsic::ppc_convert_f128_to_ppcf128:
12816 Results.push_back(Elt: LowerINTRINSIC_WO_CHAIN(Op: SDValue(N, 0), DAG));
12817 break;
12818 }
12819 break;
12820 }
12821 case ISD::VAARG: {
12822 if (!Subtarget.isSVR4ABI() || Subtarget.isPPC64())
12823 return;
12824
12825 EVT VT = N->getValueType(ResNo: 0);
12826
12827 if (VT == MVT::i64) {
12828 SDValue NewNode = LowerVAARG(Op: SDValue(N, 1), DAG);
12829
12830 Results.push_back(Elt: NewNode);
12831 Results.push_back(Elt: NewNode.getValue(R: 1));
12832 }
12833 return;
12834 }
12835 case ISD::STRICT_FP_TO_SINT:
12836 case ISD::STRICT_FP_TO_UINT:
12837 case ISD::FP_TO_SINT:
12838 case ISD::FP_TO_UINT: {
12839 // LowerFP_TO_INT() can only handle f32 and f64.
12840 if (N->getOperand(Num: N->isStrictFPOpcode() ? 1 : 0).getValueType() ==
12841 MVT::ppcf128)
12842 return;
12843 SDValue LoweredValue = LowerFP_TO_INT(Op: SDValue(N, 0), DAG, dl);
12844 Results.push_back(Elt: LoweredValue);
12845 if (N->isStrictFPOpcode())
12846 Results.push_back(Elt: LoweredValue.getValue(R: 1));
12847 return;
12848 }
12849 case ISD::TRUNCATE: {
12850 if (!N->getValueType(ResNo: 0).isVector())
12851 return;
12852 SDValue Lowered = LowerTRUNCATEVector(Op: SDValue(N, 0), DAG);
12853 if (Lowered)
12854 Results.push_back(Elt: Lowered);
12855 return;
12856 }
12857 case ISD::SCALAR_TO_VECTOR: {
12858 SDValue Lowered = LowerSCALAR_TO_VECTOR(Op: SDValue(N, 0), DAG);
12859 if (Lowered)
12860 Results.push_back(Elt: Lowered);
12861 return;
12862 }
12863 case ISD::FSHL:
12864 case ISD::FSHR:
12865 // Don't handle funnel shifts here.
12866 return;
12867 case ISD::BITCAST:
12868 // Don't handle bitcast here.
12869 return;
12870 case ISD::FP_EXTEND:
12871 SDValue Lowered = LowerFP_EXTEND(Op: SDValue(N, 0), DAG);
12872 if (Lowered)
12873 Results.push_back(Elt: Lowered);
12874 return;
12875 }
12876}
12877
12878//===----------------------------------------------------------------------===//
12879// Other Lowering Code
12880//===----------------------------------------------------------------------===//
12881
12882static Instruction *callIntrinsic(IRBuilderBase &Builder, Intrinsic::ID Id) {
12883 return Builder.CreateIntrinsic(ID: Id, Args: {});
12884}
12885
12886Value *PPCTargetLowering::emitLoadLinked(IRBuilderBase &Builder, Type *ValueTy,
12887 Value *Addr,
12888 AtomicOrdering Ord) const {
12889 unsigned SZ = ValueTy->getPrimitiveSizeInBits();
12890
12891 assert((SZ == 8 || SZ == 16 || SZ == 32 || SZ == 64) &&
12892 "Only 8/16/32/64-bit atomic loads supported");
12893 Intrinsic::ID IntID;
12894 switch (SZ) {
12895 default:
12896 llvm_unreachable("Unexpected PrimitiveSize");
12897 case 8:
12898 IntID = Intrinsic::ppc_lbarx;
12899 assert(Subtarget.hasPartwordAtomics() && "No support for partword atomics.");
12900 break;
12901 case 16:
12902 IntID = Intrinsic::ppc_lharx;
12903 assert(Subtarget.hasPartwordAtomics() && "No support for partword atomics.");
12904 break;
12905 case 32:
12906 IntID = Intrinsic::ppc_lwarx;
12907 break;
12908 case 64:
12909 IntID = Intrinsic::ppc_ldarx;
12910 break;
12911 }
12912 Value *Call =
12913 Builder.CreateIntrinsic(ID: IntID, Args: Addr, /*FMFSource=*/nullptr, Name: "larx");
12914
12915 return Builder.CreateTruncOrBitCast(V: Call, DestTy: ValueTy);
12916}
12917
12918// Perform a store-conditional operation to Addr. Return the status of the
12919// store. This should be 0 if the store succeeded, non-zero otherwise.
12920Value *PPCTargetLowering::emitStoreConditional(IRBuilderBase &Builder,
12921 Value *Val, Value *Addr,
12922 AtomicOrdering Ord) const {
12923 Type *Ty = Val->getType();
12924 unsigned SZ = Ty->getPrimitiveSizeInBits();
12925
12926 assert((SZ == 8 || SZ == 16 || SZ == 32 || SZ == 64) &&
12927 "Only 8/16/32/64-bit atomic stores supported");
12928 Intrinsic::ID IntID;
12929 switch (SZ) {
12930 default:
12931 llvm_unreachable("Unexpected PrimitiveSize");
12932 case 8:
12933 IntID = Intrinsic::ppc_stbcx;
12934 assert(Subtarget.hasPartwordAtomics() && "No support for partword atomics.");
12935 break;
12936 case 16:
12937 IntID = Intrinsic::ppc_sthcx;
12938 assert(Subtarget.hasPartwordAtomics() && "No support for partword atomics.");
12939 break;
12940 case 32:
12941 IntID = Intrinsic::ppc_stwcx;
12942 break;
12943 case 64:
12944 IntID = Intrinsic::ppc_stdcx;
12945 break;
12946 }
12947
12948 if (SZ == 8 || SZ == 16)
12949 Val = Builder.CreateZExt(V: Val, DestTy: Builder.getInt32Ty());
12950
12951 Value *Call = Builder.CreateIntrinsic(ID: IntID, Args: {Addr, Val},
12952 /*FMFSource=*/nullptr, Name: "stcx");
12953 return Builder.CreateXor(LHS: Call, RHS: Builder.getInt32(C: 1));
12954}
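// A rough sketch of how the two hooks above compose: AtomicExpand is expected
// to stitch emitLoadLinked/emitStoreConditional into a retry loop of the form
//   loop:
//     l[bhwd]arx  dest, ptr
//     <compute new value>
//     st[bhwd]cx. new, ptr
//     bne-        loop
// mirroring the loops emitted directly by EmitAtomicBinary below. The xor with
// 1 in emitStoreConditional yields the "0 on success" convention noted above.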
12955
12956// The mappings for emitLeadingFence/emitTrailingFence are taken from
12957// http://www.cl.cam.ac.uk/~pes20/cpp/cpp0xmappings.html
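// Concretely, the two hooks below implement roughly the following choices:
//   leading fence:  sync   for seq_cst operations
//                   lwsync for release/acq_rel operations
//   trailing fence: cfence after acquire-or-stronger loads
//                   lwsync after acquire-or-stronger read-modify-write ops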
12958Instruction *PPCTargetLowering::emitLeadingFence(IRBuilderBase &Builder,
12959 Instruction *Inst,
12960 AtomicOrdering Ord) const {
12961 if (Ord == AtomicOrdering::SequentiallyConsistent)
12962 return callIntrinsic(Builder, Id: Intrinsic::ppc_sync);
12963 if (isReleaseOrStronger(AO: Ord))
12964 return callIntrinsic(Builder, Id: Intrinsic::ppc_lwsync);
12965 return nullptr;
12966}
12967
12968Instruction *PPCTargetLowering::emitTrailingFence(IRBuilderBase &Builder,
12969 Instruction *Inst,
12970 AtomicOrdering Ord) const {
12971 if (Inst->hasAtomicLoad() && isAcquireOrStronger(AO: Ord)) {
12972 // See http://www.cl.cam.ac.uk/~pes20/cpp/cpp0xmappings.html and
12973 // http://www.rdrop.com/users/paulmck/scalability/paper/N2745r.2011.03.04a.html
12974 // and http://www.cl.cam.ac.uk/~pes20/cppppc/ for justification.
12975 if (isa<LoadInst>(Val: Inst))
12976 return Builder.CreateIntrinsic(ID: Intrinsic::ppc_cfence, Types: {Inst->getType()},
12977 Args: {Inst});
12978 // FIXME: Can use isync for rmw operation.
12979 return callIntrinsic(Builder, Id: Intrinsic::ppc_lwsync);
12980 }
12981 return nullptr;
12982}
12983
12984MachineBasicBlock *
12985PPCTargetLowering::EmitAtomicBinary(MachineInstr &MI, MachineBasicBlock *BB,
12986 unsigned AtomicSize,
12987 unsigned BinOpcode,
12988 unsigned CmpOpcode,
12989 unsigned CmpPred) const {
12990 // This also handles ATOMIC_SWAP, indicated by BinOpcode==0.
12991 const TargetInstrInfo *TII = Subtarget.getInstrInfo();
12992
12993 auto LoadMnemonic = PPC::LDARX;
12994 auto StoreMnemonic = PPC::STDCX;
12995 switch (AtomicSize) {
12996 default:
12997 llvm_unreachable("Unexpected size of atomic entity");
12998 case 1:
12999 LoadMnemonic = PPC::LBARX;
13000 StoreMnemonic = PPC::STBCX;
13001 assert(Subtarget.hasPartwordAtomics() && "Partword atomics required for size < 4");
13002 break;
13003 case 2:
13004 LoadMnemonic = PPC::LHARX;
13005 StoreMnemonic = PPC::STHCX;
13006 assert(Subtarget.hasPartwordAtomics() && "Partword atomics required for size < 4");
13007 break;
13008 case 4:
13009 LoadMnemonic = PPC::LWARX;
13010 StoreMnemonic = PPC::STWCX;
13011 break;
13012 case 8:
13013 LoadMnemonic = PPC::LDARX;
13014 StoreMnemonic = PPC::STDCX;
13015 break;
13016 }
13017
13018 const BasicBlock *LLVM_BB = BB->getBasicBlock();
13019 MachineFunction *F = BB->getParent();
13020 MachineFunction::iterator It = ++BB->getIterator();
13021
13022 Register dest = MI.getOperand(i: 0).getReg();
13023 Register ptrA = MI.getOperand(i: 1).getReg();
13024 Register ptrB = MI.getOperand(i: 2).getReg();
13025 Register incr = MI.getOperand(i: 3).getReg();
13026 DebugLoc dl = MI.getDebugLoc();
13027
13028 MachineBasicBlock *loopMBB = F->CreateMachineBasicBlock(BB: LLVM_BB);
13029 MachineBasicBlock *loop2MBB =
13030 CmpOpcode ? F->CreateMachineBasicBlock(BB: LLVM_BB) : nullptr;
13031 MachineBasicBlock *exitMBB = F->CreateMachineBasicBlock(BB: LLVM_BB);
13032 F->insert(MBBI: It, MBB: loopMBB);
13033 if (CmpOpcode)
13034 F->insert(MBBI: It, MBB: loop2MBB);
13035 F->insert(MBBI: It, MBB: exitMBB);
13036 exitMBB->splice(Where: exitMBB->begin(), Other: BB,
13037 From: std::next(x: MachineBasicBlock::iterator(MI)), To: BB->end());
13038 exitMBB->transferSuccessorsAndUpdatePHIs(FromMBB: BB);
13039
13040 MachineRegisterInfo &RegInfo = F->getRegInfo();
13041 Register TmpReg = (!BinOpcode) ? incr :
13042 RegInfo.createVirtualRegister( RegClass: AtomicSize == 8 ? &PPC::G8RCRegClass
13043 : &PPC::GPRCRegClass);
13044
13045 // thisMBB:
13046 // ...
13047 // fallthrough --> loopMBB
13048 BB->addSuccessor(Succ: loopMBB);
13049
13050 // loopMBB:
13051 // l[wd]arx dest, ptr
13052 // add r0, dest, incr
13053 // st[wd]cx. r0, ptr
13054 // bne- loopMBB
13055 // fallthrough --> exitMBB
13056
13057 // For max/min...
13058 // loopMBB:
13059 // l[wd]arx dest, ptr
13060 // cmpl?[wd] dest, incr
13061 // bgt exitMBB
13062 // loop2MBB:
13063 // st[wd]cx. dest, ptr
13064 // bne- loopMBB
13065 // fallthrough --> exitMBB
13066
13067 BB = loopMBB;
13068 BuildMI(BB, MIMD: dl, MCID: TII->get(Opcode: LoadMnemonic), DestReg: dest)
13069 .addReg(RegNo: ptrA).addReg(RegNo: ptrB);
13070 if (BinOpcode)
13071 BuildMI(BB, MIMD: dl, MCID: TII->get(Opcode: BinOpcode), DestReg: TmpReg).addReg(RegNo: incr).addReg(RegNo: dest);
13072 if (CmpOpcode) {
13073 Register CrReg = RegInfo.createVirtualRegister(RegClass: &PPC::CRRCRegClass);
13074 // Signed comparisons of byte or halfword values must be sign-extended.
13075 if (CmpOpcode == PPC::CMPW && AtomicSize < 4) {
13076 Register ExtReg = RegInfo.createVirtualRegister(RegClass: &PPC::GPRCRegClass);
13077 BuildMI(BB, MIMD: dl, MCID: TII->get(Opcode: AtomicSize == 1 ? PPC::EXTSB : PPC::EXTSH),
13078 DestReg: ExtReg).addReg(RegNo: dest);
13079 BuildMI(BB, MIMD: dl, MCID: TII->get(Opcode: CmpOpcode), DestReg: CrReg).addReg(RegNo: ExtReg).addReg(RegNo: incr);
13080 } else
13081 BuildMI(BB, MIMD: dl, MCID: TII->get(Opcode: CmpOpcode), DestReg: CrReg).addReg(RegNo: dest).addReg(RegNo: incr);
13082
13083 BuildMI(BB, MIMD: dl, MCID: TII->get(Opcode: PPC::BCC))
13084 .addImm(Val: CmpPred)
13085 .addReg(RegNo: CrReg)
13086 .addMBB(MBB: exitMBB);
13087 BB->addSuccessor(Succ: loop2MBB);
13088 BB->addSuccessor(Succ: exitMBB);
13089 BB = loop2MBB;
13090 }
13091 BuildMI(BB, MIMD: dl, MCID: TII->get(Opcode: StoreMnemonic))
13092 .addReg(RegNo: TmpReg).addReg(RegNo: ptrA).addReg(RegNo: ptrB);
13093 BuildMI(BB, MIMD: dl, MCID: TII->get(Opcode: PPC::BCC))
13094 .addImm(Val: PPC::PRED_NE_MINUS)
13095 .addReg(RegNo: PPC::CR0)
13096 .addMBB(MBB: loopMBB);
13097 BB->addSuccessor(Succ: loopMBB);
13098 BB->addSuccessor(Succ: exitMBB);
13099
13100 // exitMBB:
13101 // ...
13102 BB = exitMBB;
13103 return BB;
13104}
13105
13106static bool isSignExtended(MachineInstr &MI, const PPCInstrInfo *TII) {
13107 switch(MI.getOpcode()) {
13108 default:
13109 return false;
13110 case PPC::COPY:
13111 return TII->isSignExtended(Reg: MI.getOperand(i: 1).getReg(),
13112 MRI: &MI.getMF()->getRegInfo());
13113 case PPC::LHA:
13114 case PPC::LHA8:
13115 case PPC::LHAU:
13116 case PPC::LHAU8:
13117 case PPC::LHAUX:
13118 case PPC::LHAUX8:
13119 case PPC::LHAX:
13120 case PPC::LHAX8:
13121 case PPC::LWA:
13122 case PPC::LWAUX:
13123 case PPC::LWAX:
13124 case PPC::LWAX_32:
13125 case PPC::LWA_32:
13126 case PPC::PLHA:
13127 case PPC::PLHA8:
13128 case PPC::PLHA8pc:
13129 case PPC::PLHApc:
13130 case PPC::PLWA:
13131 case PPC::PLWA8:
13132 case PPC::PLWA8pc:
13133 case PPC::PLWApc:
13134 case PPC::EXTSB:
13135 case PPC::EXTSB8:
13136 case PPC::EXTSB8_32_64:
13137 case PPC::EXTSB8_rec:
13138 case PPC::EXTSB_rec:
13139 case PPC::EXTSH:
13140 case PPC::EXTSH8:
13141 case PPC::EXTSH8_32_64:
13142 case PPC::EXTSH8_rec:
13143 case PPC::EXTSH_rec:
13144 case PPC::EXTSW:
13145 case PPC::EXTSWSLI:
13146 case PPC::EXTSWSLI_32_64:
13147 case PPC::EXTSWSLI_32_64_rec:
13148 case PPC::EXTSWSLI_rec:
13149 case PPC::EXTSW_32:
13150 case PPC::EXTSW_32_64:
13151 case PPC::EXTSW_32_64_rec:
13152 case PPC::EXTSW_rec:
13153 case PPC::SRAW:
13154 case PPC::SRAWI:
13155 case PPC::SRAWI_rec:
13156 case PPC::SRAW_rec:
13157 return true;
13158 }
13159 return false;
13160}
13161
13162MachineBasicBlock *PPCTargetLowering::EmitPartwordAtomicBinary(
13163 MachineInstr &MI, MachineBasicBlock *BB,
13164 bool is8bit, // operation
13165 unsigned BinOpcode, unsigned CmpOpcode, unsigned CmpPred) const {
13166 // This also handles ATOMIC_SWAP, indicated by BinOpcode==0.
13167 const PPCInstrInfo *TII = Subtarget.getInstrInfo();
13168
13169 // If this is a signed comparison and the value being compared is not known
13170 // to be sign extended, sign extend it here.
13171 DebugLoc dl = MI.getDebugLoc();
13172 MachineFunction *F = BB->getParent();
13173 MachineRegisterInfo &RegInfo = F->getRegInfo();
13174 Register incr = MI.getOperand(i: 3).getReg();
13175 bool IsSignExtended =
13176 incr.isVirtual() && isSignExtended(MI&: *RegInfo.getVRegDef(Reg: incr), TII);
13177
13178 if (CmpOpcode == PPC::CMPW && !IsSignExtended) {
13179 Register ValueReg = RegInfo.createVirtualRegister(RegClass: &PPC::GPRCRegClass);
13180 BuildMI(BB&: *BB, I&: MI, MIMD: dl, MCID: TII->get(Opcode: is8bit ? PPC::EXTSB : PPC::EXTSH), DestReg: ValueReg)
13181 .addReg(RegNo: MI.getOperand(i: 3).getReg());
13182 MI.getOperand(i: 3).setReg(ValueReg);
13183 incr = ValueReg;
13184 }
13185 // If we support part-word atomic mnemonics, just use them
13186 if (Subtarget.hasPartwordAtomics())
13187 return EmitAtomicBinary(MI, BB, AtomicSize: is8bit ? 1 : 2, BinOpcode, CmpOpcode,
13188 CmpPred);
13189
13190 // In 64-bit mode we have to use 64-bit registers for addresses, even though
13191 // lwarx/stwcx. operate on 32-bit values. With the 32-bit atomics we can use
13192 // address registers without caring whether they're 32 or 64 bits wide, but
13193 // here we're doing actual arithmetic on the addresses.
13194 bool is64bit = Subtarget.isPPC64();
13195 bool isLittleEndian = Subtarget.isLittleEndian();
13196 unsigned ZeroReg = is64bit ? PPC::ZERO8 : PPC::ZERO;
13197
13198 const BasicBlock *LLVM_BB = BB->getBasicBlock();
13199 MachineFunction::iterator It = ++BB->getIterator();
13200
13201 Register dest = MI.getOperand(i: 0).getReg();
13202 Register ptrA = MI.getOperand(i: 1).getReg();
13203 Register ptrB = MI.getOperand(i: 2).getReg();
13204
13205 MachineBasicBlock *loopMBB = F->CreateMachineBasicBlock(BB: LLVM_BB);
13206 MachineBasicBlock *loop2MBB =
13207 CmpOpcode ? F->CreateMachineBasicBlock(BB: LLVM_BB) : nullptr;
13208 MachineBasicBlock *exitMBB = F->CreateMachineBasicBlock(BB: LLVM_BB);
13209 F->insert(MBBI: It, MBB: loopMBB);
13210 if (CmpOpcode)
13211 F->insert(MBBI: It, MBB: loop2MBB);
13212 F->insert(MBBI: It, MBB: exitMBB);
13213 exitMBB->splice(Where: exitMBB->begin(), Other: BB,
13214 From: std::next(x: MachineBasicBlock::iterator(MI)), To: BB->end());
13215 exitMBB->transferSuccessorsAndUpdatePHIs(FromMBB: BB);
13216
13217 const TargetRegisterClass *RC =
13218 is64bit ? &PPC::G8RCRegClass : &PPC::GPRCRegClass;
13219 const TargetRegisterClass *GPRC = &PPC::GPRCRegClass;
13220
13221 Register PtrReg = RegInfo.createVirtualRegister(RegClass: RC);
13222 Register Shift1Reg = RegInfo.createVirtualRegister(RegClass: GPRC);
13223 Register ShiftReg =
13224 isLittleEndian ? Shift1Reg : RegInfo.createVirtualRegister(RegClass: GPRC);
13225 Register Incr2Reg = RegInfo.createVirtualRegister(RegClass: GPRC);
13226 Register MaskReg = RegInfo.createVirtualRegister(RegClass: GPRC);
13227 Register Mask2Reg = RegInfo.createVirtualRegister(RegClass: GPRC);
13228 Register Mask3Reg = RegInfo.createVirtualRegister(RegClass: GPRC);
13229 Register Tmp2Reg = RegInfo.createVirtualRegister(RegClass: GPRC);
13230 Register Tmp3Reg = RegInfo.createVirtualRegister(RegClass: GPRC);
13231 Register Tmp4Reg = RegInfo.createVirtualRegister(RegClass: GPRC);
13232 Register TmpDestReg = RegInfo.createVirtualRegister(RegClass: GPRC);
13233 Register SrwDestReg = RegInfo.createVirtualRegister(RegClass: GPRC);
13234 Register Ptr1Reg;
13235 Register TmpReg =
13236 (!BinOpcode) ? Incr2Reg : RegInfo.createVirtualRegister(RegClass: GPRC);
13237
13238 // thisMBB:
13239 // ...
13240 // fallthrough --> loopMBB
13241 BB->addSuccessor(Succ: loopMBB);
13242
13243 // The 4-byte load must be aligned, while a char or short may be
13244 // anywhere in the word. Hence all this nasty bookkeeping code.
13245 // add ptr1, ptrA, ptrB [copy if ptrA==0]
13246 // rlwinm shift1, ptr1, 3, 27, 28 [3, 27, 27]
13247 // xori shift, shift1, 24 [16]
13248 // rlwinm ptr, ptr1, 0, 0, 29
13249 // slw incr2, incr, shift
13250 // li mask2, 255 [li mask3, 0; ori mask2, mask3, 65535]
13251 // slw mask, mask2, shift
13252 // loopMBB:
13253 // lwarx tmpDest, ptr
13254 // add tmp, tmpDest, incr2
13255 // andc tmp2, tmpDest, mask
13256 // and tmp3, tmp, mask
13257 // or tmp4, tmp3, tmp2
13258 // stwcx. tmp4, ptr
13259 // bne- loopMBB
13260 // fallthrough --> exitMBB
13261 // srw SrwDest, tmpDest, shift
13262 // rlwinm SrwDest, SrwDest, 0, 24 [16], 31
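  // For example (byte case): a byte at offset 1 within its aligned word gives
  // shift1 = 8; on little-endian that is used directly, while on big-endian the
  // xori with 24 turns it into 16, selecting bits 16..23 of the loaded word.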
13263 if (ptrA != ZeroReg) {
13264 Ptr1Reg = RegInfo.createVirtualRegister(RegClass: RC);
13265 BuildMI(BB, MIMD: dl, MCID: TII->get(Opcode: is64bit ? PPC::ADD8 : PPC::ADD4), DestReg: Ptr1Reg)
13266 .addReg(RegNo: ptrA)
13267 .addReg(RegNo: ptrB);
13268 } else {
13269 Ptr1Reg = ptrB;
13270 }
13271 // We need to use the 32-bit subregister here to avoid a register-class
13272 // mismatch in 64-bit mode.
13273 BuildMI(BB, MIMD: dl, MCID: TII->get(Opcode: PPC::RLWINM), DestReg: Shift1Reg)
13274 .addReg(RegNo: Ptr1Reg, Flags: {}, SubReg: is64bit ? PPC::sub_32 : 0)
13275 .addImm(Val: 3)
13276 .addImm(Val: 27)
13277 .addImm(Val: is8bit ? 28 : 27);
13278 if (!isLittleEndian)
13279 BuildMI(BB, MIMD: dl, MCID: TII->get(Opcode: PPC::XORI), DestReg: ShiftReg)
13280 .addReg(RegNo: Shift1Reg)
13281 .addImm(Val: is8bit ? 24 : 16);
13282 if (is64bit)
13283 BuildMI(BB, MIMD: dl, MCID: TII->get(Opcode: PPC::RLDICR), DestReg: PtrReg)
13284 .addReg(RegNo: Ptr1Reg)
13285 .addImm(Val: 0)
13286 .addImm(Val: 61);
13287 else
13288 BuildMI(BB, MIMD: dl, MCID: TII->get(Opcode: PPC::RLWINM), DestReg: PtrReg)
13289 .addReg(RegNo: Ptr1Reg)
13290 .addImm(Val: 0)
13291 .addImm(Val: 0)
13292 .addImm(Val: 29);
13293 BuildMI(BB, MIMD: dl, MCID: TII->get(Opcode: PPC::SLW), DestReg: Incr2Reg).addReg(RegNo: incr).addReg(RegNo: ShiftReg);
13294 if (is8bit)
13295 BuildMI(BB, MIMD: dl, MCID: TII->get(Opcode: PPC::LI), DestReg: Mask2Reg).addImm(Val: 255);
13296 else {
13297 BuildMI(BB, MIMD: dl, MCID: TII->get(Opcode: PPC::LI), DestReg: Mask3Reg).addImm(Val: 0);
13298 BuildMI(BB, MIMD: dl, MCID: TII->get(Opcode: PPC::ORI), DestReg: Mask2Reg)
13299 .addReg(RegNo: Mask3Reg)
13300 .addImm(Val: 65535);
13301 }
13302 BuildMI(BB, MIMD: dl, MCID: TII->get(Opcode: PPC::SLW), DestReg: MaskReg)
13303 .addReg(RegNo: Mask2Reg)
13304 .addReg(RegNo: ShiftReg);
13305
13306 BB = loopMBB;
13307 BuildMI(BB, MIMD: dl, MCID: TII->get(Opcode: PPC::LWARX), DestReg: TmpDestReg)
13308 .addReg(RegNo: ZeroReg)
13309 .addReg(RegNo: PtrReg);
13310 if (BinOpcode)
13311 BuildMI(BB, MIMD: dl, MCID: TII->get(Opcode: BinOpcode), DestReg: TmpReg)
13312 .addReg(RegNo: Incr2Reg)
13313 .addReg(RegNo: TmpDestReg);
13314 BuildMI(BB, MIMD: dl, MCID: TII->get(Opcode: PPC::ANDC), DestReg: Tmp2Reg)
13315 .addReg(RegNo: TmpDestReg)
13316 .addReg(RegNo: MaskReg);
13317 BuildMI(BB, MIMD: dl, MCID: TII->get(Opcode: PPC::AND), DestReg: Tmp3Reg).addReg(RegNo: TmpReg).addReg(RegNo: MaskReg);
13318 if (CmpOpcode) {
13319 // For unsigned comparisons, we can directly compare the shifted values.
13320 // For signed comparisons we shift and sign extend.
13321 Register SReg = RegInfo.createVirtualRegister(RegClass: GPRC);
13322 Register CrReg = RegInfo.createVirtualRegister(RegClass: &PPC::CRRCRegClass);
13323 BuildMI(BB, MIMD: dl, MCID: TII->get(Opcode: PPC::AND), DestReg: SReg)
13324 .addReg(RegNo: TmpDestReg)
13325 .addReg(RegNo: MaskReg);
13326 unsigned ValueReg = SReg;
13327 unsigned CmpReg = Incr2Reg;
13328 if (CmpOpcode == PPC::CMPW) {
13329 ValueReg = RegInfo.createVirtualRegister(RegClass: GPRC);
13330 BuildMI(BB, MIMD: dl, MCID: TII->get(Opcode: PPC::SRW), DestReg: ValueReg)
13331 .addReg(RegNo: SReg)
13332 .addReg(RegNo: ShiftReg);
13333 Register ValueSReg = RegInfo.createVirtualRegister(RegClass: GPRC);
13334 BuildMI(BB, MIMD: dl, MCID: TII->get(Opcode: is8bit ? PPC::EXTSB : PPC::EXTSH), DestReg: ValueSReg)
13335 .addReg(RegNo: ValueReg);
13336 ValueReg = ValueSReg;
13337 CmpReg = incr;
13338 }
13339 BuildMI(BB, MIMD: dl, MCID: TII->get(Opcode: CmpOpcode), DestReg: CrReg).addReg(RegNo: ValueReg).addReg(RegNo: CmpReg);
13340 BuildMI(BB, MIMD: dl, MCID: TII->get(Opcode: PPC::BCC))
13341 .addImm(Val: CmpPred)
13342 .addReg(RegNo: CrReg)
13343 .addMBB(MBB: exitMBB);
13344 BB->addSuccessor(Succ: loop2MBB);
13345 BB->addSuccessor(Succ: exitMBB);
13346 BB = loop2MBB;
13347 }
13348 BuildMI(BB, MIMD: dl, MCID: TII->get(Opcode: PPC::OR), DestReg: Tmp4Reg).addReg(RegNo: Tmp3Reg).addReg(RegNo: Tmp2Reg);
13349 BuildMI(BB, MIMD: dl, MCID: TII->get(Opcode: PPC::STWCX))
13350 .addReg(RegNo: Tmp4Reg)
13351 .addReg(RegNo: ZeroReg)
13352 .addReg(RegNo: PtrReg);
13353 BuildMI(BB, MIMD: dl, MCID: TII->get(Opcode: PPC::BCC))
13354 .addImm(Val: PPC::PRED_NE_MINUS)
13355 .addReg(RegNo: PPC::CR0)
13356 .addMBB(MBB: loopMBB);
13357 BB->addSuccessor(Succ: loopMBB);
13358 BB->addSuccessor(Succ: exitMBB);
13359
13360 // exitMBB:
13361 // ...
13362 BB = exitMBB;
13363 // Since the shift amount is not a constant, we need to clear
13364 // the upper bits with a separate RLWINM.
13365 BuildMI(BB&: *BB, I: BB->begin(), MIMD: dl, MCID: TII->get(Opcode: PPC::RLWINM), DestReg: dest)
13366 .addReg(RegNo: SrwDestReg)
13367 .addImm(Val: 0)
13368 .addImm(Val: is8bit ? 24 : 16)
13369 .addImm(Val: 31);
13370 BuildMI(BB&: *BB, I: BB->begin(), MIMD: dl, MCID: TII->get(Opcode: PPC::SRW), DestReg: SrwDestReg)
13371 .addReg(RegNo: TmpDestReg)
13372 .addReg(RegNo: ShiftReg);
13373 return BB;
13374}
13375
13376llvm::MachineBasicBlock *
13377PPCTargetLowering::emitEHSjLjSetJmp(MachineInstr &MI,
13378 MachineBasicBlock *MBB) const {
13379 DebugLoc DL = MI.getDebugLoc();
13380 const TargetInstrInfo *TII = Subtarget.getInstrInfo();
13381 const PPCRegisterInfo *TRI = Subtarget.getRegisterInfo();
13382
13383 MachineFunction *MF = MBB->getParent();
13384 MachineRegisterInfo &MRI = MF->getRegInfo();
13385
13386 const BasicBlock *BB = MBB->getBasicBlock();
13387 MachineFunction::iterator I = ++MBB->getIterator();
13388
13389 Register DstReg = MI.getOperand(i: 0).getReg();
13390 const TargetRegisterClass *RC = MRI.getRegClass(Reg: DstReg);
13391 assert(TRI->isTypeLegalForClass(*RC, MVT::i32) && "Invalid destination!");
13392 Register mainDstReg = MRI.createVirtualRegister(RegClass: RC);
13393 Register restoreDstReg = MRI.createVirtualRegister(RegClass: RC);
13394
13395 MVT PVT = getPointerTy(DL: MF->getDataLayout());
13396 assert((PVT == MVT::i64 || PVT == MVT::i32) &&
13397 "Invalid Pointer Size!");
13398 // For v = setjmp(buf), we generate
13399 //
13400 // thisMBB:
13401 // SjLjSetup mainMBB
13402 // bl mainMBB
13403 // v_restore = 1
13404 // b sinkMBB
13405 //
13406 // mainMBB:
13407 // buf[LabelOffset] = LR
13408 // v_main = 0
13409 //
13410 // sinkMBB:
13411 // v = phi(main, restore)
13412 //
13413
13414 MachineBasicBlock *thisMBB = MBB;
13415 MachineBasicBlock *mainMBB = MF->CreateMachineBasicBlock(BB);
13416 MachineBasicBlock *sinkMBB = MF->CreateMachineBasicBlock(BB);
13417 MF->insert(MBBI: I, MBB: mainMBB);
13418 MF->insert(MBBI: I, MBB: sinkMBB);
13419
13420 MachineInstrBuilder MIB;
13421
13422 // Transfer the remainder of BB and its successor edges to sinkMBB.
13423 sinkMBB->splice(Where: sinkMBB->begin(), Other: MBB,
13424 From: std::next(x: MachineBasicBlock::iterator(MI)), To: MBB->end());
13425 sinkMBB->transferSuccessorsAndUpdatePHIs(FromMBB: MBB);
13426
13427 // Note that the structure of the jmp_buf used here is not compatible
13428 // with that used by libc, and is not designed to be. Specifically, it
13429 // stores only those 'reserved' registers that LLVM does not otherwise
13430 // understand how to spill. Also, by convention, by the time this
13431 // intrinsic is called, Clang has already stored the frame address in the
13432 // first slot of the buffer and stack address in the third. Following the
13433 // X86 target code, we'll store the jump address in the second slot. We also
13434 // need to save the TOC pointer (R2) to handle jumps between shared
13435 // libraries, and that will be stored in the fourth slot. The thread
13436 // identifier (R13) is not affected.
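  // In pointer-sized slots, the layout used here is therefore roughly:
  //   buf[0] = frame address   (stored by Clang)
  //   buf[1] = jump address    (LR, stored below)
  //   buf[2] = stack address   (stored by Clang)
  //   buf[3] = TOC pointer     (R2, 64-bit ELF only)
  //   buf[4] = base pointer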
13437
13438 // thisMBB:
13439 const int64_t LabelOffset = 1 * PVT.getStoreSize();
13440 const int64_t TOCOffset = 3 * PVT.getStoreSize();
13441 const int64_t BPOffset = 4 * PVT.getStoreSize();
13442
13443 // Prepare the IP (the address to jump back to) in a register.
13444 const TargetRegisterClass *PtrRC = getRegClassFor(VT: PVT);
13445 Register LabelReg = MRI.createVirtualRegister(RegClass: PtrRC);
13446 Register BufReg = MI.getOperand(i: 1).getReg();
13447
13448 if (Subtarget.is64BitELFABI()) {
13449 setUsesTOCBasePtr(*MBB->getParent());
13450 MIB = BuildMI(BB&: *thisMBB, I&: MI, MIMD: DL, MCID: TII->get(Opcode: PPC::STD))
13451 .addReg(RegNo: PPC::X2)
13452 .addImm(Val: TOCOffset)
13453 .addReg(RegNo: BufReg)
13454 .cloneMemRefs(OtherMI: MI);
13455 }
13456
13457 // Naked functions never have a base pointer, and so we use r1. For all
13458 // other functions, this decision must be deferred until PEI.
13459 unsigned BaseReg;
13460 if (MF->getFunction().hasFnAttribute(Kind: Attribute::Naked))
13461 BaseReg = Subtarget.isPPC64() ? PPC::X1 : PPC::R1;
13462 else
13463 BaseReg = Subtarget.isPPC64() ? PPC::BP8 : PPC::BP;
13464
13465 MIB = BuildMI(BB&: *thisMBB, I&: MI, MIMD: DL,
13466 MCID: TII->get(Opcode: Subtarget.isPPC64() ? PPC::STD : PPC::STW))
13467 .addReg(RegNo: BaseReg)
13468 .addImm(Val: BPOffset)
13469 .addReg(RegNo: BufReg)
13470 .cloneMemRefs(OtherMI: MI);
13471
13472 // Setup
13473 MIB = BuildMI(BB&: *thisMBB, I&: MI, MIMD: DL, MCID: TII->get(Opcode: PPC::BCLalways)).addMBB(MBB: mainMBB);
13474 MIB.addRegMask(Mask: TRI->getNoPreservedMask());
13475
13476 BuildMI(BB&: *thisMBB, I&: MI, MIMD: DL, MCID: TII->get(Opcode: PPC::LI), DestReg: restoreDstReg).addImm(Val: 1);
13477
13478 MIB = BuildMI(BB&: *thisMBB, I&: MI, MIMD: DL, MCID: TII->get(Opcode: PPC::EH_SjLj_Setup))
13479 .addMBB(MBB: mainMBB);
13480 MIB = BuildMI(BB&: *thisMBB, I&: MI, MIMD: DL, MCID: TII->get(Opcode: PPC::B)).addMBB(MBB: sinkMBB);
13481
13482 thisMBB->addSuccessor(Succ: mainMBB, Prob: BranchProbability::getZero());
13483 thisMBB->addSuccessor(Succ: sinkMBB, Prob: BranchProbability::getOne());
13484
13485 // mainMBB:
13486 // mainDstReg = 0
13487 MIB =
13488 BuildMI(BB: mainMBB, MIMD: DL,
13489 MCID: TII->get(Opcode: Subtarget.isPPC64() ? PPC::MFLR8 : PPC::MFLR), DestReg: LabelReg);
13490
13491 // Store IP
13492 if (Subtarget.isPPC64()) {
13493 MIB = BuildMI(BB: mainMBB, MIMD: DL, MCID: TII->get(Opcode: PPC::STD))
13494 .addReg(RegNo: LabelReg)
13495 .addImm(Val: LabelOffset)
13496 .addReg(RegNo: BufReg);
13497 } else {
13498 MIB = BuildMI(BB: mainMBB, MIMD: DL, MCID: TII->get(Opcode: PPC::STW))
13499 .addReg(RegNo: LabelReg)
13500 .addImm(Val: LabelOffset)
13501 .addReg(RegNo: BufReg);
13502 }
13503 MIB.cloneMemRefs(OtherMI: MI);
13504
13505 BuildMI(BB: mainMBB, MIMD: DL, MCID: TII->get(Opcode: PPC::LI), DestReg: mainDstReg).addImm(Val: 0);
13506 mainMBB->addSuccessor(Succ: sinkMBB);
13507
13508 // sinkMBB:
13509 BuildMI(BB&: *sinkMBB, I: sinkMBB->begin(), MIMD: DL,
13510 MCID: TII->get(Opcode: PPC::PHI), DestReg: DstReg)
13511 .addReg(RegNo: mainDstReg).addMBB(MBB: mainMBB)
13512 .addReg(RegNo: restoreDstReg).addMBB(MBB: thisMBB);
13513
13514 MI.eraseFromParent();
13515 return sinkMBB;
13516}
13517
13518MachineBasicBlock *
13519PPCTargetLowering::emitEHSjLjLongJmp(MachineInstr &MI,
13520 MachineBasicBlock *MBB) const {
13521 DebugLoc DL = MI.getDebugLoc();
13522 const TargetInstrInfo *TII = Subtarget.getInstrInfo();
13523
13524 MachineFunction *MF = MBB->getParent();
13525 MachineRegisterInfo &MRI = MF->getRegInfo();
13526
13527 MVT PVT = getPointerTy(DL: MF->getDataLayout());
13528 assert((PVT == MVT::i64 || PVT == MVT::i32) &&
13529 "Invalid Pointer Size!");
13530
13531 const TargetRegisterClass *RC =
13532 (PVT == MVT::i64) ? &PPC::G8RCRegClass : &PPC::GPRCRegClass;
13533 Register Tmp = MRI.createVirtualRegister(RegClass: RC);
13534 // Since FP is only updated here but NOT referenced, it's treated as a GPR.
13535 unsigned FP = (PVT == MVT::i64) ? PPC::X31 : PPC::R31;
13536 unsigned SP = (PVT == MVT::i64) ? PPC::X1 : PPC::R1;
13537 unsigned BP =
13538 (PVT == MVT::i64)
13539 ? PPC::X30
13540 : (Subtarget.isSVR4ABI() && isPositionIndependent() ? PPC::R29
13541 : PPC::R30);
13542
13543 MachineInstrBuilder MIB;
13544
13545 const int64_t LabelOffset = 1 * PVT.getStoreSize();
13546 const int64_t SPOffset = 2 * PVT.getStoreSize();
13547 const int64_t TOCOffset = 3 * PVT.getStoreSize();
13548 const int64_t BPOffset = 4 * PVT.getStoreSize();
13549
13550 Register BufReg = MI.getOperand(i: 0).getReg();
13551
13552 // Reload FP (the jumped-to function may not have had a
13553 // frame pointer, and if so, then its r31 will be restored
13554 // as necessary).
13555 if (PVT == MVT::i64) {
13556 MIB = BuildMI(BB&: *MBB, I&: MI, MIMD: DL, MCID: TII->get(Opcode: PPC::LD), DestReg: FP)
13557 .addImm(Val: 0)
13558 .addReg(RegNo: BufReg);
13559 } else {
13560 MIB = BuildMI(BB&: *MBB, I&: MI, MIMD: DL, MCID: TII->get(Opcode: PPC::LWZ), DestReg: FP)
13561 .addImm(Val: 0)
13562 .addReg(RegNo: BufReg);
13563 }
13564 MIB.cloneMemRefs(OtherMI: MI);
13565
13566 // Reload IP
13567 if (PVT == MVT::i64) {
13568 MIB = BuildMI(BB&: *MBB, I&: MI, MIMD: DL, MCID: TII->get(Opcode: PPC::LD), DestReg: Tmp)
13569 .addImm(Val: LabelOffset)
13570 .addReg(RegNo: BufReg);
13571 } else {
13572 MIB = BuildMI(BB&: *MBB, I&: MI, MIMD: DL, MCID: TII->get(Opcode: PPC::LWZ), DestReg: Tmp)
13573 .addImm(Val: LabelOffset)
13574 .addReg(RegNo: BufReg);
13575 }
13576 MIB.cloneMemRefs(OtherMI: MI);
13577
13578 // Reload SP
13579 if (PVT == MVT::i64) {
13580 MIB = BuildMI(BB&: *MBB, I&: MI, MIMD: DL, MCID: TII->get(Opcode: PPC::LD), DestReg: SP)
13581 .addImm(Val: SPOffset)
13582 .addReg(RegNo: BufReg);
13583 } else {
13584 MIB = BuildMI(BB&: *MBB, I&: MI, MIMD: DL, MCID: TII->get(Opcode: PPC::LWZ), DestReg: SP)
13585 .addImm(Val: SPOffset)
13586 .addReg(RegNo: BufReg);
13587 }
13588 MIB.cloneMemRefs(OtherMI: MI);
13589
13590 // Reload BP
13591 if (PVT == MVT::i64) {
13592 MIB = BuildMI(BB&: *MBB, I&: MI, MIMD: DL, MCID: TII->get(Opcode: PPC::LD), DestReg: BP)
13593 .addImm(Val: BPOffset)
13594 .addReg(RegNo: BufReg);
13595 } else {
13596 MIB = BuildMI(BB&: *MBB, I&: MI, MIMD: DL, MCID: TII->get(Opcode: PPC::LWZ), DestReg: BP)
13597 .addImm(Val: BPOffset)
13598 .addReg(RegNo: BufReg);
13599 }
13600 MIB.cloneMemRefs(OtherMI: MI);
13601
13602 // Reload TOC
13603 if (PVT == MVT::i64 && Subtarget.isSVR4ABI()) {
13604 setUsesTOCBasePtr(*MBB->getParent());
13605 MIB = BuildMI(BB&: *MBB, I&: MI, MIMD: DL, MCID: TII->get(Opcode: PPC::LD), DestReg: PPC::X2)
13606 .addImm(Val: TOCOffset)
13607 .addReg(RegNo: BufReg)
13608 .cloneMemRefs(OtherMI: MI);
13609 }
13610
13611 // Jump
13612 BuildMI(BB&: *MBB, I&: MI, MIMD: DL,
13613 MCID: TII->get(Opcode: PVT == MVT::i64 ? PPC::MTCTR8 : PPC::MTCTR)).addReg(RegNo: Tmp);
13614 BuildMI(BB&: *MBB, I&: MI, MIMD: DL, MCID: TII->get(Opcode: PVT == MVT::i64 ? PPC::BCTR8 : PPC::BCTR));
13615
13616 MI.eraseFromParent();
13617 return MBB;
13618}
13619
13620bool PPCTargetLowering::hasInlineStackProbe(const MachineFunction &MF) const {
13621 // If the function specifically requests inline stack probes, emit them.
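  // For illustration, that corresponds to IR carrying the attribute
  // "probe-stack"="inline-asm", e.g.:
  //   define void @f() "probe-stack"="inline-asm" { ... }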
13622 if (MF.getFunction().hasFnAttribute(Kind: "probe-stack"))
13623 return MF.getFunction().getFnAttribute(Kind: "probe-stack").getValueAsString() ==
13624 "inline-asm";
13625 return false;
13626}
13627
13628unsigned PPCTargetLowering::getStackProbeSize(const MachineFunction &MF) const {
13629 const TargetFrameLowering *TFI = Subtarget.getFrameLowering();
13630 unsigned StackAlign = TFI->getStackAlignment();
13631 assert(StackAlign >= 1 && isPowerOf2_32(StackAlign) &&
13632 "Unexpected stack alignment");
13633 // The default stack probe size is 4096 if the function has no
13634 // stack-probe-size attribute.
13635 const Function &Fn = MF.getFunction();
13636 unsigned StackProbeSize =
13637 Fn.getFnAttributeAsParsedInteger(Kind: "stack-probe-size", Default: 4096);
13638 // Round down to the stack alignment.
13639 StackProbeSize &= ~(StackAlign - 1);
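  // For example, with a 16-byte stack alignment a "stack-probe-size" of 1000
  // rounds down to 992; a value smaller than the alignment (e.g. 8) becomes 0
  // here and degenerates to StackAlign (16) below.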
13640 return StackProbeSize ? StackProbeSize : StackAlign;
13641}
13642
13643// Lower dynamic stack allocation with probing. `emitProbedAlloca` is split
13644// into three phases. In the first phase, it uses the pseudo instruction
13645// PREPARE_PROBED_ALLOCA to get the future result of the actual FramePointer and
13646// FinalStackPtr. In the second phase, it generates a loop for probing blocks.
13647// Finally, it uses the pseudo instruction DYNAREAOFFSET to get the future result
13648// of MaxCallFrameSize so that it can compute the correct data area pointer.
13649MachineBasicBlock *
13650PPCTargetLowering::emitProbedAlloca(MachineInstr &MI,
13651 MachineBasicBlock *MBB) const {
13652 const bool isPPC64 = Subtarget.isPPC64();
13653 MachineFunction *MF = MBB->getParent();
13654 const TargetInstrInfo *TII = Subtarget.getInstrInfo();
13655 DebugLoc DL = MI.getDebugLoc();
13656 const unsigned ProbeSize = getStackProbeSize(MF: *MF);
13657 const BasicBlock *ProbedBB = MBB->getBasicBlock();
13658 MachineRegisterInfo &MRI = MF->getRegInfo();
13659 // The CFG of probing stack looks as
13660 // +-----+
13661 // | MBB |
13662 // +--+--+
13663 // |
13664 // +----v----+
13665 // +--->+ TestMBB +---+
13666 // | +----+----+ |
13667 // | | |
13668 // | +-----v----+ |
13669 // +---+ BlockMBB | |
13670 // +----------+ |
13671 // |
13672 // +---------+ |
13673 // | TailMBB +<--+
13674 // +---------+
13675 // In MBB, calculate previous frame pointer and final stack pointer.
13676 // In TestMBB, test if sp is equal to final stack pointer, if so, jump to
13677 // TailMBB. In BlockMBB, update the sp atomically and jump back to TestMBB.
13678 // TailMBB is spliced via \p MI.
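  // Roughly (64-bit case), the emitted probing sequence looks like:
  //   divd   Div, ActualNegSize, ScratchReg   ; ScratchReg = -ProbeSize
  //   mulld  Mul, Div, ScratchReg
  //   subf   NegMod, Mul, ActualNegSize       ; leading residual
  //   stdux  FP, SP, NegMod                   ; probe the residual, bump SP
  // TestMBB:
  //   cmpd   SP, FinalStackPtr
  //   beq    TailMBB
  // BlockMBB:
  //   stdux  FP, SP, ScratchReg               ; touch one ProbeSize block
  //   b      TestMBB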
13679 MachineBasicBlock *TestMBB = MF->CreateMachineBasicBlock(BB: ProbedBB);
13680 MachineBasicBlock *TailMBB = MF->CreateMachineBasicBlock(BB: ProbedBB);
13681 MachineBasicBlock *BlockMBB = MF->CreateMachineBasicBlock(BB: ProbedBB);
13682
13683 MachineFunction::iterator MBBIter = ++MBB->getIterator();
13684 MF->insert(MBBI: MBBIter, MBB: TestMBB);
13685 MF->insert(MBBI: MBBIter, MBB: BlockMBB);
13686 MF->insert(MBBI: MBBIter, MBB: TailMBB);
13687
13688 const TargetRegisterClass *G8RC = &PPC::G8RCRegClass;
13689 const TargetRegisterClass *GPRC = &PPC::GPRCRegClass;
13690
13691 Register DstReg = MI.getOperand(i: 0).getReg();
13692 Register NegSizeReg = MI.getOperand(i: 1).getReg();
13693 Register SPReg = isPPC64 ? PPC::X1 : PPC::R1;
13694 Register FinalStackPtr = MRI.createVirtualRegister(RegClass: isPPC64 ? G8RC : GPRC);
13695 Register FramePointer = MRI.createVirtualRegister(RegClass: isPPC64 ? G8RC : GPRC);
13696 Register ActualNegSizeReg = MRI.createVirtualRegister(RegClass: isPPC64 ? G8RC : GPRC);
13697
13698 // Since the value of NegSizeReg might be realigned during prolog/epilog
13699 // insertion, insert a PREPARE_PROBED_ALLOCA pseudo instruction to get the
13700 // actual FramePointer and NegSize.
13701 unsigned ProbeOpc;
13702 if (!MRI.hasOneNonDBGUse(RegNo: NegSizeReg))
13703 ProbeOpc =
13704 isPPC64 ? PPC::PREPARE_PROBED_ALLOCA_64 : PPC::PREPARE_PROBED_ALLOCA_32;
13705 else
13706 // When NegSizeReg has only one use (this MI, which will be replaced by
13707 // PREPARE_PROBED_ALLOCA), use the NEGSIZE_SAME_REG variant so that
13708 // ActualNegSizeReg and NegSizeReg are allocated to the same physical
13709 // register, avoiding a redundant copy.
13710 ProbeOpc = isPPC64 ? PPC::PREPARE_PROBED_ALLOCA_NEGSIZE_SAME_REG_64
13711 : PPC::PREPARE_PROBED_ALLOCA_NEGSIZE_SAME_REG_32;
13712 BuildMI(BB&: *MBB, I&: {MI}, MIMD: DL, MCID: TII->get(Opcode: ProbeOpc), DestReg: FramePointer)
13713 .addDef(RegNo: ActualNegSizeReg)
13714 .addReg(RegNo: NegSizeReg)
13715 .add(MO: MI.getOperand(i: 2))
13716 .add(MO: MI.getOperand(i: 3));
13717
13718 // Calculate the final stack pointer, which equals SP + ActualNegSize.
13719 BuildMI(BB&: *MBB, I&: {MI}, MIMD: DL, MCID: TII->get(Opcode: isPPC64 ? PPC::ADD8 : PPC::ADD4),
13720 DestReg: FinalStackPtr)
13721 .addReg(RegNo: SPReg)
13722 .addReg(RegNo: ActualNegSizeReg);
13723
13724 // Materialize a scratch register for update.
13725 int64_t NegProbeSize = -(int64_t)ProbeSize;
13726 assert(isInt<32>(NegProbeSize) && "Unhandled probe size!");
13727 Register ScratchReg = MRI.createVirtualRegister(RegClass: isPPC64 ? G8RC : GPRC);
13728 if (!isInt<16>(x: NegProbeSize)) {
13729 Register TempReg = MRI.createVirtualRegister(RegClass: isPPC64 ? G8RC : GPRC);
13730 BuildMI(BB&: *MBB, I&: {MI}, MIMD: DL, MCID: TII->get(Opcode: isPPC64 ? PPC::LIS8 : PPC::LIS), DestReg: TempReg)
13731 .addImm(Val: NegProbeSize >> 16);
13732 BuildMI(BB&: *MBB, I&: {MI}, MIMD: DL, MCID: TII->get(Opcode: isPPC64 ? PPC::ORI8 : PPC::ORI),
13733 DestReg: ScratchReg)
13734 .addReg(RegNo: TempReg)
13735 .addImm(Val: NegProbeSize & 0xFFFF);
13736 } else
13737 BuildMI(BB&: *MBB, I&: {MI}, MIMD: DL, MCID: TII->get(Opcode: isPPC64 ? PPC::LI8 : PPC::LI), DestReg: ScratchReg)
13738 .addImm(Val: NegProbeSize);
13739
13740 {
13741 // Probing leading residual part.
13742 Register Div = MRI.createVirtualRegister(RegClass: isPPC64 ? G8RC : GPRC);
13743 BuildMI(BB&: *MBB, I&: {MI}, MIMD: DL, MCID: TII->get(Opcode: isPPC64 ? PPC::DIVD : PPC::DIVW), DestReg: Div)
13744 .addReg(RegNo: ActualNegSizeReg)
13745 .addReg(RegNo: ScratchReg);
13746 Register Mul = MRI.createVirtualRegister(RegClass: isPPC64 ? G8RC : GPRC);
13747 BuildMI(BB&: *MBB, I&: {MI}, MIMD: DL, MCID: TII->get(Opcode: isPPC64 ? PPC::MULLD : PPC::MULLW), DestReg: Mul)
13748 .addReg(RegNo: Div)
13749 .addReg(RegNo: ScratchReg);
13750 Register NegMod = MRI.createVirtualRegister(RegClass: isPPC64 ? G8RC : GPRC);
13751 BuildMI(BB&: *MBB, I&: {MI}, MIMD: DL, MCID: TII->get(Opcode: isPPC64 ? PPC::SUBF8 : PPC::SUBF), DestReg: NegMod)
13752 .addReg(RegNo: Mul)
13753 .addReg(RegNo: ActualNegSizeReg);
13754 BuildMI(BB&: *MBB, I&: {MI}, MIMD: DL, MCID: TII->get(Opcode: isPPC64 ? PPC::STDUX : PPC::STWUX), DestReg: SPReg)
13755 .addReg(RegNo: FramePointer)
13756 .addReg(RegNo: SPReg)
13757 .addReg(RegNo: NegMod);
13758 }
13759
13760 {
13761 // Remaining part should be multiple of ProbeSize.
13762 Register CmpResult = MRI.createVirtualRegister(RegClass: &PPC::CRRCRegClass);
13763 BuildMI(BB: TestMBB, MIMD: DL, MCID: TII->get(Opcode: isPPC64 ? PPC::CMPD : PPC::CMPW), DestReg: CmpResult)
13764 .addReg(RegNo: SPReg)
13765 .addReg(RegNo: FinalStackPtr);
13766 BuildMI(BB: TestMBB, MIMD: DL, MCID: TII->get(Opcode: PPC::BCC))
13767 .addImm(Val: PPC::PRED_EQ)
13768 .addReg(RegNo: CmpResult)
13769 .addMBB(MBB: TailMBB);
13770 TestMBB->addSuccessor(Succ: BlockMBB);
13771 TestMBB->addSuccessor(Succ: TailMBB);
13772 }
13773
13774 {
13775 // Touch the block.
13776 // |P...|P...|P...
13777 BuildMI(BB: BlockMBB, MIMD: DL, MCID: TII->get(Opcode: isPPC64 ? PPC::STDUX : PPC::STWUX), DestReg: SPReg)
13778 .addReg(RegNo: FramePointer)
13779 .addReg(RegNo: SPReg)
13780 .addReg(RegNo: ScratchReg);
13781 BuildMI(BB: BlockMBB, MIMD: DL, MCID: TII->get(Opcode: PPC::B)).addMBB(MBB: TestMBB);
13782 BlockMBB->addSuccessor(Succ: TestMBB);
13783 }
13784
13785 // The calculation of MaxCallFrameSize is deferred to prolog/epilog insertion;
13786 // use the DYNAREAOFFSET pseudo instruction to get its future result.
13787 Register MaxCallFrameSizeReg =
13788 MRI.createVirtualRegister(RegClass: isPPC64 ? G8RC : GPRC);
13789 BuildMI(BB: TailMBB, MIMD: DL,
13790 MCID: TII->get(Opcode: isPPC64 ? PPC::DYNAREAOFFSET8 : PPC::DYNAREAOFFSET),
13791 DestReg: MaxCallFrameSizeReg)
13792 .add(MO: MI.getOperand(i: 2))
13793 .add(MO: MI.getOperand(i: 3));
13794 BuildMI(BB: TailMBB, MIMD: DL, MCID: TII->get(Opcode: isPPC64 ? PPC::ADD8 : PPC::ADD4), DestReg: DstReg)
13795 .addReg(RegNo: SPReg)
13796 .addReg(RegNo: MaxCallFrameSizeReg);
13797
13798 // Splice instructions after MI to TailMBB.
13799 TailMBB->splice(Where: TailMBB->end(), Other: MBB,
13800 From: std::next(x: MachineBasicBlock::iterator(MI)), To: MBB->end());
13801 TailMBB->transferSuccessorsAndUpdatePHIs(FromMBB: MBB);
13802 MBB->addSuccessor(Succ: TestMBB);
13803
13804 // Delete the pseudo instruction.
13805 MI.eraseFromParent();
13806
13807 ++NumDynamicAllocaProbed;
13808 return TailMBB;
13809}
13810
13811static bool IsSelectCC(MachineInstr &MI) {
13812 switch (MI.getOpcode()) {
13813 case PPC::SELECT_CC_I4:
13814 case PPC::SELECT_CC_I8:
13815 case PPC::SELECT_CC_F4:
13816 case PPC::SELECT_CC_F8:
13817 case PPC::SELECT_CC_F16:
13818 case PPC::SELECT_CC_VRRC:
13819 case PPC::SELECT_CC_VSFRC:
13820 case PPC::SELECT_CC_VSSRC:
13821 case PPC::SELECT_CC_VSRC:
13822 case PPC::SELECT_CC_SPE4:
13823 case PPC::SELECT_CC_SPE:
13824 return true;
13825 default:
13826 return false;
13827 }
13828}
13829
13830static bool IsSelect(MachineInstr &MI) {
13831 switch (MI.getOpcode()) {
13832 case PPC::SELECT_I4:
13833 case PPC::SELECT_I8:
13834 case PPC::SELECT_F4:
13835 case PPC::SELECT_F8:
13836 case PPC::SELECT_F16:
13837 case PPC::SELECT_SPE:
13838 case PPC::SELECT_SPE4:
13839 case PPC::SELECT_VRRC:
13840 case PPC::SELECT_VSFRC:
13841 case PPC::SELECT_VSSRC:
13842 case PPC::SELECT_VSRC:
13843 return true;
13844 default:
13845 return false;
13846 }
13847}
13848
13849MachineBasicBlock *
13850PPCTargetLowering::EmitInstrWithCustomInserter(MachineInstr &MI,
13851 MachineBasicBlock *BB) const {
13852 if (MI.getOpcode() == TargetOpcode::STACKMAP ||
13853 MI.getOpcode() == TargetOpcode::PATCHPOINT) {
13854 if (Subtarget.is64BitELFABI() &&
13855 MI.getOpcode() == TargetOpcode::PATCHPOINT &&
13856 !Subtarget.isUsingPCRelativeCalls()) {
13857 // Call lowering should have added an r2 operand to indicate a dependence
13858 // on the TOC base pointer value. It can't, however, because there is no
13859 // way to mark the dependence as implicit there, and the stackmap code
13860 // would confuse it with a regular operand. Instead, add the dependence
13861 // here.
13862 MI.addOperand(Op: MachineOperand::CreateReg(Reg: PPC::X2, isDef: false, isImp: true));
13863 }
13864
13865 return emitPatchPoint(MI, MBB: BB);
13866 }
13867
13868 if (MI.getOpcode() == PPC::EH_SjLj_SetJmp32 ||
13869 MI.getOpcode() == PPC::EH_SjLj_SetJmp64) {
13870 return emitEHSjLjSetJmp(MI, MBB: BB);
13871 } else if (MI.getOpcode() == PPC::EH_SjLj_LongJmp32 ||
13872 MI.getOpcode() == PPC::EH_SjLj_LongJmp64) {
13873 return emitEHSjLjLongJmp(MI, MBB: BB);
13874 }
13875
13876 const TargetInstrInfo *TII = Subtarget.getInstrInfo();
13877
13878 // To "insert" these instructions we actually have to insert their
13879 // control-flow patterns.
13880 const BasicBlock *LLVM_BB = BB->getBasicBlock();
13881 MachineFunction::iterator It = ++BB->getIterator();
13882
13883 MachineFunction *F = BB->getParent();
13884 MachineRegisterInfo &MRI = F->getRegInfo();
13885
13886 if (Subtarget.hasISEL() &&
13887 (MI.getOpcode() == PPC::SELECT_CC_I4 ||
13888 MI.getOpcode() == PPC::SELECT_CC_I8 ||
13889 MI.getOpcode() == PPC::SELECT_I4 || MI.getOpcode() == PPC::SELECT_I8)) {
13890 SmallVector<MachineOperand, 2> Cond;
13891 if (MI.getOpcode() == PPC::SELECT_CC_I4 ||
13892 MI.getOpcode() == PPC::SELECT_CC_I8)
13893 Cond.push_back(Elt: MI.getOperand(i: 4));
13894 else
13895 Cond.push_back(Elt: MachineOperand::CreateImm(Val: PPC::PRED_BIT_SET));
13896 Cond.push_back(Elt: MI.getOperand(i: 1));
13897
13898 DebugLoc dl = MI.getDebugLoc();
13899 TII->insertSelect(MBB&: *BB, I: MI, DL: dl, DstReg: MI.getOperand(i: 0).getReg(), Cond,
13900 TrueReg: MI.getOperand(i: 2).getReg(), FalseReg: MI.getOperand(i: 3).getReg());
13901 } else if (IsSelectCC(MI) || IsSelect(MI)) {
13902 // The incoming instruction knows the destination vreg to set, the
13903 // condition code register to branch on, the true/false values to
13904 // select between, and a branch opcode to use.
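    // (SELECT_* pseudos carry a single condition bit in operand 1 and expand to
    // BC; SELECT_CC_* pseudos carry a CR register in operand 1 plus a predicate
    // immediate in operand 4 and expand to BCC. Either way the result PHI picks
    // between operands 2 and 3.)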
13905
13906 // thisMBB:
13907 // ...
13908 // TrueVal = ...
13909 // cmpTY ccX, r1, r2
13910 // bCC sinkMBB
13911 // fallthrough --> copy0MBB
13912 MachineBasicBlock *thisMBB = BB;
13913 MachineBasicBlock *copy0MBB = F->CreateMachineBasicBlock(BB: LLVM_BB);
13914 MachineBasicBlock *sinkMBB = F->CreateMachineBasicBlock(BB: LLVM_BB);
13915 DebugLoc dl = MI.getDebugLoc();
13916 F->insert(MBBI: It, MBB: copy0MBB);
13917 F->insert(MBBI: It, MBB: sinkMBB);
13918
13919 if (isPhysRegUsedAfter(Reg: PPC::CARRY, MBI: MI.getIterator())) {
13920 copy0MBB->addLiveIn(PhysReg: PPC::CARRY);
13921 sinkMBB->addLiveIn(PhysReg: PPC::CARRY);
13922 }
13923
13924 // Set the call frame size on entry to the new basic blocks.
13925 // See https://reviews.llvm.org/D156113.
13926 unsigned CallFrameSize = TII->getCallFrameSizeAt(MI);
13927 copy0MBB->setCallFrameSize(CallFrameSize);
13928 sinkMBB->setCallFrameSize(CallFrameSize);
13929
13930 // Transfer the remainder of BB and its successor edges to sinkMBB.
13931 sinkMBB->splice(Where: sinkMBB->begin(), Other: BB,
13932 From: std::next(x: MachineBasicBlock::iterator(MI)), To: BB->end());
13933 sinkMBB->transferSuccessorsAndUpdatePHIs(FromMBB: BB);
13934
13935 // Next, add the true and fallthrough blocks as its successors.
13936 BB->addSuccessor(Succ: copy0MBB);
13937 BB->addSuccessor(Succ: sinkMBB);
13938
13939 if (IsSelect(MI)) {
13940 BuildMI(BB, MIMD: dl, MCID: TII->get(Opcode: PPC::BC))
13941 .addReg(RegNo: MI.getOperand(i: 1).getReg())
13942 .addMBB(MBB: sinkMBB);
13943 } else {
13944 unsigned SelectPred = MI.getOperand(i: 4).getImm();
13945 BuildMI(BB, MIMD: dl, MCID: TII->get(Opcode: PPC::BCC))
13946 .addImm(Val: SelectPred)
13947 .addReg(RegNo: MI.getOperand(i: 1).getReg())
13948 .addMBB(MBB: sinkMBB);
13949 }
13950
13951 // copy0MBB:
13952 // %FalseValue = ...
13953 // # fallthrough to sinkMBB
13954 BB = copy0MBB;
13955
13956 // Update machine-CFG edges
13957 BB->addSuccessor(Succ: sinkMBB);
13958
13959 // sinkMBB:
13960 // %Result = phi [ %FalseValue, copy0MBB ], [ %TrueValue, thisMBB ]
13961 // ...
13962 BB = sinkMBB;
13963 BuildMI(BB&: *BB, I: BB->begin(), MIMD: dl, MCID: TII->get(Opcode: PPC::PHI), DestReg: MI.getOperand(i: 0).getReg())
13964 .addReg(RegNo: MI.getOperand(i: 3).getReg())
13965 .addMBB(MBB: copy0MBB)
13966 .addReg(RegNo: MI.getOperand(i: 2).getReg())
13967 .addMBB(MBB: thisMBB);
13968 } else if (MI.getOpcode() == PPC::ReadTB) {
13969 // To read the 64-bit time-base register on a 32-bit target, we read the
13970 // two halves. Should the counter have wrapped while it was being read, we
13971 // need to try again.
13972 // ...
13973 // readLoop:
13974 // mfspr Rx,TBU # load from TBU
13975 // mfspr Ry,TB # load from TB
13976 // mfspr Rz,TBU # load from TBU
13977 // cmpw crX,Rx,Rz # check if 'old'='new'
13978 // bne readLoop # branch if they're not equal
13979 // ...
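  // (SPR 269 is TBU and SPR 268 is TB, matching the mfspr immediates below.)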
13980
13981 MachineBasicBlock *readMBB = F->CreateMachineBasicBlock(BB: LLVM_BB);
13982 MachineBasicBlock *sinkMBB = F->CreateMachineBasicBlock(BB: LLVM_BB);
13983 DebugLoc dl = MI.getDebugLoc();
13984 F->insert(MBBI: It, MBB: readMBB);
13985 F->insert(MBBI: It, MBB: sinkMBB);
13986
13987 // Transfer the remainder of BB and its successor edges to sinkMBB.
13988 sinkMBB->splice(Where: sinkMBB->begin(), Other: BB,
13989 From: std::next(x: MachineBasicBlock::iterator(MI)), To: BB->end());
13990 sinkMBB->transferSuccessorsAndUpdatePHIs(FromMBB: BB);
13991
13992 BB->addSuccessor(Succ: readMBB);
13993 BB = readMBB;
13994
13995 MachineRegisterInfo &RegInfo = F->getRegInfo();
13996 Register ReadAgainReg = RegInfo.createVirtualRegister(RegClass: &PPC::GPRCRegClass);
13997 Register LoReg = MI.getOperand(i: 0).getReg();
13998 Register HiReg = MI.getOperand(i: 1).getReg();
13999
14000 BuildMI(BB, MIMD: dl, MCID: TII->get(Opcode: PPC::MFSPR), DestReg: HiReg).addImm(Val: 269);
14001 BuildMI(BB, MIMD: dl, MCID: TII->get(Opcode: PPC::MFSPR), DestReg: LoReg).addImm(Val: 268);
14002 BuildMI(BB, MIMD: dl, MCID: TII->get(Opcode: PPC::MFSPR), DestReg: ReadAgainReg).addImm(Val: 269);
14003
14004 Register CmpReg = RegInfo.createVirtualRegister(RegClass: &PPC::CRRCRegClass);
14005
14006 BuildMI(BB, MIMD: dl, MCID: TII->get(Opcode: PPC::CMPW), DestReg: CmpReg)
14007 .addReg(RegNo: HiReg)
14008 .addReg(RegNo: ReadAgainReg);
14009 BuildMI(BB, MIMD: dl, MCID: TII->get(Opcode: PPC::BCC))
14010 .addImm(Val: PPC::PRED_NE)
14011 .addReg(RegNo: CmpReg)
14012 .addMBB(MBB: readMBB);
14013
14014 BB->addSuccessor(Succ: readMBB);
14015 BB->addSuccessor(Succ: sinkMBB);
14016 } else if (MI.getOpcode() == PPC::ATOMIC_LOAD_ADD_I8)
14017 BB = EmitPartwordAtomicBinary(MI, BB, is8bit: true, BinOpcode: PPC::ADD4);
14018 else if (MI.getOpcode() == PPC::ATOMIC_LOAD_ADD_I16)
14019 BB = EmitPartwordAtomicBinary(MI, BB, is8bit: false, BinOpcode: PPC::ADD4);
14020 else if (MI.getOpcode() == PPC::ATOMIC_LOAD_ADD_I32)
14021 BB = EmitAtomicBinary(MI, BB, AtomicSize: 4, BinOpcode: PPC::ADD4);
14022 else if (MI.getOpcode() == PPC::ATOMIC_LOAD_ADD_I64)
14023 BB = EmitAtomicBinary(MI, BB, AtomicSize: 8, BinOpcode: PPC::ADD8);
14024
14025 else if (MI.getOpcode() == PPC::ATOMIC_LOAD_AND_I8)
14026 BB = EmitPartwordAtomicBinary(MI, BB, is8bit: true, BinOpcode: PPC::AND);
14027 else if (MI.getOpcode() == PPC::ATOMIC_LOAD_AND_I16)
14028 BB = EmitPartwordAtomicBinary(MI, BB, is8bit: false, BinOpcode: PPC::AND);
14029 else if (MI.getOpcode() == PPC::ATOMIC_LOAD_AND_I32)
14030 BB = EmitAtomicBinary(MI, BB, AtomicSize: 4, BinOpcode: PPC::AND);
14031 else if (MI.getOpcode() == PPC::ATOMIC_LOAD_AND_I64)
14032 BB = EmitAtomicBinary(MI, BB, AtomicSize: 8, BinOpcode: PPC::AND8);
14033
14034 else if (MI.getOpcode() == PPC::ATOMIC_LOAD_OR_I8)
14035 BB = EmitPartwordAtomicBinary(MI, BB, is8bit: true, BinOpcode: PPC::OR);
14036 else if (MI.getOpcode() == PPC::ATOMIC_LOAD_OR_I16)
14037 BB = EmitPartwordAtomicBinary(MI, BB, is8bit: false, BinOpcode: PPC::OR);
14038 else if (MI.getOpcode() == PPC::ATOMIC_LOAD_OR_I32)
14039 BB = EmitAtomicBinary(MI, BB, AtomicSize: 4, BinOpcode: PPC::OR);
14040 else if (MI.getOpcode() == PPC::ATOMIC_LOAD_OR_I64)
14041 BB = EmitAtomicBinary(MI, BB, AtomicSize: 8, BinOpcode: PPC::OR8);
14042
14043 else if (MI.getOpcode() == PPC::ATOMIC_LOAD_XOR_I8)
14044 BB = EmitPartwordAtomicBinary(MI, BB, is8bit: true, BinOpcode: PPC::XOR);
14045 else if (MI.getOpcode() == PPC::ATOMIC_LOAD_XOR_I16)
14046 BB = EmitPartwordAtomicBinary(MI, BB, is8bit: false, BinOpcode: PPC::XOR);
14047 else if (MI.getOpcode() == PPC::ATOMIC_LOAD_XOR_I32)
14048 BB = EmitAtomicBinary(MI, BB, AtomicSize: 4, BinOpcode: PPC::XOR);
14049 else if (MI.getOpcode() == PPC::ATOMIC_LOAD_XOR_I64)
14050 BB = EmitAtomicBinary(MI, BB, AtomicSize: 8, BinOpcode: PPC::XOR8);
14051
14052 else if (MI.getOpcode() == PPC::ATOMIC_LOAD_NAND_I8)
14053 BB = EmitPartwordAtomicBinary(MI, BB, is8bit: true, BinOpcode: PPC::NAND);
14054 else if (MI.getOpcode() == PPC::ATOMIC_LOAD_NAND_I16)
14055 BB = EmitPartwordAtomicBinary(MI, BB, is8bit: false, BinOpcode: PPC::NAND);
14056 else if (MI.getOpcode() == PPC::ATOMIC_LOAD_NAND_I32)
14057 BB = EmitAtomicBinary(MI, BB, AtomicSize: 4, BinOpcode: PPC::NAND);
14058 else if (MI.getOpcode() == PPC::ATOMIC_LOAD_NAND_I64)
14059 BB = EmitAtomicBinary(MI, BB, AtomicSize: 8, BinOpcode: PPC::NAND8);
14060
14061 else if (MI.getOpcode() == PPC::ATOMIC_LOAD_SUB_I8)
14062 BB = EmitPartwordAtomicBinary(MI, BB, is8bit: true, BinOpcode: PPC::SUBF);
14063 else if (MI.getOpcode() == PPC::ATOMIC_LOAD_SUB_I16)
14064 BB = EmitPartwordAtomicBinary(MI, BB, is8bit: false, BinOpcode: PPC::SUBF);
14065 else if (MI.getOpcode() == PPC::ATOMIC_LOAD_SUB_I32)
14066 BB = EmitAtomicBinary(MI, BB, AtomicSize: 4, BinOpcode: PPC::SUBF);
14067 else if (MI.getOpcode() == PPC::ATOMIC_LOAD_SUB_I64)
14068 BB = EmitAtomicBinary(MI, BB, AtomicSize: 8, BinOpcode: PPC::SUBF8);
14069
14070 else if (MI.getOpcode() == PPC::ATOMIC_LOAD_MIN_I8)
14071 BB = EmitPartwordAtomicBinary(MI, BB, is8bit: true, BinOpcode: 0, CmpOpcode: PPC::CMPW, CmpPred: PPC::PRED_LT);
14072 else if (MI.getOpcode() == PPC::ATOMIC_LOAD_MIN_I16)
14073 BB = EmitPartwordAtomicBinary(MI, BB, is8bit: false, BinOpcode: 0, CmpOpcode: PPC::CMPW, CmpPred: PPC::PRED_LT);
14074 else if (MI.getOpcode() == PPC::ATOMIC_LOAD_MIN_I32)
14075 BB = EmitAtomicBinary(MI, BB, AtomicSize: 4, BinOpcode: 0, CmpOpcode: PPC::CMPW, CmpPred: PPC::PRED_LT);
14076 else if (MI.getOpcode() == PPC::ATOMIC_LOAD_MIN_I64)
14077 BB = EmitAtomicBinary(MI, BB, AtomicSize: 8, BinOpcode: 0, CmpOpcode: PPC::CMPD, CmpPred: PPC::PRED_LT);
14078
14079 else if (MI.getOpcode() == PPC::ATOMIC_LOAD_MAX_I8)
14080 BB = EmitPartwordAtomicBinary(MI, BB, is8bit: true, BinOpcode: 0, CmpOpcode: PPC::CMPW, CmpPred: PPC::PRED_GT);
14081 else if (MI.getOpcode() == PPC::ATOMIC_LOAD_MAX_I16)
14082 BB = EmitPartwordAtomicBinary(MI, BB, is8bit: false, BinOpcode: 0, CmpOpcode: PPC::CMPW, CmpPred: PPC::PRED_GT);
14083 else if (MI.getOpcode() == PPC::ATOMIC_LOAD_MAX_I32)
14084 BB = EmitAtomicBinary(MI, BB, AtomicSize: 4, BinOpcode: 0, CmpOpcode: PPC::CMPW, CmpPred: PPC::PRED_GT);
14085 else if (MI.getOpcode() == PPC::ATOMIC_LOAD_MAX_I64)
14086 BB = EmitAtomicBinary(MI, BB, AtomicSize: 8, BinOpcode: 0, CmpOpcode: PPC::CMPD, CmpPred: PPC::PRED_GT);
14087
14088 else if (MI.getOpcode() == PPC::ATOMIC_LOAD_UMIN_I8)
14089 BB = EmitPartwordAtomicBinary(MI, BB, is8bit: true, BinOpcode: 0, CmpOpcode: PPC::CMPLW, CmpPred: PPC::PRED_LT);
14090 else if (MI.getOpcode() == PPC::ATOMIC_LOAD_UMIN_I16)
14091 BB = EmitPartwordAtomicBinary(MI, BB, is8bit: false, BinOpcode: 0, CmpOpcode: PPC::CMPLW, CmpPred: PPC::PRED_LT);
14092 else if (MI.getOpcode() == PPC::ATOMIC_LOAD_UMIN_I32)
14093 BB = EmitAtomicBinary(MI, BB, AtomicSize: 4, BinOpcode: 0, CmpOpcode: PPC::CMPLW, CmpPred: PPC::PRED_LT);
14094 else if (MI.getOpcode() == PPC::ATOMIC_LOAD_UMIN_I64)
14095 BB = EmitAtomicBinary(MI, BB, AtomicSize: 8, BinOpcode: 0, CmpOpcode: PPC::CMPLD, CmpPred: PPC::PRED_LT);
14096
14097 else if (MI.getOpcode() == PPC::ATOMIC_LOAD_UMAX_I8)
14098 BB = EmitPartwordAtomicBinary(MI, BB, is8bit: true, BinOpcode: 0, CmpOpcode: PPC::CMPLW, CmpPred: PPC::PRED_GT);
14099 else if (MI.getOpcode() == PPC::ATOMIC_LOAD_UMAX_I16)
14100 BB = EmitPartwordAtomicBinary(MI, BB, is8bit: false, BinOpcode: 0, CmpOpcode: PPC::CMPLW, CmpPred: PPC::PRED_GT);
14101 else if (MI.getOpcode() == PPC::ATOMIC_LOAD_UMAX_I32)
14102 BB = EmitAtomicBinary(MI, BB, AtomicSize: 4, BinOpcode: 0, CmpOpcode: PPC::CMPLW, CmpPred: PPC::PRED_GT);
14103 else if (MI.getOpcode() == PPC::ATOMIC_LOAD_UMAX_I64)
14104 BB = EmitAtomicBinary(MI, BB, AtomicSize: 8, BinOpcode: 0, CmpOpcode: PPC::CMPLD, CmpPred: PPC::PRED_GT);
14105
14106 else if (MI.getOpcode() == PPC::ATOMIC_SWAP_I8)
14107 BB = EmitPartwordAtomicBinary(MI, BB, is8bit: true, BinOpcode: 0);
14108 else if (MI.getOpcode() == PPC::ATOMIC_SWAP_I16)
14109 BB = EmitPartwordAtomicBinary(MI, BB, is8bit: false, BinOpcode: 0);
14110 else if (MI.getOpcode() == PPC::ATOMIC_SWAP_I32)
14111 BB = EmitAtomicBinary(MI, BB, AtomicSize: 4, BinOpcode: 0);
14112 else if (MI.getOpcode() == PPC::ATOMIC_SWAP_I64)
14113 BB = EmitAtomicBinary(MI, BB, AtomicSize: 8, BinOpcode: 0);
14114 else if (MI.getOpcode() == PPC::ATOMIC_CMP_SWAP_I32 ||
14115 MI.getOpcode() == PPC::ATOMIC_CMP_SWAP_I64 ||
14116 (Subtarget.hasPartwordAtomics() &&
14117 MI.getOpcode() == PPC::ATOMIC_CMP_SWAP_I8) ||
14118 (Subtarget.hasPartwordAtomics() &&
14119 MI.getOpcode() == PPC::ATOMIC_CMP_SWAP_I16)) {
14120 bool is64bit = MI.getOpcode() == PPC::ATOMIC_CMP_SWAP_I64;
14121
14122 auto LoadMnemonic = PPC::LDARX;
14123 auto StoreMnemonic = PPC::STDCX;
14124 switch (MI.getOpcode()) {
14125 default:
14126 llvm_unreachable("Compare and swap of unknown size");
14127 case PPC::ATOMIC_CMP_SWAP_I8:
14128 LoadMnemonic = PPC::LBARX;
14129 StoreMnemonic = PPC::STBCX;
14130 assert(Subtarget.hasPartwordAtomics() && "No support for partword atomics.");
14131 break;
14132 case PPC::ATOMIC_CMP_SWAP_I16:
14133 LoadMnemonic = PPC::LHARX;
14134 StoreMnemonic = PPC::STHCX;
14135 assert(Subtarget.hasPartwordAtomics() && "No support for partword atomics.");
14136 break;
14137 case PPC::ATOMIC_CMP_SWAP_I32:
14138 LoadMnemonic = PPC::LWARX;
14139 StoreMnemonic = PPC::STWCX;
14140 break;
14141 case PPC::ATOMIC_CMP_SWAP_I64:
14142 LoadMnemonic = PPC::LDARX;
14143 StoreMnemonic = PPC::STDCX;
14144 break;
14145 }
14146 MachineRegisterInfo &RegInfo = F->getRegInfo();
14147 Register dest = MI.getOperand(i: 0).getReg();
14148 Register ptrA = MI.getOperand(i: 1).getReg();
14149 Register ptrB = MI.getOperand(i: 2).getReg();
14150 Register CrReg = RegInfo.createVirtualRegister(RegClass: &PPC::CRRCRegClass);
14151 Register oldval = MI.getOperand(i: 3).getReg();
14152 Register newval = MI.getOperand(i: 4).getReg();
14153 DebugLoc dl = MI.getDebugLoc();
14154
14155 MachineBasicBlock *loop1MBB = F->CreateMachineBasicBlock(BB: LLVM_BB);
14156 MachineBasicBlock *loop2MBB = F->CreateMachineBasicBlock(BB: LLVM_BB);
14157 MachineBasicBlock *exitMBB = F->CreateMachineBasicBlock(BB: LLVM_BB);
14158 F->insert(MBBI: It, MBB: loop1MBB);
14159 F->insert(MBBI: It, MBB: loop2MBB);
14160 F->insert(MBBI: It, MBB: exitMBB);
14161 exitMBB->splice(Where: exitMBB->begin(), Other: BB,
14162 From: std::next(x: MachineBasicBlock::iterator(MI)), To: BB->end());
14163 exitMBB->transferSuccessorsAndUpdatePHIs(FromMBB: BB);
14164
14165 // thisMBB:
14166 // ...
14167 // fallthrough --> loopMBB
14168 BB->addSuccessor(Succ: loop1MBB);
14169
14170 // loop1MBB:
14171 // l[bhwd]arx dest, ptr
14172 // cmp[wd] dest, oldval
14173 // bne- exitBB
14174 // loop2MBB:
14175 // st[bhwd]cx. newval, ptr
14176 // bne- loopMBB
14177 // b exitBB
14178 // exitBB:
14179 BB = loop1MBB;
14180 BuildMI(BB, MIMD: dl, MCID: TII->get(Opcode: LoadMnemonic), DestReg: dest).addReg(RegNo: ptrA).addReg(RegNo: ptrB);
14181 BuildMI(BB, MIMD: dl, MCID: TII->get(Opcode: is64bit ? PPC::CMPD : PPC::CMPW), DestReg: CrReg)
14182 .addReg(RegNo: dest)
14183 .addReg(RegNo: oldval);
14184 BuildMI(BB, MIMD: dl, MCID: TII->get(Opcode: PPC::BCC))
14185 .addImm(Val: PPC::PRED_NE_MINUS)
14186 .addReg(RegNo: CrReg)
14187 .addMBB(MBB: exitMBB);
14188 BB->addSuccessor(Succ: loop2MBB);
14189 BB->addSuccessor(Succ: exitMBB);
14190
14191 BB = loop2MBB;
14192 BuildMI(BB, MIMD: dl, MCID: TII->get(Opcode: StoreMnemonic))
14193 .addReg(RegNo: newval)
14194 .addReg(RegNo: ptrA)
14195 .addReg(RegNo: ptrB);
14196 BuildMI(BB, MIMD: dl, MCID: TII->get(Opcode: PPC::BCC))
14197 .addImm(Val: PPC::PRED_NE_MINUS)
14198 .addReg(RegNo: PPC::CR0)
14199 .addMBB(MBB: loop1MBB);
14200 BuildMI(BB, MIMD: dl, MCID: TII->get(Opcode: PPC::B)).addMBB(MBB: exitMBB);
14201 BB->addSuccessor(Succ: loop1MBB);
14202 BB->addSuccessor(Succ: exitMBB);
14203
14204 // exitMBB:
14205 // ...
14206 BB = exitMBB;
14207 } else if (MI.getOpcode() == PPC::ATOMIC_CMP_SWAP_I8 ||
14208 MI.getOpcode() == PPC::ATOMIC_CMP_SWAP_I16) {
14209 // We must use 64-bit registers for addresses when targeting 64-bit,
14210 // since we're actually doing arithmetic on them. Other registers
14211 // can be 32-bit.
14212 bool is64bit = Subtarget.isPPC64();
14213 bool isLittleEndian = Subtarget.isLittleEndian();
14214 bool is8bit = MI.getOpcode() == PPC::ATOMIC_CMP_SWAP_I8;
14215
14216 Register dest = MI.getOperand(i: 0).getReg();
14217 Register ptrA = MI.getOperand(i: 1).getReg();
14218 Register ptrB = MI.getOperand(i: 2).getReg();
14219 Register oldval = MI.getOperand(i: 3).getReg();
14220 Register newval = MI.getOperand(i: 4).getReg();
14221 DebugLoc dl = MI.getDebugLoc();
14222
14223 MachineBasicBlock *loop1MBB = F->CreateMachineBasicBlock(BB: LLVM_BB);
14224 MachineBasicBlock *loop2MBB = F->CreateMachineBasicBlock(BB: LLVM_BB);
14225 MachineBasicBlock *exitMBB = F->CreateMachineBasicBlock(BB: LLVM_BB);
14226 F->insert(MBBI: It, MBB: loop1MBB);
14227 F->insert(MBBI: It, MBB: loop2MBB);
14228 F->insert(MBBI: It, MBB: exitMBB);
14229 exitMBB->splice(Where: exitMBB->begin(), Other: BB,
14230 From: std::next(x: MachineBasicBlock::iterator(MI)), To: BB->end());
14231 exitMBB->transferSuccessorsAndUpdatePHIs(FromMBB: BB);
14232
14233 MachineRegisterInfo &RegInfo = F->getRegInfo();
14234 const TargetRegisterClass *RC =
14235 is64bit ? &PPC::G8RCRegClass : &PPC::GPRCRegClass;
14236 const TargetRegisterClass *GPRC = &PPC::GPRCRegClass;
14237
14238 Register PtrReg = RegInfo.createVirtualRegister(RegClass: RC);
14239 Register Shift1Reg = RegInfo.createVirtualRegister(RegClass: GPRC);
14240 Register ShiftReg =
14241 isLittleEndian ? Shift1Reg : RegInfo.createVirtualRegister(RegClass: GPRC);
14242 Register NewVal2Reg = RegInfo.createVirtualRegister(RegClass: GPRC);
14243 Register NewVal3Reg = RegInfo.createVirtualRegister(RegClass: GPRC);
14244 Register OldVal2Reg = RegInfo.createVirtualRegister(RegClass: GPRC);
14245 Register OldVal3Reg = RegInfo.createVirtualRegister(RegClass: GPRC);
14246 Register MaskReg = RegInfo.createVirtualRegister(RegClass: GPRC);
14247 Register Mask2Reg = RegInfo.createVirtualRegister(RegClass: GPRC);
14248 Register Mask3Reg = RegInfo.createVirtualRegister(RegClass: GPRC);
14249 Register Tmp2Reg = RegInfo.createVirtualRegister(RegClass: GPRC);
14250 Register Tmp4Reg = RegInfo.createVirtualRegister(RegClass: GPRC);
14251 Register TmpDestReg = RegInfo.createVirtualRegister(RegClass: GPRC);
14252 Register Ptr1Reg;
14253 Register TmpReg = RegInfo.createVirtualRegister(RegClass: GPRC);
14254 Register ZeroReg = is64bit ? PPC::ZERO8 : PPC::ZERO;
14255 Register CrReg = RegInfo.createVirtualRegister(RegClass: &PPC::CRRCRegClass);
14256 // thisMBB:
14257 // ...
14258    // fallthrough --> loop1MBB
14259 BB->addSuccessor(Succ: loop1MBB);
14260
14261 // The 4-byte load must be aligned, while a char or short may be
14262 // anywhere in the word. Hence all this nasty bookkeeping code.
14263 // add ptr1, ptrA, ptrB [copy if ptrA==0]
14264 // rlwinm shift1, ptr1, 3, 27, 28 [3, 27, 27]
14265 // xori shift, shift1, 24 [16]
14266 // rlwinm ptr, ptr1, 0, 0, 29
14267 // slw newval2, newval, shift
14268    //   slw oldval2, oldval, shift
14269 // li mask2, 255 [li mask3, 0; ori mask2, mask3, 65535]
14270 // slw mask, mask2, shift
14271 // and newval3, newval2, mask
14272 // and oldval3, oldval2, mask
14273 // loop1MBB:
14274 // lwarx tmpDest, ptr
14275 // and tmp, tmpDest, mask
14276 // cmpw tmp, oldval3
14277 // bne- exitBB
14278 // loop2MBB:
14279 // andc tmp2, tmpDest, mask
14280 // or tmp4, tmp2, newval3
14281 // stwcx. tmp4, ptr
14282 // bne- loop1MBB
14283 // b exitBB
14284 // exitBB:
14285 // srw dest, tmpDest, shift
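    // Worked example (illustration only): for an 8-bit operation on the byte
    // at offset 1 within its aligned word, shift1 = 8; on little-endian
    // targets shift = shift1 = 8 (the byte occupies bits 8..15 of the loaded
    // word), while on big-endian targets shift = 8 ^ 24 = 16 (bits 16..23).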
14286 if (ptrA != ZeroReg) {
14287 Ptr1Reg = RegInfo.createVirtualRegister(RegClass: RC);
14288 BuildMI(BB, MIMD: dl, MCID: TII->get(Opcode: is64bit ? PPC::ADD8 : PPC::ADD4), DestReg: Ptr1Reg)
14289 .addReg(RegNo: ptrA)
14290 .addReg(RegNo: ptrB);
14291 } else {
14292 Ptr1Reg = ptrB;
14293 }
14294
14295    // We need to use the 32-bit subregister to avoid a register class mismatch
14296    // in 64-bit mode.
14297 BuildMI(BB, MIMD: dl, MCID: TII->get(Opcode: PPC::RLWINM), DestReg: Shift1Reg)
14298 .addReg(RegNo: Ptr1Reg, Flags: {}, SubReg: is64bit ? PPC::sub_32 : 0)
14299 .addImm(Val: 3)
14300 .addImm(Val: 27)
14301 .addImm(Val: is8bit ? 28 : 27);
14302 if (!isLittleEndian)
14303 BuildMI(BB, MIMD: dl, MCID: TII->get(Opcode: PPC::XORI), DestReg: ShiftReg)
14304 .addReg(RegNo: Shift1Reg)
14305 .addImm(Val: is8bit ? 24 : 16);
14306 if (is64bit)
14307 BuildMI(BB, MIMD: dl, MCID: TII->get(Opcode: PPC::RLDICR), DestReg: PtrReg)
14308 .addReg(RegNo: Ptr1Reg)
14309 .addImm(Val: 0)
14310 .addImm(Val: 61);
14311 else
14312 BuildMI(BB, MIMD: dl, MCID: TII->get(Opcode: PPC::RLWINM), DestReg: PtrReg)
14313 .addReg(RegNo: Ptr1Reg)
14314 .addImm(Val: 0)
14315 .addImm(Val: 0)
14316 .addImm(Val: 29);
14317 BuildMI(BB, MIMD: dl, MCID: TII->get(Opcode: PPC::SLW), DestReg: NewVal2Reg)
14318 .addReg(RegNo: newval)
14319 .addReg(RegNo: ShiftReg);
14320 BuildMI(BB, MIMD: dl, MCID: TII->get(Opcode: PPC::SLW), DestReg: OldVal2Reg)
14321 .addReg(RegNo: oldval)
14322 .addReg(RegNo: ShiftReg);
14323 if (is8bit)
14324 BuildMI(BB, MIMD: dl, MCID: TII->get(Opcode: PPC::LI), DestReg: Mask2Reg).addImm(Val: 255);
14325 else {
14326 BuildMI(BB, MIMD: dl, MCID: TII->get(Opcode: PPC::LI), DestReg: Mask3Reg).addImm(Val: 0);
14327 BuildMI(BB, MIMD: dl, MCID: TII->get(Opcode: PPC::ORI), DestReg: Mask2Reg)
14328 .addReg(RegNo: Mask3Reg)
14329 .addImm(Val: 65535);
14330 }
14331 BuildMI(BB, MIMD: dl, MCID: TII->get(Opcode: PPC::SLW), DestReg: MaskReg)
14332 .addReg(RegNo: Mask2Reg)
14333 .addReg(RegNo: ShiftReg);
14334 BuildMI(BB, MIMD: dl, MCID: TII->get(Opcode: PPC::AND), DestReg: NewVal3Reg)
14335 .addReg(RegNo: NewVal2Reg)
14336 .addReg(RegNo: MaskReg);
14337 BuildMI(BB, MIMD: dl, MCID: TII->get(Opcode: PPC::AND), DestReg: OldVal3Reg)
14338 .addReg(RegNo: OldVal2Reg)
14339 .addReg(RegNo: MaskReg);
14340
14341 BB = loop1MBB;
14342 BuildMI(BB, MIMD: dl, MCID: TII->get(Opcode: PPC::LWARX), DestReg: TmpDestReg)
14343 .addReg(RegNo: ZeroReg)
14344 .addReg(RegNo: PtrReg);
14345 BuildMI(BB, MIMD: dl, MCID: TII->get(Opcode: PPC::AND), DestReg: TmpReg)
14346 .addReg(RegNo: TmpDestReg)
14347 .addReg(RegNo: MaskReg);
14348 BuildMI(BB, MIMD: dl, MCID: TII->get(Opcode: PPC::CMPW), DestReg: CrReg)
14349 .addReg(RegNo: TmpReg)
14350 .addReg(RegNo: OldVal3Reg);
14351 BuildMI(BB, MIMD: dl, MCID: TII->get(Opcode: PPC::BCC))
14352 .addImm(Val: PPC::PRED_NE)
14353 .addReg(RegNo: CrReg)
14354 .addMBB(MBB: exitMBB);
14355 BB->addSuccessor(Succ: loop2MBB);
14356 BB->addSuccessor(Succ: exitMBB);
14357
14358 BB = loop2MBB;
14359 BuildMI(BB, MIMD: dl, MCID: TII->get(Opcode: PPC::ANDC), DestReg: Tmp2Reg)
14360 .addReg(RegNo: TmpDestReg)
14361 .addReg(RegNo: MaskReg);
14362 BuildMI(BB, MIMD: dl, MCID: TII->get(Opcode: PPC::OR), DestReg: Tmp4Reg)
14363 .addReg(RegNo: Tmp2Reg)
14364 .addReg(RegNo: NewVal3Reg);
14365 BuildMI(BB, MIMD: dl, MCID: TII->get(Opcode: PPC::STWCX))
14366 .addReg(RegNo: Tmp4Reg)
14367 .addReg(RegNo: ZeroReg)
14368 .addReg(RegNo: PtrReg);
14369 BuildMI(BB, MIMD: dl, MCID: TII->get(Opcode: PPC::BCC))
14370 .addImm(Val: PPC::PRED_NE)
14371 .addReg(RegNo: PPC::CR0)
14372 .addMBB(MBB: loop1MBB);
14373 BuildMI(BB, MIMD: dl, MCID: TII->get(Opcode: PPC::B)).addMBB(MBB: exitMBB);
14374 BB->addSuccessor(Succ: loop1MBB);
14375 BB->addSuccessor(Succ: exitMBB);
14376
14377 // exitMBB:
14378 // ...
14379 BB = exitMBB;
14380 BuildMI(BB&: *BB, I: BB->begin(), MIMD: dl, MCID: TII->get(Opcode: PPC::SRW), DestReg: dest)
14381 .addReg(RegNo: TmpReg)
14382 .addReg(RegNo: ShiftReg);
14383 } else if (MI.getOpcode() == PPC::FADDrtz) {
14384 // This pseudo performs an FADD with rounding mode temporarily forced
14385 // to round-to-zero. We emit this via custom inserter since the FPSCR
14386 // is not modeled at the SelectionDAG level.
14387 Register Dest = MI.getOperand(i: 0).getReg();
14388 Register Src1 = MI.getOperand(i: 1).getReg();
14389 Register Src2 = MI.getOperand(i: 2).getReg();
14390 DebugLoc dl = MI.getDebugLoc();
14391
14392 MachineRegisterInfo &RegInfo = F->getRegInfo();
14393 Register MFFSReg = RegInfo.createVirtualRegister(RegClass: &PPC::F8RCRegClass);
14394
14395 // Save FPSCR value.
14396 BuildMI(BB&: *BB, I&: MI, MIMD: dl, MCID: TII->get(Opcode: PPC::MFFS), DestReg: MFFSReg);
14397
14398 // Set rounding mode to round-to-zero.
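    // (MTFSB1 31 sets FPSCR bit 63 and MTFSB0 30 clears bit 62, leaving the
    // RN field, bits 62:63, equal to 0b01, i.e. round toward zero.)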
14399 BuildMI(BB&: *BB, I&: MI, MIMD: dl, MCID: TII->get(Opcode: PPC::MTFSB1))
14400 .addImm(Val: 31)
14401 .addReg(RegNo: PPC::RM, Flags: RegState::ImplicitDefine);
14402
14403 BuildMI(BB&: *BB, I&: MI, MIMD: dl, MCID: TII->get(Opcode: PPC::MTFSB0))
14404 .addImm(Val: 30)
14405 .addReg(RegNo: PPC::RM, Flags: RegState::ImplicitDefine);
14406
14407 // Perform addition.
14408 auto MIB = BuildMI(BB&: *BB, I&: MI, MIMD: dl, MCID: TII->get(Opcode: PPC::FADD), DestReg: Dest)
14409 .addReg(RegNo: Src1)
14410 .addReg(RegNo: Src2);
14411 if (MI.getFlag(Flag: MachineInstr::NoFPExcept))
14412 MIB.setMIFlag(MachineInstr::NoFPExcept);
14413
14414 // Restore FPSCR value.
14415 BuildMI(BB&: *BB, I&: MI, MIMD: dl, MCID: TII->get(Opcode: PPC::MTFSFb)).addImm(Val: 1).addReg(RegNo: MFFSReg);
14416 } else if (MI.getOpcode() == PPC::ANDI_rec_1_EQ_BIT ||
14417 MI.getOpcode() == PPC::ANDI_rec_1_GT_BIT ||
14418 MI.getOpcode() == PPC::ANDI_rec_1_EQ_BIT8 ||
14419 MI.getOpcode() == PPC::ANDI_rec_1_GT_BIT8) {
14420 unsigned Opcode = (MI.getOpcode() == PPC::ANDI_rec_1_EQ_BIT8 ||
14421 MI.getOpcode() == PPC::ANDI_rec_1_GT_BIT8)
14422 ? PPC::ANDI8_rec
14423 : PPC::ANDI_rec;
14424 bool IsEQ = (MI.getOpcode() == PPC::ANDI_rec_1_EQ_BIT ||
14425 MI.getOpcode() == PPC::ANDI_rec_1_EQ_BIT8);
14426
14427 MachineRegisterInfo &RegInfo = F->getRegInfo();
14428 Register Dest = RegInfo.createVirtualRegister(
14429 RegClass: Opcode == PPC::ANDI_rec ? &PPC::GPRCRegClass : &PPC::G8RCRegClass);
14430
14431 DebugLoc Dl = MI.getDebugLoc();
14432 BuildMI(BB&: *BB, I&: MI, MIMD: Dl, MCID: TII->get(Opcode), DestReg: Dest)
14433 .addReg(RegNo: MI.getOperand(i: 1).getReg())
14434 .addImm(Val: 1);
14435 BuildMI(BB&: *BB, I&: MI, MIMD: Dl, MCID: TII->get(Opcode: TargetOpcode::COPY),
14436 DestReg: MI.getOperand(i: 0).getReg())
14437 .addReg(RegNo: IsEQ ? PPC::CR0EQ : PPC::CR0GT);
14438 } else if (MI.getOpcode() == PPC::TCHECK_RET) {
14439 DebugLoc Dl = MI.getDebugLoc();
14440 MachineRegisterInfo &RegInfo = F->getRegInfo();
14441 Register CRReg = RegInfo.createVirtualRegister(RegClass: &PPC::CRRCRegClass);
14442 BuildMI(BB&: *BB, I&: MI, MIMD: Dl, MCID: TII->get(Opcode: PPC::TCHECK), DestReg: CRReg);
14443 BuildMI(BB&: *BB, I&: MI, MIMD: Dl, MCID: TII->get(Opcode: TargetOpcode::COPY),
14444 DestReg: MI.getOperand(i: 0).getReg())
14445 .addReg(RegNo: CRReg);
14446 } else if (MI.getOpcode() == PPC::TBEGIN_RET) {
14447 DebugLoc Dl = MI.getDebugLoc();
14448 unsigned Imm = MI.getOperand(i: 1).getImm();
14449 BuildMI(BB&: *BB, I&: MI, MIMD: Dl, MCID: TII->get(Opcode: PPC::TBEGIN)).addImm(Val: Imm);
14450 BuildMI(BB&: *BB, I&: MI, MIMD: Dl, MCID: TII->get(Opcode: TargetOpcode::COPY),
14451 DestReg: MI.getOperand(i: 0).getReg())
14452 .addReg(RegNo: PPC::CR0EQ);
14453 } else if (MI.getOpcode() == PPC::SETRNDi) {
14454 DebugLoc dl = MI.getDebugLoc();
14455 Register OldFPSCRReg = MI.getOperand(i: 0).getReg();
14456
14457 // Save FPSCR value.
14458 if (MRI.use_empty(RegNo: OldFPSCRReg))
14459 BuildMI(BB&: *BB, I&: MI, MIMD: dl, MCID: TII->get(Opcode: TargetOpcode::IMPLICIT_DEF), DestReg: OldFPSCRReg);
14460 else
14461 BuildMI(BB&: *BB, I&: MI, MIMD: dl, MCID: TII->get(Opcode: PPC::MFFS), DestReg: OldFPSCRReg);
14462
14463    // The floating-point rounding mode is in bits 62:63 of the FPSCR and has
14464    // the following settings:
14465 // 00 Round to nearest
14466 // 01 Round to 0
14467 // 10 Round to +inf
14468 // 11 Round to -inf
14469
14470    // When the operand is an immediate, use its two least significant bits to
14471    // set bits 62:63 of the FPSCR.
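    // For example, an immediate of 2 (round to +inf) emits mtfsb0 31 followed
    // by mtfsb1 30, leaving FPSCR bits 62:63 equal to 0b10.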
14472 unsigned Mode = MI.getOperand(i: 1).getImm();
14473 BuildMI(BB&: *BB, I&: MI, MIMD: dl, MCID: TII->get(Opcode: (Mode & 1) ? PPC::MTFSB1 : PPC::MTFSB0))
14474 .addImm(Val: 31)
14475 .addReg(RegNo: PPC::RM, Flags: RegState::ImplicitDefine);
14476
14477 BuildMI(BB&: *BB, I&: MI, MIMD: dl, MCID: TII->get(Opcode: (Mode & 2) ? PPC::MTFSB1 : PPC::MTFSB0))
14478 .addImm(Val: 30)
14479 .addReg(RegNo: PPC::RM, Flags: RegState::ImplicitDefine);
14480 } else if (MI.getOpcode() == PPC::SETRND) {
14481 DebugLoc dl = MI.getDebugLoc();
14482
14483    // Copy a register from F8RCRegClass::SrcReg to G8RCRegClass::DestReg, or
14484    // from G8RCRegClass::SrcReg to F8RCRegClass::DestReg. If the target doesn't
14485    // have DirectMove, we go through the stack instead, because the target
14486    // lacks instructions such as mtvsrd or mfvsrd to do the conversion
14487    // directly.
14488 auto copyRegFromG8RCOrF8RC = [&] (unsigned DestReg, unsigned SrcReg) {
14489 if (Subtarget.hasDirectMove()) {
14490 BuildMI(BB&: *BB, I&: MI, MIMD: dl, MCID: TII->get(Opcode: TargetOpcode::COPY), DestReg)
14491 .addReg(RegNo: SrcReg);
14492 } else {
14493 // Use stack to do the register copy.
14494 unsigned StoreOp = PPC::STD, LoadOp = PPC::LFD;
14495 MachineRegisterInfo &RegInfo = F->getRegInfo();
14496 const TargetRegisterClass *RC = RegInfo.getRegClass(Reg: SrcReg);
14497 if (RC == &PPC::F8RCRegClass) {
14498          // Copy register from F8RCRegClass to G8RCRegClass.
14499 assert((RegInfo.getRegClass(DestReg) == &PPC::G8RCRegClass) &&
14500 "Unsupported RegClass.");
14501
14502 StoreOp = PPC::STFD;
14503 LoadOp = PPC::LD;
14504 } else {
14505          // Copy register from G8RCRegClass to F8RCRegClass.
14506 assert((RegInfo.getRegClass(SrcReg) == &PPC::G8RCRegClass) &&
14507 (RegInfo.getRegClass(DestReg) == &PPC::F8RCRegClass) &&
14508 "Unsupported RegClass.");
14509 }
14510
14511 MachineFrameInfo &MFI = F->getFrameInfo();
14512 int FrameIdx = MFI.CreateStackObject(Size: 8, Alignment: Align(8), isSpillSlot: false);
14513
14514 MachineMemOperand *MMOStore = F->getMachineMemOperand(
14515 PtrInfo: MachinePointerInfo::getFixedStack(MF&: *F, FI: FrameIdx, Offset: 0),
14516 F: MachineMemOperand::MOStore, Size: MFI.getObjectSize(ObjectIdx: FrameIdx),
14517 BaseAlignment: MFI.getObjectAlign(ObjectIdx: FrameIdx));
14518
14519 // Store the SrcReg into the stack.
14520 BuildMI(BB&: *BB, I&: MI, MIMD: dl, MCID: TII->get(Opcode: StoreOp))
14521 .addReg(RegNo: SrcReg)
14522 .addImm(Val: 0)
14523 .addFrameIndex(Idx: FrameIdx)
14524 .addMemOperand(MMO: MMOStore);
14525
14526 MachineMemOperand *MMOLoad = F->getMachineMemOperand(
14527 PtrInfo: MachinePointerInfo::getFixedStack(MF&: *F, FI: FrameIdx, Offset: 0),
14528 F: MachineMemOperand::MOLoad, Size: MFI.getObjectSize(ObjectIdx: FrameIdx),
14529 BaseAlignment: MFI.getObjectAlign(ObjectIdx: FrameIdx));
14530
14531        // Load from the stack slot where SrcReg was stored into DestReg,
14532        // completing the register-class conversion from RegClass::SrcReg to
14533        // RegClass::DestReg.
14534 BuildMI(BB&: *BB, I&: MI, MIMD: dl, MCID: TII->get(Opcode: LoadOp), DestReg)
14535 .addImm(Val: 0)
14536 .addFrameIndex(Idx: FrameIdx)
14537 .addMemOperand(MMO: MMOLoad);
14538 }
14539 };
14540
14541 Register OldFPSCRReg = MI.getOperand(i: 0).getReg();
14542
14543 // Save FPSCR value.
14544 BuildMI(BB&: *BB, I&: MI, MIMD: dl, MCID: TII->get(Opcode: PPC::MFFS), DestReg: OldFPSCRReg);
14545
14546    // When the operand is a gprc register, use its two least significant bits
14547    // and the mtfsf instruction to set bits 62:63 of the FPSCR.
14548 //
14549 // copy OldFPSCRTmpReg, OldFPSCRReg
14550 // (INSERT_SUBREG ExtSrcReg, (IMPLICIT_DEF ImDefReg), SrcOp, 1)
14551 // rldimi NewFPSCRTmpReg, ExtSrcReg, OldFPSCRReg, 0, 62
14552 // copy NewFPSCRReg, NewFPSCRTmpReg
14553 // mtfsf 255, NewFPSCRReg
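    // (The rldimi with SH = 0 and MB = 62 inserts the two least significant
    // bits of ExtSrcReg (the requested rounding mode) into bits 62:63 of the
    // saved FPSCR image, preserving all other bits.)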
14554 MachineOperand SrcOp = MI.getOperand(i: 1);
14555 MachineRegisterInfo &RegInfo = F->getRegInfo();
14556 Register OldFPSCRTmpReg = RegInfo.createVirtualRegister(RegClass: &PPC::G8RCRegClass);
14557
14558 copyRegFromG8RCOrF8RC(OldFPSCRTmpReg, OldFPSCRReg);
14559
14560 Register ImDefReg = RegInfo.createVirtualRegister(RegClass: &PPC::G8RCRegClass);
14561 Register ExtSrcReg = RegInfo.createVirtualRegister(RegClass: &PPC::G8RCRegClass);
14562
14563    // The first operand of INSERT_SUBREG should be a register that has
14564    // subregisters; since we only care about its RegClass, an IMPLICIT_DEF
14565    // register is sufficient.
14566 BuildMI(BB&: *BB, I&: MI, MIMD: dl, MCID: TII->get(Opcode: TargetOpcode::IMPLICIT_DEF), DestReg: ImDefReg);
14567 BuildMI(BB&: *BB, I&: MI, MIMD: dl, MCID: TII->get(Opcode: PPC::INSERT_SUBREG), DestReg: ExtSrcReg)
14568 .addReg(RegNo: ImDefReg)
14569 .add(MO: SrcOp)
14570 .addImm(Val: 1);
14571
14572 Register NewFPSCRTmpReg = RegInfo.createVirtualRegister(RegClass: &PPC::G8RCRegClass);
14573 BuildMI(BB&: *BB, I&: MI, MIMD: dl, MCID: TII->get(Opcode: PPC::RLDIMI), DestReg: NewFPSCRTmpReg)
14574 .addReg(RegNo: OldFPSCRTmpReg)
14575 .addReg(RegNo: ExtSrcReg)
14576 .addImm(Val: 0)
14577 .addImm(Val: 62);
14578
14579 Register NewFPSCRReg = RegInfo.createVirtualRegister(RegClass: &PPC::F8RCRegClass);
14580 copyRegFromG8RCOrF8RC(NewFPSCRReg, NewFPSCRTmpReg);
14581
14582    // The mask 255 means that bits 32:63 of NewFPSCRReg are copied into bits
14583    // 32:63 of the FPSCR.
14584 BuildMI(BB&: *BB, I&: MI, MIMD: dl, MCID: TII->get(Opcode: PPC::MTFSF))
14585 .addImm(Val: 255)
14586 .addReg(RegNo: NewFPSCRReg)
14587 .addImm(Val: 0)
14588 .addImm(Val: 0);
14589 } else if (MI.getOpcode() == PPC::SETFLM) {
14590 DebugLoc Dl = MI.getDebugLoc();
14591
14592    // The result of setflm is the previous FPSCR content, so save it first.
14593 Register OldFPSCRReg = MI.getOperand(i: 0).getReg();
14594 if (MRI.use_empty(RegNo: OldFPSCRReg))
14595 BuildMI(BB&: *BB, I&: MI, MIMD: Dl, MCID: TII->get(Opcode: TargetOpcode::IMPLICIT_DEF), DestReg: OldFPSCRReg);
14596 else
14597 BuildMI(BB&: *BB, I&: MI, MIMD: Dl, MCID: TII->get(Opcode: PPC::MFFS), DestReg: OldFPSCRReg);
14598
14599    // Put bits 32:63 of NewFPSCRReg into the FPSCR.
14600 Register NewFPSCRReg = MI.getOperand(i: 1).getReg();
14601 BuildMI(BB&: *BB, I&: MI, MIMD: Dl, MCID: TII->get(Opcode: PPC::MTFSF))
14602 .addImm(Val: 255)
14603 .addReg(RegNo: NewFPSCRReg)
14604 .addImm(Val: 0)
14605 .addImm(Val: 0);
14606 } else if (MI.getOpcode() == PPC::PROBED_ALLOCA_32 ||
14607 MI.getOpcode() == PPC::PROBED_ALLOCA_64) {
14608 return emitProbedAlloca(MI, MBB: BB);
14609 } else if (MI.getOpcode() == PPC::SPLIT_QUADWORD) {
14610 DebugLoc DL = MI.getDebugLoc();
14611 Register Src = MI.getOperand(i: 2).getReg();
14612 Register Lo = MI.getOperand(i: 0).getReg();
14613 Register Hi = MI.getOperand(i: 1).getReg();
14614 BuildMI(BB&: *BB, I&: MI, MIMD: DL, MCID: TII->get(Opcode: TargetOpcode::COPY))
14615 .addDef(RegNo: Lo)
14616 .addUse(RegNo: Src, Flags: {}, SubReg: PPC::sub_gp8_x1);
14617 BuildMI(BB&: *BB, I&: MI, MIMD: DL, MCID: TII->get(Opcode: TargetOpcode::COPY))
14618 .addDef(RegNo: Hi)
14619 .addUse(RegNo: Src, Flags: {}, SubReg: PPC::sub_gp8_x0);
14620 } else if (MI.getOpcode() == PPC::LQX_PSEUDO ||
14621 MI.getOpcode() == PPC::STQX_PSEUDO) {
14622 DebugLoc DL = MI.getDebugLoc();
14623    // Ptr is used as the ptr_rc_no_r0 part of LQ/STQ's memory operand and
14624    // holds the sum of RA and RB, so it has to be g8rc_and_g8rc_nox0.
14626 Register Ptr =
14627 F->getRegInfo().createVirtualRegister(RegClass: &PPC::G8RC_and_G8RC_NOX0RegClass);
14628 Register Val = MI.getOperand(i: 0).getReg();
14629 Register RA = MI.getOperand(i: 1).getReg();
14630 Register RB = MI.getOperand(i: 2).getReg();
14631 BuildMI(BB&: *BB, I&: MI, MIMD: DL, MCID: TII->get(Opcode: PPC::ADD8), DestReg: Ptr).addReg(RegNo: RA).addReg(RegNo: RB);
14632 BuildMI(BB&: *BB, I&: MI, MIMD: DL,
14633 MCID: MI.getOpcode() == PPC::LQX_PSEUDO ? TII->get(Opcode: PPC::LQ)
14634 : TII->get(Opcode: PPC::STQ))
14635 .addReg(RegNo: Val, Flags: getDefRegState(B: MI.getOpcode() == PPC::LQX_PSEUDO))
14636 .addImm(Val: 0)
14637 .addReg(RegNo: Ptr);
14638 } else if (MI.getOpcode() == PPC::LWAT_PSEUDO ||
14639 MI.getOpcode() == PPC::LDAT_PSEUDO) {
14640 DebugLoc DL = MI.getDebugLoc();
14641 Register DstReg = MI.getOperand(i: 0).getReg();
14642 Register PtrReg = MI.getOperand(i: 1).getReg();
14643 Register ValReg = MI.getOperand(i: 2).getReg();
14644 unsigned FC = MI.getOperand(i: 3).getImm();
14645 bool IsLwat = MI.getOpcode() == PPC::LWAT_PSEUDO;
14646 Register Val64 = MRI.createVirtualRegister(RegClass: &PPC::G8RCRegClass);
14647 if (IsLwat)
14648 BuildMI(BB&: *BB, I&: MI, MIMD: DL, MCID: TII->get(Opcode: TargetOpcode::SUBREG_TO_REG), DestReg: Val64)
14649 .addImm(Val: 0)
14650 .addReg(RegNo: ValReg)
14651 .addImm(Val: PPC::sub_32);
14652 else
14653 Val64 = ValReg;
14654
14655 Register G8rPair = MRI.createVirtualRegister(RegClass: &PPC::G8pRCRegClass);
14656 Register UndefG8r = MRI.createVirtualRegister(RegClass: &PPC::G8RCRegClass);
14657 BuildMI(BB&: *BB, I&: MI, MIMD: DL, MCID: TII->get(Opcode: TargetOpcode::IMPLICIT_DEF), DestReg: UndefG8r);
14658 BuildMI(BB&: *BB, I&: MI, MIMD: DL, MCID: TII->get(Opcode: PPC::REG_SEQUENCE), DestReg: G8rPair)
14659 .addReg(RegNo: UndefG8r)
14660 .addImm(Val: PPC::sub_gp8_x0)
14661 .addReg(RegNo: Val64)
14662 .addImm(Val: PPC::sub_gp8_x1);
14663
14664 Register PairResult = MRI.createVirtualRegister(RegClass: &PPC::G8pRCRegClass);
14665 BuildMI(BB&: *BB, I&: MI, MIMD: DL, MCID: TII->get(Opcode: IsLwat ? PPC::LWAT : PPC::LDAT), DestReg: PairResult)
14666 .addReg(RegNo: G8rPair)
14667 .addReg(RegNo: PtrReg)
14668 .addImm(Val: FC);
14669 Register Result64 = MRI.createVirtualRegister(RegClass: &PPC::G8RCRegClass);
14670 BuildMI(BB&: *BB, I&: MI, MIMD: DL, MCID: TII->get(Opcode: TargetOpcode::COPY), DestReg: Result64)
14671 .addReg(RegNo: PairResult, Flags: {}, SubReg: PPC::sub_gp8_x0);
14672 if (IsLwat)
14673 BuildMI(BB&: *BB, I&: MI, MIMD: DL, MCID: TII->get(Opcode: TargetOpcode::COPY), DestReg: DstReg)
14674 .addReg(RegNo: Result64, Flags: {}, SubReg: PPC::sub_32);
14675 else
14676 BuildMI(BB&: *BB, I&: MI, MIMD: DL, MCID: TII->get(Opcode: TargetOpcode::COPY), DestReg: DstReg)
14677 .addReg(RegNo: Result64);
14678 } else if (MI.getOpcode() == PPC::LWAT_COND_PSEUDO ||
14679 MI.getOpcode() == PPC::LDAT_COND_PSEUDO) {
14680 DebugLoc DL = MI.getDebugLoc();
14681 Register DstReg = MI.getOperand(i: 0).getReg();
14682 Register PtrReg = MI.getOperand(i: 1).getReg();
14683 unsigned FC = MI.getOperand(i: 2).getImm();
14684 bool IsLwat_Cond = MI.getOpcode() == PPC::LWAT_COND_PSEUDO;
14685
14686 Register Pair = MRI.createVirtualRegister(RegClass: &PPC::G8pRCRegClass);
14687 BuildMI(BB&: *BB, I&: MI, MIMD: DL, MCID: TII->get(Opcode: TargetOpcode::IMPLICIT_DEF), DestReg: Pair);
14688
14689 Register PairResult = MRI.createVirtualRegister(RegClass: &PPC::G8pRCRegClass);
14690 BuildMI(BB&: *BB, I&: MI, MIMD: DL, MCID: TII->get(Opcode: IsLwat_Cond ? PPC::LWAT : PPC::LDAT),
14691 DestReg: PairResult)
14692 .addReg(RegNo: Pair)
14693 .addReg(RegNo: PtrReg)
14694 .addImm(Val: FC);
14695 Register Result64 = MRI.createVirtualRegister(RegClass: &PPC::G8RCRegClass);
14696 BuildMI(BB&: *BB, I&: MI, MIMD: DL, MCID: TII->get(Opcode: TargetOpcode::COPY), DestReg: Result64)
14697 .addReg(RegNo: PairResult, Flags: {}, SubReg: PPC::sub_gp8_x0);
14698 if (IsLwat_Cond)
14699 BuildMI(BB&: *BB, I&: MI, MIMD: DL, MCID: TII->get(Opcode: TargetOpcode::COPY), DestReg: DstReg)
14700 .addReg(RegNo: Result64, Flags: {}, SubReg: PPC::sub_32);
14701 else
14702 BuildMI(BB&: *BB, I&: MI, MIMD: DL, MCID: TII->get(Opcode: TargetOpcode::COPY), DestReg: DstReg)
14703 .addReg(RegNo: Result64);
14704 } else {
14705 llvm_unreachable("Unexpected instr type to insert");
14706 }
14707
14708 MI.eraseFromParent(); // The pseudo instruction is gone now.
14709 return BB;
14710}
14711
14712//===----------------------------------------------------------------------===//
14713// Target Optimization Hooks
14714//===----------------------------------------------------------------------===//
14715
14716static int getEstimateRefinementSteps(EVT VT, const PPCSubtarget &Subtarget) {
14717 // For the estimates, convergence is quadratic, so we essentially double the
14718 // number of digits correct after every iteration. For both FRE and FRSQRTE,
14719 // the minimum architected relative accuracy is 2^-5. When hasRecipPrec(),
14720 // this is 2^-14. IEEE float has 23 digits and double has 52 digits.
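  // For example, starting from 2^-14 one step gives roughly 28 correct bits
  // (enough for f32) and two steps roughly 56 (enough for f64); starting from
  // 2^-5, three steps give roughly 40 bits and four give roughly 80.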
14721 int RefinementSteps = Subtarget.hasRecipPrec() ? 1 : 3;
14722 if (VT.getScalarType() == MVT::f64)
14723 RefinementSteps++;
14724 return RefinementSteps;
14725}
14726
14727SDValue PPCTargetLowering::getSqrtInputTest(SDValue Op, SelectionDAG &DAG,
14728 const DenormalMode &Mode) const {
14729 // We only have VSX Vector Test for software Square Root.
14730 EVT VT = Op.getValueType();
14731 if (!isTypeLegal(VT: MVT::i1) ||
14732 (VT != MVT::f64 &&
14733 ((VT != MVT::v2f64 && VT != MVT::v4f32) || !Subtarget.hasVSX())))
14734 return TargetLowering::getSqrtInputTest(Operand: Op, DAG, Mode);
14735
14736 SDLoc DL(Op);
14737  // The output register of FTSQRT is a CR field.
14738 SDValue FTSQRT = DAG.getNode(Opcode: PPCISD::FTSQRT, DL, VT: MVT::i32, Operand: Op);
14739 // ftsqrt BF,FRB
14740 // Let e_b be the unbiased exponent of the double-precision
14741 // floating-point operand in register FRB.
14742 // fe_flag is set to 1 if either of the following conditions occurs.
14743 // - The double-precision floating-point operand in register FRB is a zero,
14744  //     a NaN, an infinity, or a negative value.
14745 // - e_b is less than or equal to -970.
14746 // Otherwise fe_flag is set to 0.
14747 // Both VSX and non-VSX versions would set EQ bit in the CR if the number is
14748  // not eligible for iteration (zero/negative/infinity/NaN, or the unbiased
14749  // exponent is less than or equal to -970).
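  // The EXTRACT_SUBREG with sub_eq below extracts that EQ bit from the CR
  // field as the i1 result.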
14750 SDValue SRIdxVal = DAG.getTargetConstant(Val: PPC::sub_eq, DL, VT: MVT::i32);
14751 return SDValue(DAG.getMachineNode(Opcode: TargetOpcode::EXTRACT_SUBREG, dl: DL, VT: MVT::i1,
14752 Op1: FTSQRT, Op2: SRIdxVal),
14753 0);
14754}
14755
14756SDValue
14757PPCTargetLowering::getSqrtResultForDenormInput(SDValue Op,
14758 SelectionDAG &DAG) const {
14759 // We only have VSX Vector Square Root.
14760 EVT VT = Op.getValueType();
14761 if (VT != MVT::f64 &&
14762 ((VT != MVT::v2f64 && VT != MVT::v4f32) || !Subtarget.hasVSX()))
14763 return TargetLowering::getSqrtResultForDenormInput(Operand: Op, DAG);
14764
14765 return DAG.getNode(Opcode: PPCISD::FSQRT, DL: SDLoc(Op), VT, Operand: Op);
14766}
14767
14768SDValue PPCTargetLowering::getSqrtEstimate(SDValue Operand, SelectionDAG &DAG,
14769 int Enabled, int &RefinementSteps,
14770 bool &UseOneConstNR,
14771 bool Reciprocal) const {
14772 EVT VT = Operand.getValueType();
14773 if ((VT == MVT::f32 && Subtarget.hasFRSQRTES()) ||
14774 (VT == MVT::f64 && Subtarget.hasFRSQRTE()) ||
14775 (VT == MVT::v4f32 && Subtarget.hasAltivec()) ||
14776 (VT == MVT::v2f64 && Subtarget.hasVSX())) {
14777 if (RefinementSteps == ReciprocalEstimate::Unspecified)
14778 RefinementSteps = getEstimateRefinementSteps(VT, Subtarget);
14779
14780 // The Newton-Raphson computation with a single constant does not provide
14781 // enough accuracy on some CPUs.
14782 UseOneConstNR = !Subtarget.needsTwoConstNR();
14783 return DAG.getNode(Opcode: PPCISD::FRSQRTE, DL: SDLoc(Operand), VT, Operand);
14784 }
14785 return SDValue();
14786}
14787
14788SDValue PPCTargetLowering::getRecipEstimate(SDValue Operand, SelectionDAG &DAG,
14789 int Enabled,
14790 int &RefinementSteps) const {
14791 EVT VT = Operand.getValueType();
14792 if ((VT == MVT::f32 && Subtarget.hasFRES()) ||
14793 (VT == MVT::f64 && Subtarget.hasFRE()) ||
14794 (VT == MVT::v4f32 && Subtarget.hasAltivec()) ||
14795 (VT == MVT::v2f64 && Subtarget.hasVSX())) {
14796 if (RefinementSteps == ReciprocalEstimate::Unspecified)
14797 RefinementSteps = getEstimateRefinementSteps(VT, Subtarget);
14798 return DAG.getNode(Opcode: PPCISD::FRE, DL: SDLoc(Operand), VT, Operand);
14799 }
14800 return SDValue();
14801}
14802
14803unsigned PPCTargetLowering::combineRepeatedFPDivisors() const {
14804 // Note: This functionality is used only when arcp is enabled, and
14805 // on cores with reciprocal estimates (which are used when arcp is
14806 // enabled for division), this functionality is redundant with the default
14807 // combiner logic (once the division -> reciprocal/multiply transformation
14808 // has taken place). As a result, this matters more for older cores than for
14809 // newer ones.
14810
14811 // Combine multiple FDIVs with the same divisor into multiple FMULs by the
14812 // reciprocal if there are two or more FDIVs (for embedded cores with only
14813  // one FP pipeline) or three or more FDIVs (for generic OOO cores).
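  // For example, with a threshold of 3, a sequence such as a/d + b/d + c/d is
  // rewritten by the combiner (given the required fast-math flags) as
  // t = 1.0/d followed by a*t + b*t + c*t, while only two divisions by the
  // same d are left alone.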
14814 switch (Subtarget.getCPUDirective()) {
14815 default:
14816 return 3;
14817 case PPC::DIR_440:
14818 case PPC::DIR_A2:
14819 case PPC::DIR_E500:
14820 case PPC::DIR_E500mc:
14821 case PPC::DIR_E5500:
14822 return 2;
14823 }
14824}
14825
14826// isConsecutiveLSLoc needs to work even if all adds have not yet been
14827// collapsed, and so we need to look through chains of them.
14828static void getBaseWithConstantOffset(SDValue Loc, SDValue &Base,
14829 int64_t& Offset, SelectionDAG &DAG) {
14830 if (DAG.isBaseWithConstantOffset(Op: Loc)) {
14831 Base = Loc.getOperand(i: 0);
14832 Offset += cast<ConstantSDNode>(Val: Loc.getOperand(i: 1))->getSExtValue();
14833
14834 // The base might itself be a base plus an offset, and if so, accumulate
14835 // that as well.
14836 getBaseWithConstantOffset(Loc: Loc.getOperand(i: 0), Base, Offset, DAG);
14837 }
14838}
14839
14840static bool isConsecutiveLSLoc(SDValue Loc, EVT VT, LSBaseSDNode *Base,
14841 unsigned Bytes, int Dist,
14842 SelectionDAG &DAG) {
14843 if (VT.getSizeInBits() / 8 != Bytes)
14844 return false;
14845
14846 SDValue BaseLoc = Base->getBasePtr();
14847 if (Loc.getOpcode() == ISD::FrameIndex) {
14848 if (BaseLoc.getOpcode() != ISD::FrameIndex)
14849 return false;
14850 const MachineFrameInfo &MFI = DAG.getMachineFunction().getFrameInfo();
14851 int FI = cast<FrameIndexSDNode>(Val&: Loc)->getIndex();
14852 int BFI = cast<FrameIndexSDNode>(Val&: BaseLoc)->getIndex();
14853 int FS = MFI.getObjectSize(ObjectIdx: FI);
14854 int BFS = MFI.getObjectSize(ObjectIdx: BFI);
14855 if (FS != BFS || FS != (int)Bytes) return false;
14856 return MFI.getObjectOffset(ObjectIdx: FI) == (MFI.getObjectOffset(ObjectIdx: BFI) + Dist*Bytes);
14857 }
14858
14859 SDValue Base1 = Loc, Base2 = BaseLoc;
14860 int64_t Offset1 = 0, Offset2 = 0;
14861 getBaseWithConstantOffset(Loc, Base&: Base1, Offset&: Offset1, DAG);
14862 getBaseWithConstantOffset(Loc: BaseLoc, Base&: Base2, Offset&: Offset2, DAG);
14863 if (Base1 == Base2 && Offset1 == (Offset2 + Dist * Bytes))
14864 return true;
14865
14866 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
14867 const GlobalValue *GV1 = nullptr;
14868 const GlobalValue *GV2 = nullptr;
14869 Offset1 = 0;
14870 Offset2 = 0;
14871 bool isGA1 = TLI.isGAPlusOffset(N: Loc.getNode(), GA&: GV1, Offset&: Offset1);
14872 bool isGA2 = TLI.isGAPlusOffset(N: BaseLoc.getNode(), GA&: GV2, Offset&: Offset2);
14873 if (isGA1 && isGA2 && GV1 == GV2)
14874 return Offset1 == (Offset2 + Dist*Bytes);
14875 return false;
14876}
14877
14878// Like SelectionDAG::isConsecutiveLoad, but also works for stores, and does
14879// not enforce equality of the chain operands.
14880static bool isConsecutiveLS(SDNode *N, LSBaseSDNode *Base,
14881 unsigned Bytes, int Dist,
14882 SelectionDAG &DAG) {
14883 if (LSBaseSDNode *LS = dyn_cast<LSBaseSDNode>(Val: N)) {
14884 EVT VT = LS->getMemoryVT();
14885 SDValue Loc = LS->getBasePtr();
14886 return isConsecutiveLSLoc(Loc, VT, Base, Bytes, Dist, DAG);
14887 }
14888
14889 if (N->getOpcode() == ISD::INTRINSIC_W_CHAIN) {
14890 EVT VT;
14891 switch (N->getConstantOperandVal(Num: 1)) {
14892 default: return false;
14893 case Intrinsic::ppc_altivec_lvx:
14894 case Intrinsic::ppc_altivec_lvxl:
14895 case Intrinsic::ppc_vsx_lxvw4x:
14896 case Intrinsic::ppc_vsx_lxvw4x_be:
14897 VT = MVT::v4i32;
14898 break;
14899 case Intrinsic::ppc_vsx_lxvd2x:
14900 case Intrinsic::ppc_vsx_lxvd2x_be:
14901 VT = MVT::v2f64;
14902 break;
14903 case Intrinsic::ppc_altivec_lvebx:
14904 VT = MVT::i8;
14905 break;
14906 case Intrinsic::ppc_altivec_lvehx:
14907 VT = MVT::i16;
14908 break;
14909 case Intrinsic::ppc_altivec_lvewx:
14910 VT = MVT::i32;
14911 break;
14912 }
14913
14914 return isConsecutiveLSLoc(Loc: N->getOperand(Num: 2), VT, Base, Bytes, Dist, DAG);
14915 }
14916
14917 if (N->getOpcode() == ISD::INTRINSIC_VOID) {
14918 EVT VT;
14919 switch (N->getConstantOperandVal(Num: 1)) {
14920 default: return false;
14921 case Intrinsic::ppc_altivec_stvx:
14922 case Intrinsic::ppc_altivec_stvxl:
14923 case Intrinsic::ppc_vsx_stxvw4x:
14924 VT = MVT::v4i32;
14925 break;
14926 case Intrinsic::ppc_vsx_stxvd2x:
14927 VT = MVT::v2f64;
14928 break;
14929 case Intrinsic::ppc_vsx_stxvw4x_be:
14930 VT = MVT::v4i32;
14931 break;
14932 case Intrinsic::ppc_vsx_stxvd2x_be:
14933 VT = MVT::v2f64;
14934 break;
14935 case Intrinsic::ppc_altivec_stvebx:
14936 VT = MVT::i8;
14937 break;
14938 case Intrinsic::ppc_altivec_stvehx:
14939 VT = MVT::i16;
14940 break;
14941 case Intrinsic::ppc_altivec_stvewx:
14942 VT = MVT::i32;
14943 break;
14944 }
14945
14946 return isConsecutiveLSLoc(Loc: N->getOperand(Num: 3), VT, Base, Bytes, Dist, DAG);
14947 }
14948
14949 return false;
14950}
14951
14952// Return true if there is a nearby consecutive load to the one provided
14953// (regardless of alignment). We search up and down the chain, looking through
14954// token factors and other loads (but nothing else). As a result, a true result
14955// indicates that it is safe to create a new consecutive load adjacent to the
14956// load provided.
14957static bool findConsecutiveLoad(LoadSDNode *LD, SelectionDAG &DAG) {
14958 SDValue Chain = LD->getChain();
14959 EVT VT = LD->getMemoryVT();
14960
14961 SmallPtrSet<SDNode *, 16> LoadRoots;
14962 SmallVector<SDNode *, 8> Queue(1, Chain.getNode());
14963 SmallPtrSet<SDNode *, 16> Visited;
14964
14965 // First, search up the chain, branching to follow all token-factor operands.
14966  // If we find a consecutive load, then we're done; otherwise, record all
14967 // nodes just above the top-level loads and token factors.
14968 while (!Queue.empty()) {
14969 SDNode *ChainNext = Queue.pop_back_val();
14970 if (!Visited.insert(Ptr: ChainNext).second)
14971 continue;
14972
14973 if (MemSDNode *ChainLD = dyn_cast<MemSDNode>(Val: ChainNext)) {
14974 if (isConsecutiveLS(N: ChainLD, Base: LD, Bytes: VT.getStoreSize(), Dist: 1, DAG))
14975 return true;
14976
14977 if (!Visited.count(Ptr: ChainLD->getChain().getNode()))
14978 Queue.push_back(Elt: ChainLD->getChain().getNode());
14979 } else if (ChainNext->getOpcode() == ISD::TokenFactor) {
14980 for (const SDUse &O : ChainNext->ops())
14981 if (!Visited.count(Ptr: O.getNode()))
14982 Queue.push_back(Elt: O.getNode());
14983 } else
14984 LoadRoots.insert(Ptr: ChainNext);
14985 }
14986
14987 // Second, search down the chain, starting from the top-level nodes recorded
14988 // in the first phase. These top-level nodes are the nodes just above all
14989  // loads and token factors. Starting with their uses, recursively look through
14990 // all loads (just the chain uses) and token factors to find a consecutive
14991 // load.
14992 Visited.clear();
14993 Queue.clear();
14994
14995 for (SDNode *I : LoadRoots) {
14996 Queue.push_back(Elt: I);
14997
14998 while (!Queue.empty()) {
14999 SDNode *LoadRoot = Queue.pop_back_val();
15000 if (!Visited.insert(Ptr: LoadRoot).second)
15001 continue;
15002
15003 if (MemSDNode *ChainLD = dyn_cast<MemSDNode>(Val: LoadRoot))
15004 if (isConsecutiveLS(N: ChainLD, Base: LD, Bytes: VT.getStoreSize(), Dist: 1, DAG))
15005 return true;
15006
15007 for (SDNode *U : LoadRoot->users())
15008 if (((isa<MemSDNode>(Val: U) &&
15009 cast<MemSDNode>(Val: U)->getChain().getNode() == LoadRoot) ||
15010 U->getOpcode() == ISD::TokenFactor) &&
15011 !Visited.count(Ptr: U))
15012 Queue.push_back(Elt: U);
15013 }
15014 }
15015
15016 return false;
15017}
15018
15019/// This function is called when we have proved that a SETCC node can be
15020/// replaced by subtraction (and other supporting instructions) so that the
15021/// result of the comparison is kept in a GPR instead of a CR. This function is
15022/// purely for codegen purposes and has some flags to guide the codegen process.
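/// For illustration, on a 64-bit subtarget: lowering (setult x, y) for i32
/// operands zero-extends both to i64, computes x - y, and shifts the result
/// right logically by 63; the remaining bit is 1 exactly when x < y (unsigned).
/// The Swap and Complement flags adapt this pattern to the other unsigned
/// predicates.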
15023static SDValue generateEquivalentSub(SDNode *N, int Size, bool Complement,
15024 bool Swap, SDLoc &DL, SelectionDAG &DAG) {
15025 assert(N->getOpcode() == ISD::SETCC && "ISD::SETCC Expected.");
15026
15027 // Zero extend the operands to the largest legal integer. Originally, they
15028 // must be of a strictly smaller size.
15029 auto Op0 = DAG.getNode(Opcode: ISD::ZERO_EXTEND, DL, VT: MVT::i64, N1: N->getOperand(Num: 0),
15030 N2: DAG.getConstant(Val: Size, DL, VT: MVT::i32));
15031 auto Op1 = DAG.getNode(Opcode: ISD::ZERO_EXTEND, DL, VT: MVT::i64, N1: N->getOperand(Num: 1),
15032 N2: DAG.getConstant(Val: Size, DL, VT: MVT::i32));
15033
15034 // Swap if needed. Depends on the condition code.
15035 if (Swap)
15036 std::swap(a&: Op0, b&: Op1);
15037
15038 // Subtract extended integers.
15039 auto SubNode = DAG.getNode(Opcode: ISD::SUB, DL, VT: MVT::i64, N1: Op0, N2: Op1);
15040
15041  // Move the sign bit to the least significant position and zero out the rest.
15042  // The least significant bit now holds the result of the original comparison.
15043 auto Shifted = DAG.getNode(Opcode: ISD::SRL, DL, VT: MVT::i64, N1: SubNode,
15044 N2: DAG.getConstant(Val: Size - 1, DL, VT: MVT::i32));
15045 auto Final = Shifted;
15046
15047 // Complement the result if needed. Based on the condition code.
15048 if (Complement)
15049 Final = DAG.getNode(Opcode: ISD::XOR, DL, VT: MVT::i64, N1: Shifted,
15050 N2: DAG.getConstant(Val: 1, DL, VT: MVT::i64));
15051
15052 return DAG.getNode(Opcode: ISD::TRUNCATE, DL, VT: MVT::i1, Operand: Final);
15053}
15054
15055SDValue PPCTargetLowering::ConvertSETCCToSubtract(SDNode *N,
15056 DAGCombinerInfo &DCI) const {
15057 assert(N->getOpcode() == ISD::SETCC && "ISD::SETCC Expected.");
15058
15059 SelectionDAG &DAG = DCI.DAG;
15060 SDLoc DL(N);
15061
15062  // The size of the integers being compared plays a critical role in the
15063  // following analysis, so we prefer to do this when all types are legal.
15064 if (!DCI.isAfterLegalizeDAG())
15065 return SDValue();
15066
15067 // If all users of SETCC extend its value to a legal integer type
15068 // then we replace SETCC with a subtraction
15069 for (const SDNode *U : N->users())
15070 if (U->getOpcode() != ISD::ZERO_EXTEND)
15071 return SDValue();
15072
15073 ISD::CondCode CC = cast<CondCodeSDNode>(Val: N->getOperand(Num: 2))->get();
15074 auto OpSize = N->getOperand(Num: 0).getValueSizeInBits();
15075
15076 unsigned Size = DAG.getDataLayout().getLargestLegalIntTypeSizeInBits();
15077
15078 if (OpSize < Size) {
15079 switch (CC) {
15080 default: break;
15081 case ISD::SETULT:
15082 return generateEquivalentSub(N, Size, Complement: false, Swap: false, DL, DAG);
15083 case ISD::SETULE:
15084 return generateEquivalentSub(N, Size, Complement: true, Swap: true, DL, DAG);
15085 case ISD::SETUGT:
15086 return generateEquivalentSub(N, Size, Complement: false, Swap: true, DL, DAG);
15087 case ISD::SETUGE:
15088 return generateEquivalentSub(N, Size, Complement: true, Swap: false, DL, DAG);
15089 }
15090 }
15091
15092 return SDValue();
15093}
15094
15095SDValue PPCTargetLowering::DAGCombineTruncBoolExt(SDNode *N,
15096 DAGCombinerInfo &DCI) const {
15097 SelectionDAG &DAG = DCI.DAG;
15098 SDLoc dl(N);
15099
15100 assert(Subtarget.useCRBits() && "Expecting to be tracking CR bits");
15101 // If we're tracking CR bits, we need to be careful that we don't have:
15102 // trunc(binary-ops(zext(x), zext(y)))
15103 // or
15104  // trunc(binary-ops(binary-ops(zext(x), zext(y)), ...))
15105 // such that we're unnecessarily moving things into GPRs when it would be
15106 // better to keep them in CR bits.
15107
15108 // Note that trunc here can be an actual i1 trunc, or can be the effective
15109 // truncation that comes from a setcc or select_cc.
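  // For example, (trunc (xor (zext %a), (zext %b)) to i1), where %a and %b are
  // i1 values, can be rewritten to xor the i1 values directly so that the
  // computation stays in CR bits.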
15110 if (N->getOpcode() == ISD::TRUNCATE &&
15111 N->getValueType(ResNo: 0) != MVT::i1)
15112 return SDValue();
15113
15114 if (N->getOperand(Num: 0).getValueType() != MVT::i32 &&
15115 N->getOperand(Num: 0).getValueType() != MVT::i64)
15116 return SDValue();
15117
15118 if (N->getOpcode() == ISD::SETCC ||
15119 N->getOpcode() == ISD::SELECT_CC) {
15120 // If we're looking at a comparison, then we need to make sure that the
15121  // high bits (all except for the first) don't affect the result.
15122 ISD::CondCode CC =
15123 cast<CondCodeSDNode>(Val: N->getOperand(
15124 Num: N->getOpcode() == ISD::SETCC ? 2 : 4))->get();
15125 unsigned OpBits = N->getOperand(Num: 0).getValueSizeInBits();
15126
15127 if (ISD::isSignedIntSetCC(Code: CC)) {
15128 if (DAG.ComputeNumSignBits(Op: N->getOperand(Num: 0)) != OpBits ||
15129 DAG.ComputeNumSignBits(Op: N->getOperand(Num: 1)) != OpBits)
15130 return SDValue();
15131 } else if (ISD::isUnsignedIntSetCC(Code: CC)) {
15132 if (!DAG.MaskedValueIsZero(Op: N->getOperand(Num: 0),
15133 Mask: APInt::getHighBitsSet(numBits: OpBits, hiBitsSet: OpBits-1)) ||
15134 !DAG.MaskedValueIsZero(Op: N->getOperand(Num: 1),
15135 Mask: APInt::getHighBitsSet(numBits: OpBits, hiBitsSet: OpBits-1)))
15136 return (N->getOpcode() == ISD::SETCC ? ConvertSETCCToSubtract(N, DCI)
15137 : SDValue());
15138 } else {
15139 // This is neither a signed nor an unsigned comparison, just make sure
15140 // that the high bits are equal.
15141 KnownBits Op1Known = DAG.computeKnownBits(Op: N->getOperand(Num: 0));
15142 KnownBits Op2Known = DAG.computeKnownBits(Op: N->getOperand(Num: 1));
15143
15144 // We don't really care about what is known about the first bit (if
15145 // anything), so pretend that it is known zero for both to ensure they can
15146 // be compared as constants.
15147 Op1Known.Zero.setBit(0); Op1Known.One.clearBit(BitPosition: 0);
15148 Op2Known.Zero.setBit(0); Op2Known.One.clearBit(BitPosition: 0);
15149
15150 if (!Op1Known.isConstant() || !Op2Known.isConstant() ||
15151 Op1Known.getConstant() != Op2Known.getConstant())
15152 return SDValue();
15153 }
15154 }
15155
15156 // We now know that the higher-order bits are irrelevant, we just need to
15157 // make sure that all of the intermediate operations are bit operations, and
15158 // all inputs are extensions.
15159 if (N->getOperand(Num: 0).getOpcode() != ISD::AND &&
15160 N->getOperand(Num: 0).getOpcode() != ISD::OR &&
15161 N->getOperand(Num: 0).getOpcode() != ISD::XOR &&
15162 N->getOperand(Num: 0).getOpcode() != ISD::SELECT &&
15163 N->getOperand(Num: 0).getOpcode() != ISD::SELECT_CC &&
15164 N->getOperand(Num: 0).getOpcode() != ISD::TRUNCATE &&
15165 N->getOperand(Num: 0).getOpcode() != ISD::SIGN_EXTEND &&
15166 N->getOperand(Num: 0).getOpcode() != ISD::ZERO_EXTEND &&
15167 N->getOperand(Num: 0).getOpcode() != ISD::ANY_EXTEND)
15168 return SDValue();
15169
15170 if ((N->getOpcode() == ISD::SETCC || N->getOpcode() == ISD::SELECT_CC) &&
15171 N->getOperand(Num: 1).getOpcode() != ISD::AND &&
15172 N->getOperand(Num: 1).getOpcode() != ISD::OR &&
15173 N->getOperand(Num: 1).getOpcode() != ISD::XOR &&
15174 N->getOperand(Num: 1).getOpcode() != ISD::SELECT &&
15175 N->getOperand(Num: 1).getOpcode() != ISD::SELECT_CC &&
15176 N->getOperand(Num: 1).getOpcode() != ISD::TRUNCATE &&
15177 N->getOperand(Num: 1).getOpcode() != ISD::SIGN_EXTEND &&
15178 N->getOperand(Num: 1).getOpcode() != ISD::ZERO_EXTEND &&
15179 N->getOperand(Num: 1).getOpcode() != ISD::ANY_EXTEND)
15180 return SDValue();
15181
15182 SmallVector<SDValue, 4> Inputs;
15183 SmallVector<SDValue, 8> BinOps, PromOps;
15184 SmallPtrSet<SDNode *, 16> Visited;
15185
15186 for (unsigned i = 0; i < 2; ++i) {
15187 if (((N->getOperand(Num: i).getOpcode() == ISD::SIGN_EXTEND ||
15188 N->getOperand(Num: i).getOpcode() == ISD::ZERO_EXTEND ||
15189 N->getOperand(Num: i).getOpcode() == ISD::ANY_EXTEND) &&
15190 N->getOperand(Num: i).getOperand(i: 0).getValueType() == MVT::i1) ||
15191 isa<ConstantSDNode>(Val: N->getOperand(Num: i)))
15192 Inputs.push_back(Elt: N->getOperand(Num: i));
15193 else
15194 BinOps.push_back(Elt: N->getOperand(Num: i));
15195
15196 if (N->getOpcode() == ISD::TRUNCATE)
15197 break;
15198 }
15199
15200 // Visit all inputs, collect all binary operations (and, or, xor and
15201 // select) that are all fed by extensions.
15202 while (!BinOps.empty()) {
15203 SDValue BinOp = BinOps.pop_back_val();
15204
15205 if (!Visited.insert(Ptr: BinOp.getNode()).second)
15206 continue;
15207
15208 PromOps.push_back(Elt: BinOp);
15209
15210 for (unsigned i = 0, ie = BinOp.getNumOperands(); i != ie; ++i) {
15211 // The condition of the select is not promoted.
15212 if (BinOp.getOpcode() == ISD::SELECT && i == 0)
15213 continue;
15214 if (BinOp.getOpcode() == ISD::SELECT_CC && i != 2 && i != 3)
15215 continue;
15216
15217 if (((BinOp.getOperand(i).getOpcode() == ISD::SIGN_EXTEND ||
15218 BinOp.getOperand(i).getOpcode() == ISD::ZERO_EXTEND ||
15219 BinOp.getOperand(i).getOpcode() == ISD::ANY_EXTEND) &&
15220 BinOp.getOperand(i).getOperand(i: 0).getValueType() == MVT::i1) ||
15221 isa<ConstantSDNode>(Val: BinOp.getOperand(i))) {
15222 Inputs.push_back(Elt: BinOp.getOperand(i));
15223 } else if (BinOp.getOperand(i).getOpcode() == ISD::AND ||
15224 BinOp.getOperand(i).getOpcode() == ISD::OR ||
15225 BinOp.getOperand(i).getOpcode() == ISD::XOR ||
15226 BinOp.getOperand(i).getOpcode() == ISD::SELECT ||
15227 BinOp.getOperand(i).getOpcode() == ISD::SELECT_CC ||
15228 BinOp.getOperand(i).getOpcode() == ISD::TRUNCATE ||
15229 BinOp.getOperand(i).getOpcode() == ISD::SIGN_EXTEND ||
15230 BinOp.getOperand(i).getOpcode() == ISD::ZERO_EXTEND ||
15231 BinOp.getOperand(i).getOpcode() == ISD::ANY_EXTEND) {
15232 BinOps.push_back(Elt: BinOp.getOperand(i));
15233 } else {
15234 // We have an input that is not an extension or another binary
15235 // operation; we'll abort this transformation.
15236 return SDValue();
15237 }
15238 }
15239 }
15240
15241 // Make sure that this is a self-contained cluster of operations (which
15242 // is not quite the same thing as saying that everything has only one
15243 // use).
15244 for (unsigned i = 0, ie = Inputs.size(); i != ie; ++i) {
15245 if (isa<ConstantSDNode>(Val: Inputs[i]))
15246 continue;
15247
15248 for (const SDNode *User : Inputs[i].getNode()->users()) {
15249 if (User != N && !Visited.count(Ptr: User))
15250 return SDValue();
15251
15252 // Make sure that we're not going to promote the non-output-value
15253 // operand(s) or SELECT or SELECT_CC.
15254 // FIXME: Although we could sometimes handle this, and it does occur in
15255 // practice that one of the condition inputs to the select is also one of
15256 // the outputs, we currently can't deal with this.
15257 if (User->getOpcode() == ISD::SELECT) {
15258 if (User->getOperand(Num: 0) == Inputs[i])
15259 return SDValue();
15260 } else if (User->getOpcode() == ISD::SELECT_CC) {
15261 if (User->getOperand(Num: 0) == Inputs[i] ||
15262 User->getOperand(Num: 1) == Inputs[i])
15263 return SDValue();
15264 }
15265 }
15266 }
15267
15268 for (unsigned i = 0, ie = PromOps.size(); i != ie; ++i) {
15269 for (const SDNode *User : PromOps[i].getNode()->users()) {
15270 if (User != N && !Visited.count(Ptr: User))
15271 return SDValue();
15272
15273 // Make sure that we're not going to promote the non-output-value
15274 // operand(s) or SELECT or SELECT_CC.
15275 // FIXME: Although we could sometimes handle this, and it does occur in
15276 // practice that one of the condition inputs to the select is also one of
15277 // the outputs, we currently can't deal with this.
15278 if (User->getOpcode() == ISD::SELECT) {
15279 if (User->getOperand(Num: 0) == PromOps[i])
15280 return SDValue();
15281 } else if (User->getOpcode() == ISD::SELECT_CC) {
15282 if (User->getOperand(Num: 0) == PromOps[i] ||
15283 User->getOperand(Num: 1) == PromOps[i])
15284 return SDValue();
15285 }
15286 }
15287 }
15288
15289 // Replace all inputs with the extension operand.
15290 for (unsigned i = 0, ie = Inputs.size(); i != ie; ++i) {
15291 // Constants may have users outside the cluster of to-be-promoted nodes,
15292 // and so we need to replace those as we do the promotions.
15293 if (isa<ConstantSDNode>(Val: Inputs[i]))
15294 continue;
15295 else
15296 DAG.ReplaceAllUsesOfValueWith(From: Inputs[i], To: Inputs[i].getOperand(i: 0));
15297 }
15298
15299 std::list<HandleSDNode> PromOpHandles;
15300 for (auto &PromOp : PromOps)
15301 PromOpHandles.emplace_back(args&: PromOp);
15302
15303 // Replace all operations (these are all the same, but have a different
15304 // (i1) return type). DAG.getNode will validate that the types of
15305 // a binary operator match, so go through the list in reverse so that
15306 // we've likely promoted both operands first. Any intermediate truncations or
15307 // extensions disappear.
15308 while (!PromOpHandles.empty()) {
15309 SDValue PromOp = PromOpHandles.back().getValue();
15310 PromOpHandles.pop_back();
15311
15312 if (PromOp.getOpcode() == ISD::TRUNCATE ||
15313 PromOp.getOpcode() == ISD::SIGN_EXTEND ||
15314 PromOp.getOpcode() == ISD::ZERO_EXTEND ||
15315 PromOp.getOpcode() == ISD::ANY_EXTEND) {
15316 if (!isa<ConstantSDNode>(Val: PromOp.getOperand(i: 0)) &&
15317 PromOp.getOperand(i: 0).getValueType() != MVT::i1) {
15318 // The operand is not yet ready (see comment below).
15319 PromOpHandles.emplace_front(args&: PromOp);
15320 continue;
15321 }
15322
15323 SDValue RepValue = PromOp.getOperand(i: 0);
15324 if (isa<ConstantSDNode>(Val: RepValue))
15325 RepValue = DAG.getNode(Opcode: ISD::TRUNCATE, DL: dl, VT: MVT::i1, Operand: RepValue);
15326
15327 DAG.ReplaceAllUsesOfValueWith(From: PromOp, To: RepValue);
15328 continue;
15329 }
15330
15331 unsigned C;
15332 switch (PromOp.getOpcode()) {
15333 default: C = 0; break;
15334 case ISD::SELECT: C = 1; break;
15335 case ISD::SELECT_CC: C = 2; break;
15336 }
15337
15338 if ((!isa<ConstantSDNode>(Val: PromOp.getOperand(i: C)) &&
15339 PromOp.getOperand(i: C).getValueType() != MVT::i1) ||
15340 (!isa<ConstantSDNode>(Val: PromOp.getOperand(i: C+1)) &&
15341 PromOp.getOperand(i: C+1).getValueType() != MVT::i1)) {
15342 // The to-be-promoted operands of this node have not yet been
15343 // promoted (this should be rare because we're going through the
15344 // list backward, but if one of the operands has several users in
15345 // this cluster of to-be-promoted nodes, it is possible).
15346 PromOpHandles.emplace_front(args&: PromOp);
15347 continue;
15348 }
15349
15350 SmallVector<SDValue, 3> Ops(PromOp.getNode()->ops());
15351
15352 // If there are any constant inputs, make sure they're replaced now.
15353 for (unsigned i = 0; i < 2; ++i)
15354 if (isa<ConstantSDNode>(Val: Ops[C+i]))
15355 Ops[C+i] = DAG.getNode(Opcode: ISD::TRUNCATE, DL: dl, VT: MVT::i1, Operand: Ops[C+i]);
15356
15357 DAG.ReplaceAllUsesOfValueWith(From: PromOp,
15358 To: DAG.getNode(Opcode: PromOp.getOpcode(), DL: dl, VT: MVT::i1, Ops));
15359 }
15360
15361 // Now we're left with the initial truncation itself.
15362 if (N->getOpcode() == ISD::TRUNCATE)
15363 return N->getOperand(Num: 0);
15364
15365 // Otherwise, this is a comparison. The operands to be compared have just
15366 // changed type (to i1), but everything else is the same.
15367 return SDValue(N, 0);
15368}
15369
15370SDValue PPCTargetLowering::DAGCombineExtBoolTrunc(SDNode *N,
15371 DAGCombinerInfo &DCI) const {
15372 SelectionDAG &DAG = DCI.DAG;
15373 SDLoc dl(N);
15374
15375 // If we're tracking CR bits, we need to be careful that we don't have:
15376 // zext(binary-ops(trunc(x), trunc(y)))
15377 // or
15378  // zext(binary-ops(binary-ops(trunc(x), trunc(y)), ...))
15379 // such that we're unnecessarily moving things into CR bits that can more
15380 // efficiently stay in GPRs. Note that if we're not certain that the high
15381 // bits are set as required by the final extension, we still may need to do
15382 // some masking to get the proper behavior.
15383
15384 // This same functionality is important on PPC64 when dealing with
15385 // 32-to-64-bit extensions; these occur often when 32-bit values are used as
15386 // the return values of functions. Because it is so similar, it is handled
15387 // here as well.
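  // For example, (zext (and (trunc %x), (trunc %y)) to i64), where %x and %y
  // are i64 values truncated to i1, can be rewritten to perform the and on the
  // original i64 values, with masking added afterwards if the high bits are
  // not already known to match the extension.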
15388
15389 if (N->getValueType(ResNo: 0) != MVT::i32 &&
15390 N->getValueType(ResNo: 0) != MVT::i64)
15391 return SDValue();
15392
15393 if (!((N->getOperand(Num: 0).getValueType() == MVT::i1 && Subtarget.useCRBits()) ||
15394 (N->getOperand(Num: 0).getValueType() == MVT::i32 && Subtarget.isPPC64())))
15395 return SDValue();
15396
15397 if (N->getOperand(Num: 0).getOpcode() != ISD::AND &&
15398 N->getOperand(Num: 0).getOpcode() != ISD::OR &&
15399 N->getOperand(Num: 0).getOpcode() != ISD::XOR &&
15400 N->getOperand(Num: 0).getOpcode() != ISD::SELECT &&
15401 N->getOperand(Num: 0).getOpcode() != ISD::SELECT_CC)
15402 return SDValue();
15403
15404 SmallVector<SDValue, 4> Inputs;
15405 SmallVector<SDValue, 8> BinOps(1, N->getOperand(Num: 0)), PromOps;
15406 SmallPtrSet<SDNode *, 16> Visited;
15407
15408 // Visit all inputs, collect all binary operations (and, or, xor and
15409 // select) that are all fed by truncations.
15410 while (!BinOps.empty()) {
15411 SDValue BinOp = BinOps.pop_back_val();
15412
15413 if (!Visited.insert(Ptr: BinOp.getNode()).second)
15414 continue;
15415
15416 PromOps.push_back(Elt: BinOp);
15417
15418 for (unsigned i = 0, ie = BinOp.getNumOperands(); i != ie; ++i) {
15419 // The condition of the select is not promoted.
15420 if (BinOp.getOpcode() == ISD::SELECT && i == 0)
15421 continue;
15422 if (BinOp.getOpcode() == ISD::SELECT_CC && i != 2 && i != 3)
15423 continue;
15424
15425 if (BinOp.getOperand(i).getOpcode() == ISD::TRUNCATE ||
15426 isa<ConstantSDNode>(Val: BinOp.getOperand(i))) {
15427 Inputs.push_back(Elt: BinOp.getOperand(i));
15428 } else if (BinOp.getOperand(i).getOpcode() == ISD::AND ||
15429 BinOp.getOperand(i).getOpcode() == ISD::OR ||
15430 BinOp.getOperand(i).getOpcode() == ISD::XOR ||
15431 BinOp.getOperand(i).getOpcode() == ISD::SELECT ||
15432 BinOp.getOperand(i).getOpcode() == ISD::SELECT_CC) {
15433 BinOps.push_back(Elt: BinOp.getOperand(i));
15434 } else {
15435 // We have an input that is not a truncation or another binary
15436 // operation; we'll abort this transformation.
15437 return SDValue();
15438 }
15439 }
15440 }
15441
15442  // Records the operands of a select that must be truncated when the select is
15443  // promoted because the operand is actually part of the to-be-promoted set.
15444 DenseMap<SDNode *, EVT> SelectTruncOp[2];
15445
15446 // Make sure that this is a self-contained cluster of operations (which
15447 // is not quite the same thing as saying that everything has only one
15448 // use).
15449 for (unsigned i = 0, ie = Inputs.size(); i != ie; ++i) {
15450 if (isa<ConstantSDNode>(Val: Inputs[i]))
15451 continue;
15452
15453 for (SDNode *User : Inputs[i].getNode()->users()) {
15454 if (User != N && !Visited.count(Ptr: User))
15455 return SDValue();
15456
15457 // If we're going to promote the non-output-value operand(s) or SELECT or
15458 // SELECT_CC, record them for truncation.
15459 if (User->getOpcode() == ISD::SELECT) {
15460 if (User->getOperand(Num: 0) == Inputs[i])
15461 SelectTruncOp[0].insert(KV: std::make_pair(x&: User,
15462 y: User->getOperand(Num: 0).getValueType()));
15463 } else if (User->getOpcode() == ISD::SELECT_CC) {
15464 if (User->getOperand(Num: 0) == Inputs[i])
15465 SelectTruncOp[0].insert(KV: std::make_pair(x&: User,
15466 y: User->getOperand(Num: 0).getValueType()));
15467 if (User->getOperand(Num: 1) == Inputs[i])
15468 SelectTruncOp[1].insert(KV: std::make_pair(x&: User,
15469 y: User->getOperand(Num: 1).getValueType()));
15470 }
15471 }
15472 }
15473
15474 for (unsigned i = 0, ie = PromOps.size(); i != ie; ++i) {
15475 for (SDNode *User : PromOps[i].getNode()->users()) {
15476 if (User != N && !Visited.count(Ptr: User))
15477 return SDValue();
15478
15479 // If we're going to promote the non-output-value operand(s) or SELECT or
15480 // SELECT_CC, record them for truncation.
15481 if (User->getOpcode() == ISD::SELECT) {
15482 if (User->getOperand(Num: 0) == PromOps[i])
15483 SelectTruncOp[0].insert(KV: std::make_pair(x&: User,
15484 y: User->getOperand(Num: 0).getValueType()));
15485 } else if (User->getOpcode() == ISD::SELECT_CC) {
15486 if (User->getOperand(Num: 0) == PromOps[i])
15487 SelectTruncOp[0].insert(KV: std::make_pair(x&: User,
15488 y: User->getOperand(Num: 0).getValueType()));
15489 if (User->getOperand(Num: 1) == PromOps[i])
15490 SelectTruncOp[1].insert(KV: std::make_pair(x&: User,
15491 y: User->getOperand(Num: 1).getValueType()));
15492 }
15493 }
15494 }
15495
15496 unsigned PromBits = N->getOperand(Num: 0).getValueSizeInBits();
15497 bool ReallyNeedsExt = false;
15498 if (N->getOpcode() != ISD::ANY_EXTEND) {
    // Unless all of the inputs are already sign/zero extended, we'll still
    // need to do that at the end.
15501 for (unsigned i = 0, ie = Inputs.size(); i != ie; ++i) {
15502 if (isa<ConstantSDNode>(Val: Inputs[i]))
15503 continue;
15504
15505 unsigned OpBits =
15506 Inputs[i].getOperand(i: 0).getValueSizeInBits();
15507 assert(PromBits < OpBits && "Truncation not to a smaller bit count?");
15508
15509 if ((N->getOpcode() == ISD::ZERO_EXTEND &&
15510 !DAG.MaskedValueIsZero(Op: Inputs[i].getOperand(i: 0),
15511 Mask: APInt::getHighBitsSet(numBits: OpBits,
15512 hiBitsSet: OpBits-PromBits))) ||
15513 (N->getOpcode() == ISD::SIGN_EXTEND &&
15514 DAG.ComputeNumSignBits(Op: Inputs[i].getOperand(i: 0)) <
15515 (OpBits-(PromBits-1)))) {
15516 ReallyNeedsExt = true;
15517 break;
15518 }
15519 }
15520 }
15521
15522 // Convert PromOps to handles before doing any RAUW operations, as these
15523 // may CSE with existing nodes, deleting the originals.
15524 std::list<HandleSDNode> PromOpHandles;
15525 for (auto &PromOp : PromOps)
15526 PromOpHandles.emplace_back(args&: PromOp);
15527
15528 // Replace all inputs, either with the truncation operand, or a
15529 // truncation or extension to the final output type.
15530 for (unsigned i = 0, ie = Inputs.size(); i != ie; ++i) {
15531 // Constant inputs need to be replaced with the to-be-promoted nodes that
15532 // use them because they might have users outside of the cluster of
15533 // promoted nodes.
15534 if (isa<ConstantSDNode>(Val: Inputs[i]))
15535 continue;
15536
15537 SDValue InSrc = Inputs[i].getOperand(i: 0);
15538 if (Inputs[i].getValueType() == N->getValueType(ResNo: 0))
15539 DAG.ReplaceAllUsesOfValueWith(From: Inputs[i], To: InSrc);
15540 else if (N->getOpcode() == ISD::SIGN_EXTEND)
15541 DAG.ReplaceAllUsesOfValueWith(From: Inputs[i],
15542 To: DAG.getSExtOrTrunc(Op: InSrc, DL: dl, VT: N->getValueType(ResNo: 0)));
15543 else if (N->getOpcode() == ISD::ZERO_EXTEND)
15544 DAG.ReplaceAllUsesOfValueWith(From: Inputs[i],
15545 To: DAG.getZExtOrTrunc(Op: InSrc, DL: dl, VT: N->getValueType(ResNo: 0)));
15546 else
15547 DAG.ReplaceAllUsesOfValueWith(From: Inputs[i],
15548 To: DAG.getAnyExtOrTrunc(Op: InSrc, DL: dl, VT: N->getValueType(ResNo: 0)));
15549 }
15550
15551 // Replace all operations (these are all the same, but have a different
15552 // (promoted) return type). DAG.getNode will validate that the types of
15553 // a binary operator match, so go through the list in reverse so that
15554 // we've likely promoted both operands first.
15555 while (!PromOpHandles.empty()) {
15556 SDValue PromOp = PromOpHandles.back().getValue();
15557 PromOpHandles.pop_back();
15558
15559 unsigned C;
15560 switch (PromOp.getOpcode()) {
15561 default: C = 0; break;
15562 case ISD::SELECT: C = 1; break;
15563 case ISD::SELECT_CC: C = 2; break;
15564 }
15565
15566 if ((!isa<ConstantSDNode>(Val: PromOp.getOperand(i: C)) &&
15567 PromOp.getOperand(i: C).getValueType() != N->getValueType(ResNo: 0)) ||
15568 (!isa<ConstantSDNode>(Val: PromOp.getOperand(i: C+1)) &&
15569 PromOp.getOperand(i: C+1).getValueType() != N->getValueType(ResNo: 0))) {
15570 // The to-be-promoted operands of this node have not yet been
15571 // promoted (this should be rare because we're going through the
15572 // list backward, but if one of the operands has several users in
15573 // this cluster of to-be-promoted nodes, it is possible).
15574 PromOpHandles.emplace_front(args&: PromOp);
15575 continue;
15576 }
15577
15578 // For SELECT and SELECT_CC nodes, we do a similar check for any
15579 // to-be-promoted comparison inputs.
15580 if (PromOp.getOpcode() == ISD::SELECT ||
15581 PromOp.getOpcode() == ISD::SELECT_CC) {
15582 if ((SelectTruncOp[0].count(Val: PromOp.getNode()) &&
15583 PromOp.getOperand(i: 0).getValueType() != N->getValueType(ResNo: 0)) ||
15584 (SelectTruncOp[1].count(Val: PromOp.getNode()) &&
15585 PromOp.getOperand(i: 1).getValueType() != N->getValueType(ResNo: 0))) {
15586 PromOpHandles.emplace_front(args&: PromOp);
15587 continue;
15588 }
15589 }
15590
15591 SmallVector<SDValue, 3> Ops(PromOp.getNode()->ops());
15592
15593 // If this node has constant inputs, then they'll need to be promoted here.
15594 for (unsigned i = 0; i < 2; ++i) {
15595 if (!isa<ConstantSDNode>(Val: Ops[C+i]))
15596 continue;
15597 if (Ops[C+i].getValueType() == N->getValueType(ResNo: 0))
15598 continue;
15599
15600 if (N->getOpcode() == ISD::SIGN_EXTEND)
15601 Ops[C+i] = DAG.getSExtOrTrunc(Op: Ops[C+i], DL: dl, VT: N->getValueType(ResNo: 0));
15602 else if (N->getOpcode() == ISD::ZERO_EXTEND)
15603 Ops[C+i] = DAG.getZExtOrTrunc(Op: Ops[C+i], DL: dl, VT: N->getValueType(ResNo: 0));
15604 else
15605 Ops[C+i] = DAG.getAnyExtOrTrunc(Op: Ops[C+i], DL: dl, VT: N->getValueType(ResNo: 0));
15606 }
15607
15608 // If we've promoted the comparison inputs of a SELECT or SELECT_CC,
15609 // truncate them again to the original value type.
15610 if (PromOp.getOpcode() == ISD::SELECT ||
15611 PromOp.getOpcode() == ISD::SELECT_CC) {
15612 auto SI0 = SelectTruncOp[0].find(Val: PromOp.getNode());
15613 if (SI0 != SelectTruncOp[0].end())
15614 Ops[0] = DAG.getNode(Opcode: ISD::TRUNCATE, DL: dl, VT: SI0->second, Operand: Ops[0]);
15615 auto SI1 = SelectTruncOp[1].find(Val: PromOp.getNode());
15616 if (SI1 != SelectTruncOp[1].end())
15617 Ops[1] = DAG.getNode(Opcode: ISD::TRUNCATE, DL: dl, VT: SI1->second, Operand: Ops[1]);
15618 }
15619
15620 DAG.ReplaceAllUsesOfValueWith(From: PromOp,
15621 To: DAG.getNode(Opcode: PromOp.getOpcode(), DL: dl, VT: N->getValueType(ResNo: 0), Ops));
15622 }
15623
15624 // Now we're left with the initial extension itself.
15625 if (!ReallyNeedsExt)
15626 return N->getOperand(Num: 0);
15627
15628 // To zero extend, just mask off everything except for the first bit (in the
15629 // i1 case).
15630 if (N->getOpcode() == ISD::ZERO_EXTEND)
15631 return DAG.getNode(Opcode: ISD::AND, DL: dl, VT: N->getValueType(ResNo: 0), N1: N->getOperand(Num: 0),
15632 N2: DAG.getConstant(Val: APInt::getLowBitsSet(
15633 numBits: N->getValueSizeInBits(ResNo: 0), loBitsSet: PromBits),
15634 DL: dl, VT: N->getValueType(ResNo: 0)));
15635
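  // To sign extend, shift the promoted low bits up to the top of the register
  // and arithmetic-shift them back down; e.g. for an i1 value promoted into an
  // i32 result this is (sra (shl x, 31), 31).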
15636 assert(N->getOpcode() == ISD::SIGN_EXTEND &&
15637 "Invalid extension type");
15638 EVT ShiftAmountTy = getShiftAmountTy(LHSTy: N->getValueType(ResNo: 0), DL: DAG.getDataLayout());
15639 SDValue ShiftCst =
15640 DAG.getConstant(Val: N->getValueSizeInBits(ResNo: 0) - PromBits, DL: dl, VT: ShiftAmountTy);
15641 return DAG.getNode(
15642 Opcode: ISD::SRA, DL: dl, VT: N->getValueType(ResNo: 0),
15643 N1: DAG.getNode(Opcode: ISD::SHL, DL: dl, VT: N->getValueType(ResNo: 0), N1: N->getOperand(Num: 0), N2: ShiftCst),
15644 N2: ShiftCst);
15645}
15646
// Check whether both operands of an i128 compare can be converted to v16i8
// loads (or constants) for vcmpequb.
15648static bool canConvertToVcmpequb(SDValue &LHS, SDValue &RHS) {
15649
15650 auto isValidForConvert = [](SDValue &Operand) {
15651 if (!Operand.hasOneUse())
15652 return false;
15653
15654 if (Operand.getValueType() != MVT::i128)
15655 return false;
15656
15657 if (Operand.getOpcode() == ISD::Constant)
15658 return true;
15659
15660 auto *LoadNode = dyn_cast<LoadSDNode>(Val&: Operand);
15661 if (!LoadNode)
15662 return false;
15663
    // If the memory operation is volatile, do not perform any optimization
    // or transformation. Volatile operations must be preserved as written
    // to ensure correct program behavior, so reject the conversion.
15668
15669 if (LoadNode->isVolatile())
15670 return false;
15671
15672 // Only combine loads if both use the unindexed addressing mode.
15673 // PowerPC AltiVec/VMX does not support vector loads or stores with
15674 // pre/post-increment addressing. Indexed modes may imply implicit
15675 // pointer updates, which are not compatible with AltiVec vector
15676 // instructions.
15677 if (LoadNode->getAddressingMode() != ISD::UNINDEXED)
15678 return false;
15679
15680 // Only combine loads if both are non-extending loads
15681 // (ISD::NON_EXTLOAD). Extending loads (such as ISD::ZEXTLOAD or
15682 // ISD::SEXTLOAD) perform zero or sign extension, which may change the
15683 // loaded value's semantics and are not compatible with vector loads.
15684 if (LoadNode->getExtensionType() != ISD::NON_EXTLOAD)
15685 return false;
15686
15687 return true;
15688 };
15689
15690 return (isValidForConvert(LHS) && isValidForConvert(RHS));
15691}
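
// Illustrative sketch of what canConvertToVcmpequb accepts: each operand must
// be either an i128 constant or a plain i128 load such as
//   %v = load i128, ptr %p, align 1   ; unindexed, non-extending, non-volatile
// and must have a single use. Volatile, indexed, or extending loads are
// rejected.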
15692
15693SDValue convertTwoLoadsAndCmpToVCMPEQUB(SelectionDAG &DAG, SDNode *N,
15694 const SDLoc &DL) {
15695
15696 assert(N->getOpcode() == ISD::SETCC && "Should be called with a SETCC node");
15697
15698 ISD::CondCode CC = cast<CondCodeSDNode>(Val: N->getOperand(Num: 2))->get();
15699 assert((CC == ISD::SETNE || CC == ISD::SETEQ) &&
         "CC must be ISD::SETNE or ISD::SETEQ");
15701
15702 auto getV16i8Load = [&](const SDValue &Operand) {
15703 if (Operand.getOpcode() == ISD::Constant)
15704 return DAG.getBitcast(VT: MVT::v16i8, V: Operand);
15705
15706 assert(Operand.getOpcode() == ISD::LOAD && "Must be LoadSDNode here.");
15707
15708 auto *LoadNode = cast<LoadSDNode>(Val: Operand);
15709 return DAG.getLoad(VT: MVT::v16i8, dl: DL, Chain: LoadNode->getChain(),
15710 Ptr: LoadNode->getBasePtr(), MMO: LoadNode->getMemOperand());
15711 };
15712
  // The following code transforms the DAG
15714 // t0: ch,glue = EntryToken
15715 // t2: i64,ch = CopyFromReg t0, Register:i64 %0
15716 // t3: i128,ch = load<(load (s128) from %ir.a, align 1)> t0, t2,
15717 // undef:i64
15718 // t4: i64,ch = CopyFromReg t0, Register:i64 %1
15719 // t5: i128,ch =
15720 // load<(load (s128) from %ir.b, align 1)> t0, t4, undef:i64 t6: i1 =
15721 // setcc t3, t5, setne:ch
15722 //
15723 // ---->
15724 //
15725 // t0: ch,glue = EntryToken
15726 // t2: i64,ch = CopyFromReg t0, Register:i64 %0
15727 // t3: v16i8,ch = load<(load (s128) from %ir.a, align 1)> t0, t2,
15728 // undef:i64
15729 // t4: i64,ch = CopyFromReg t0, Register:i64 %1
15730 // t5: v16i8,ch =
15731 // load<(load (s128) from %ir.b, align 1)> t0, t4, undef:i64
15732 // t6: i32 =
15733 // llvm.ppc.altivec.vcmpequb.p TargetConstant:i32<10505>,
15734 // Constant:i32<2>, t3, t5
15735 // t7: i1 = setcc t6, Constant:i32<0>, seteq:ch
15736
15737 // Or transforms the DAG
15738 // t5: i128,ch = load<(load (s128) from %ir.X, align 1)> t0, t2, undef:i64
15739 // t8: i1 =
15740 // setcc Constant:i128<237684487579686500932345921536>, t5, setne:ch
15741 //
15742 // --->
15743 //
15744 // t5: v16i8,ch = load<(load (s128) from %ir.X, align 1)> t0, t2, undef:i64
15745 // t6: v16i8 = bitcast Constant:i128<237684487579686500932345921536>
15746 // t7: i32 =
  //       llvm.ppc.altivec.vcmpequb.p Constant:i32<10962>, Constant:i32<2>, t6, t5
15748
15749 SDValue LHSVec = getV16i8Load(N->getOperand(Num: 0));
15750 SDValue RHSVec = getV16i8Load(N->getOperand(Num: 1));
15751
15752 SDValue IntrID =
15753 DAG.getConstant(Val: Intrinsic::ppc_altivec_vcmpequb_p, DL, VT: MVT::i32);
15754 SDValue CRSel = DAG.getConstant(Val: 2, DL, VT: MVT::i32); // which CR6 predicate field
15755 SDValue PredResult = DAG.getNode(Opcode: ISD::INTRINSIC_WO_CHAIN, DL, VT: MVT::i32,
15756 N1: IntrID, N2: CRSel, N3: LHSVec, N4: RHSVec);
  // With the chosen CR6 predicate, ppc_altivec_vcmpequb_p returns 1 when the
  // two vectors are equal, so we invert the condition code.
15759 return DAG.getSetCC(DL, VT: N->getValueType(ResNo: 0), LHS: PredResult,
15760 RHS: DAG.getConstant(Val: 0, DL, VT: MVT::i32),
15761 Cond: CC == ISD::SETNE ? ISD::SETEQ : ISD::SETNE);
15762}
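
// Usage sketch for convertTwoLoadsAndCmpToVCMPEQUB: for IR along the lines of
//   %x = load i128, ptr %a, align 1
//   %y = load i128, ptr %b, align 1
//   %c = icmp eq i128 %x, %y
// the combine emits two v16i8 loads and a single vcmpequb.p predicate compare
// on CR6 instead of a multi-instruction scalar i128 comparison.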
15763
// Detect whether there is a pattern like (setcc (and X, 1), 0, eq).
// If the pattern is present, return true; otherwise return false.
15766static bool canConvertSETCCToXori(SDNode *N) {
15767 assert(N->getOpcode() == ISD::SETCC && "Should be SETCC SDNode here.");
15768
15769 ISD::CondCode CC = cast<CondCodeSDNode>(Val: N->getOperand(Num: 2))->get();
15770 if (CC != ISD::SETEQ)
15771 return false;
15772
15773 SDValue LHS = N->getOperand(Num: 0);
15774 SDValue RHS = N->getOperand(Num: 1);
15775
  // Check whether `V` is an `and` with the constant `1`.
15777 auto IsAndWithOne = [](SDValue &V) {
15778 if (V.getOpcode() == ISD::AND) {
15779 for (const SDValue &Op : V->ops())
15780 if (auto *C = dyn_cast<ConstantSDNode>(Val: Op))
15781 if (C->isOne())
15782 return true;
15783 }
15784 return false;
15785 };
15786
  // Check whether the SETCC operand is the constant zero.
15788 auto IsCompareWithZero = [](SDValue &V) {
15789 if (auto *C = dyn_cast<ConstantSDNode>(Val&: V))
15790 if (C->isZero())
15791 return true;
15792 return false;
15793 };
15794
15795 return (IsAndWithOne(LHS) && IsCompareWithZero(RHS)) ||
15796 (IsAndWithOne(RHS) && IsCompareWithZero(LHS));
15797}
15798
// The caller must verify that `N` can be converted to xori using
// canConvertSETCCToXori(N) before calling this function; otherwise it may
// produce incorrect results.
15802static SDValue ConvertSETCCToXori(SDNode *N, SelectionDAG &DAG) {
15803
15804 assert(N->getOpcode() == ISD::SETCC && "Should be SETCC SDNode here.");
15805 SDValue LHS = N->getOperand(Num: 0);
15806 SDValue RHS = N->getOperand(Num: 1);
15807 SDLoc DL(N);
15808
15809 [[maybe_unused]] ISD::CondCode CC =
15810 cast<CondCodeSDNode>(Val: N->getOperand(Num: 2))->get();
15811 assert((CC == ISD::SETEQ) && "CC must be ISD::SETEQ.");
15812 // Rewrite it as XORI (and X, 1), 1.
15813 auto MakeXor1 = [&](SDValue V) {
15814 EVT VT = V.getValueType();
15815 SDValue One = DAG.getConstant(Val: 1, DL, VT);
15816 SDValue Xor = DAG.getNode(Opcode: ISD::XOR, DL, VT, N1: V, N2: One);
15817 return DAG.getNode(Opcode: ISD::TRUNCATE, DL, VT: MVT::i1, Operand: Xor);
15818 };
15819
15820 if (LHS.getOpcode() == ISD::AND && RHS.getOpcode() != ISD::AND)
15821 return MakeXor1(LHS);
15822
15823 if (RHS.getOpcode() == ISD::AND && LHS.getOpcode() != ISD::AND)
15824 return MakeXor1(RHS);
15825
15826 llvm_unreachable("Should not reach here.");
15827}
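
// Illustrative sketch for ConvertSETCCToXori: given
//   t1: i32 = and tX, Constant:i32<1>
//   t2: i1  = setcc t1, Constant:i32<0>, seteq
// the rewrite produces
//   t3: i32 = xor t1, Constant:i32<1>
//   t4: i1  = truncate t3
// i.e. (X & 1) == 0 becomes the low bit of ((X & 1) ^ 1).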
15828
15829SDValue PPCTargetLowering::combineSetCC(SDNode *N,
15830 DAGCombinerInfo &DCI) const {
15831 assert(N->getOpcode() == ISD::SETCC &&
15832 "Should be called with a SETCC node");
15833
15834 // Check if the pattern (setcc (and X, 1), 0, eq) is present.
15835 // If it is, rewrite it as XORI (and X, 1), 1.
15836 if (canConvertSETCCToXori(N))
15837 return ConvertSETCCToXori(N, DAG&: DCI.DAG);
15838
15839 ISD::CondCode CC = cast<CondCodeSDNode>(Val: N->getOperand(Num: 2))->get();
15840 if (CC == ISD::SETNE || CC == ISD::SETEQ) {
15841 SDValue LHS = N->getOperand(Num: 0);
15842 SDValue RHS = N->getOperand(Num: 1);
15843
15844 // If there is a '0 - y' pattern, canonicalize the pattern to the RHS.
15845 if (LHS.getOpcode() == ISD::SUB && isNullConstant(V: LHS.getOperand(i: 0)) &&
15846 LHS.hasOneUse())
15847 std::swap(a&: LHS, b&: RHS);
15848
15849 // x == 0-y --> x+y == 0
15850 // x != 0-y --> x+y != 0
15851 if (RHS.getOpcode() == ISD::SUB && isNullConstant(V: RHS.getOperand(i: 0)) &&
15852 RHS.hasOneUse()) {
15853 SDLoc DL(N);
15854 SelectionDAG &DAG = DCI.DAG;
15855 EVT VT = N->getValueType(ResNo: 0);
15856 EVT OpVT = LHS.getValueType();
15857 SDValue Add = DAG.getNode(Opcode: ISD::ADD, DL, VT: OpVT, N1: LHS, N2: RHS.getOperand(i: 1));
15858 return DAG.getSetCC(DL, VT, LHS: Add, RHS: DAG.getConstant(Val: 0, DL, VT: OpVT), Cond: CC);
15859 }
15860
15861 // Optimization: Fold i128 equality/inequality compares of two loads into a
15862 // vectorized compare using vcmpequb.p when Altivec is available.
15863 //
    // Rationale:
    // A scalar i128 SETCC (eq/ne) normally lowers to multiple scalar ops.
    // On Altivec-capable subtargets, we can instead reinterpret the i128
    // loads as v16i8 vectors and use the Altivec vcmpequb.p intrinsic to
    // perform a full 128-bit equality check in a single vector compare.
15869 //
15870 // Example Result:
15871 // This transformation replaces memcmp(a, b, 16) with two vector loads
15872 // and one vector compare instruction.
15873
15874 if (Subtarget.hasAltivec() && canConvertToVcmpequb(LHS, RHS))
15875 return convertTwoLoadsAndCmpToVCMPEQUB(DAG&: DCI.DAG, N, DL: SDLoc(N));
15876 }
15877
15878 return DAGCombineTruncBoolExt(N, DCI);
15879}
15880
15881// Is this an extending load from an f32 to an f64?
15882static bool isFPExtLoad(SDValue Op) {
15883 if (LoadSDNode *LD = dyn_cast<LoadSDNode>(Val: Op.getNode()))
15884 return LD->getExtensionType() == ISD::EXTLOAD &&
15885 Op.getValueType() == MVT::f64;
15886 return false;
15887}
15888
/// Reduces the number of fp-to-int conversions when building a vector.
15890///
15891/// If this vector is built out of floating to integer conversions,
15892/// transform it to a vector built out of floating point values followed by a
15893/// single floating to integer conversion of the vector.
15894/// Namely (build_vector (fptosi $A), (fptosi $B), ...)
15895/// becomes (fptosi (build_vector ($A, $B, ...)))
15896SDValue PPCTargetLowering::
15897combineElementTruncationToVectorTruncation(SDNode *N,
15898 DAGCombinerInfo &DCI) const {
15899 assert(N->getOpcode() == ISD::BUILD_VECTOR &&
15900 "Should be called with a BUILD_VECTOR node");
15901
15902 SelectionDAG &DAG = DCI.DAG;
15903 SDLoc dl(N);
15904
15905 SDValue FirstInput = N->getOperand(Num: 0);
15906 assert(FirstInput.getOpcode() == PPCISD::MFVSR &&
15907 "The input operand must be an fp-to-int conversion.");
15908
15909 // This combine happens after legalization so the fp_to_[su]i nodes are
  // already converted to PPCISD nodes.
15911 unsigned FirstConversion = FirstInput.getOperand(i: 0).getOpcode();
15912 if (FirstConversion == PPCISD::FCTIDZ ||
15913 FirstConversion == PPCISD::FCTIDUZ ||
15914 FirstConversion == PPCISD::FCTIWZ ||
15915 FirstConversion == PPCISD::FCTIWUZ) {
15916 bool IsSplat = true;
15917 bool Is32Bit = FirstConversion == PPCISD::FCTIWZ ||
15918 FirstConversion == PPCISD::FCTIWUZ;
15919 EVT SrcVT = FirstInput.getOperand(i: 0).getValueType();
15920 SmallVector<SDValue, 4> Ops;
15921 EVT TargetVT = N->getValueType(ResNo: 0);
15922 for (int i = 0, e = N->getNumOperands(); i < e; ++i) {
15923 SDValue NextOp = N->getOperand(Num: i);
15924 if (NextOp.getOpcode() != PPCISD::MFVSR)
15925 return SDValue();
15926 unsigned NextConversion = NextOp.getOperand(i: 0).getOpcode();
15927 if (NextConversion != FirstConversion)
15928 return SDValue();
15929 // If we are converting to 32-bit integers, we need to add an FP_ROUND.
15930 // This is not valid if the input was originally double precision. It is
15931 // also not profitable to do unless this is an extending load in which
15932 // case doing this combine will allow us to combine consecutive loads.
15933 if (Is32Bit && !isFPExtLoad(Op: NextOp.getOperand(i: 0).getOperand(i: 0)))
15934 return SDValue();
15935 if (N->getOperand(Num: i) != FirstInput)
15936 IsSplat = false;
15937 }
15938
15939 // If this is a splat, we leave it as-is since there will be only a single
15940 // fp-to-int conversion followed by a splat of the integer. This is better
15941 // for 32-bit and smaller ints and neutral for 64-bit ints.
15942 if (IsSplat)
15943 return SDValue();
15944
15945 // Now that we know we have the right type of node, get its operands
15946 for (int i = 0, e = N->getNumOperands(); i < e; ++i) {
15947 SDValue In = N->getOperand(Num: i).getOperand(i: 0);
15948 if (Is32Bit) {
15949 // For 32-bit values, we need to add an FP_ROUND node (if we made it
15950 // here, we know that all inputs are extending loads so this is safe).
15951 if (In.isUndef())
15952 Ops.push_back(Elt: DAG.getUNDEF(VT: SrcVT));
15953 else {
15954 SDValue Trunc =
15955 DAG.getNode(Opcode: ISD::FP_ROUND, DL: dl, VT: MVT::f32, N1: In.getOperand(i: 0),
15956 N2: DAG.getIntPtrConstant(Val: 1, DL: dl, /*isTarget=*/true));
15957 Ops.push_back(Elt: Trunc);
15958 }
15959 } else
15960 Ops.push_back(Elt: In.isUndef() ? DAG.getUNDEF(VT: SrcVT) : In.getOperand(i: 0));
15961 }
15962
15963 unsigned Opcode;
15964 if (FirstConversion == PPCISD::FCTIDZ ||
15965 FirstConversion == PPCISD::FCTIWZ)
15966 Opcode = ISD::FP_TO_SINT;
15967 else
15968 Opcode = ISD::FP_TO_UINT;
15969
15970 EVT NewVT = TargetVT == MVT::v2i64 ? MVT::v2f64 : MVT::v4f32;
15971 SDValue BV = DAG.getBuildVector(VT: NewVT, DL: dl, Ops);
15972 return DAG.getNode(Opcode, DL: dl, VT: TargetVT, Operand: BV);
15973 }
15974 return SDValue();
15975}
15976
// The LXVKQ instruction loads a VSX vector with a special quadword value
// based on an immediate value. This helper method returns the details of the
// match as a tuple of {LXVKQ unsigned IMM value, right_shift_amount}
// to help generate the LXVKQ instruction and the subsequent shift instruction
// required to match the original build vector pattern.
15982
15983// LXVKQPattern: {LXVKQ unsigned IMM Value, right_shift_amount}
15984using LXVKQPattern = std::tuple<uint32_t, uint8_t>;
15985
15986static std::optional<LXVKQPattern> getPatternInfo(const APInt &FullVal) {
15987
15988 // LXVKQ instruction loads the Quadword value:
15989 // 0x8000_0000_0000_0000_0000_0000_0000_0000 when imm = 0b10000
15990 static const APInt BasePattern = APInt(128, 0x8000000000000000ULL) << 64;
15991 static const uint32_t Uim = 16;
15992
15993 // Check for direct LXVKQ match (no shift needed)
15994 if (FullVal == BasePattern)
15995 return std::make_tuple(args: Uim, args: uint8_t{0});
15996
15997 // Check if FullValue is 1 (the result of the base pattern >> 127)
15998 if (FullVal == APInt(128, 1))
15999 return std::make_tuple(args: Uim, args: uint8_t{127});
16000
16001 return std::nullopt;
16002}
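
// Illustrative values (sketch) for getPatternInfo:
//   0x8000_0000_0000_0000_0000_0000_0000_0000 -> {16, 0}   (direct LXVKQ)
//   0x0000_0000_0000_0000_0000_0000_0000_0001 -> {16, 127} (LXVKQ pattern >> 127)
// Any other 128-bit value yields std::nullopt.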
16003
/// Combine vector loads to a single load (using lxvkq) or a splat with shift
/// of a constant (xxspltib + vsrq) by recognising patterns in the build
/// vector. The LXVKQ instruction loads a VSX vector with a special quadword
/// value based on an immediate value. If UIM=0b10000 then LXVKQ loads
/// VSR[32×TX+T] with the value
/// 0x8000_0000_0000_0000_0000_0000_0000_0000.
/// This can be used to inline build vector constants that have the
/// following patterns:
///
/// 0x8000_0000_0000_0000_0000_0000_0000_0000 (MSB set pattern)
/// 0x0000_0000_0000_0000_0000_0000_0000_0001 (LSB set pattern)
/// The MSB pattern can be loaded directly using LXVKQ, while the LSB pattern
/// is loaded using a combination of splat and right shift instructions.
16016
16017SDValue PPCTargetLowering::combineBVLoadsSpecialValue(SDValue Op,
16018 SelectionDAG &DAG) const {
16019
16020 assert((Op.getNode() && Op.getOpcode() == ISD::BUILD_VECTOR) &&
16021 "Expected a BuildVectorSDNode in combineBVLoadsSpecialValue");
16022
  // This transformation is only supported for build vectors of byte,
  // halfword, word, or doubleword elements.
16025 EVT VT = Op.getValueType();
16026 if (!(VT == MVT::v8i16 || VT == MVT::v16i8 || VT == MVT::v4i32 ||
16027 VT == MVT::v2i64))
16028 return SDValue();
16029
16030 LLVM_DEBUG(llvm::dbgs() << "\ncombineBVLoadsSpecialValue: Build vector ("
16031 << VT.getEVTString() << "): ";
16032 Op->dump());
16033
16034 unsigned NumElems = VT.getVectorNumElements();
16035 unsigned ElemBits = VT.getScalarSizeInBits();
16036
16037 bool IsLittleEndian = DAG.getDataLayout().isLittleEndian();
16038
16039 // Check for Non-constant operand in the build vector.
16040 for (const SDValue &Operand : Op.getNode()->op_values()) {
16041 if (!isa<ConstantSDNode>(Val: Operand))
16042 return SDValue();
16043 }
16044
16045 // Assemble build vector operands as a 128-bit register value
16046 // We need to reconstruct what the 128-bit register pattern would be
16047 // that produces this vector when interpreted with the current endianness
16048 APInt FullVal = APInt::getZero(numBits: 128);
16049
16050 for (unsigned Index = 0; Index < NumElems; ++Index) {
16051 auto *C = cast<ConstantSDNode>(Val: Op.getOperand(i: Index));
16052
16053 // Get element value as raw bits (zero-extended)
16054 uint64_t ElemValue = C->getZExtValue();
16055
16056 // Mask to element size to ensure we only get the relevant bits
16057 if (ElemBits < 64)
16058 ElemValue &= ((1ULL << ElemBits) - 1);
16059
16060 // Calculate bit position for this element in the 128-bit register
16061 unsigned BitPos =
16062 (IsLittleEndian) ? (Index * ElemBits) : (128 - (Index + 1) * ElemBits);
16063
16064 // Create APInt for the element value and shift it to correct position
16065 APInt ElemAPInt(128, ElemValue);
16066 ElemAPInt <<= BitPos;
16067
16068 // Place the element value at the correct bit position
16069 FullVal |= ElemAPInt;
16070 }
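
  // For example (sketch): a v2i64 build vector {0x8000000000000000, 0} on a
  // big endian subtarget places element 0 at bits [127:64], so FullVal has
  // only its most significant bit set (the direct LXVKQ pattern). On little
  // endian, the same register value comes from {0, 0x8000000000000000}.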
16071
16072 if (FullVal.isZero() || FullVal.isAllOnes())
16073 return SDValue();
16074
16075 if (auto UIMOpt = getPatternInfo(FullVal)) {
16076 const auto &[Uim, ShiftAmount] = *UIMOpt;
16077 SDLoc Dl(Op);
16078
16079 // Generate LXVKQ instruction if the shift amount is zero.
16080 if (ShiftAmount == 0) {
16081 SDValue UimVal = DAG.getTargetConstant(Val: Uim, DL: Dl, VT: MVT::i32);
16082 SDValue LxvkqInstr =
16083 SDValue(DAG.getMachineNode(Opcode: PPC::LXVKQ, dl: Dl, VT, Op1: UimVal), 0);
16084 LLVM_DEBUG(llvm::dbgs()
16085 << "combineBVLoadsSpecialValue: Instruction Emitted ";
16086 LxvkqInstr.dump());
16087 return LxvkqInstr;
16088 }
16089
16090 assert(ShiftAmount == 127 && "Unexpected lxvkq shift amount value");
16091
    // The right-shifted pattern can be constructed using a combination of the
    // XXSPLTIB and VSRQ instructions. VSRQ uses the shift amount from the lower
16094 // 7 bits of byte 15. This can be specified using XXSPLTIB with immediate
16095 // value 255.
16096 SDValue ShiftAmountVec =
16097 SDValue(DAG.getMachineNode(Opcode: PPC::XXSPLTIB, dl: Dl, VT: MVT::v4i32,
16098 Op1: DAG.getTargetConstant(Val: 255, DL: Dl, VT: MVT::i32)),
16099 0);
16100 // Generate appropriate right shift instruction
16101 SDValue ShiftVec = SDValue(
16102 DAG.getMachineNode(Opcode: PPC::VSRQ, dl: Dl, VT, Op1: ShiftAmountVec, Op2: ShiftAmountVec),
16103 0);
16104 LLVM_DEBUG(llvm::dbgs()
16105 << "\n combineBVLoadsSpecialValue: Instruction Emitted ";
16106 ShiftVec.dump());
16107 return ShiftVec;
16108 }
16109 // No patterns matched for build vectors.
16110 return SDValue();
16111}
16112
16113/// Reduce the number of loads when building a vector.
16114///
16115/// Building a vector out of multiple loads can be converted to a load
16116/// of the vector type if the loads are consecutive. If the loads are
16117/// consecutive but in descending order, a shuffle is added at the end
16118/// to reorder the vector.
16119static SDValue combineBVOfConsecutiveLoads(SDNode *N, SelectionDAG &DAG) {
16120 assert(N->getOpcode() == ISD::BUILD_VECTOR &&
16121 "Should be called with a BUILD_VECTOR node");
16122
16123 SDLoc dl(N);
16124
  // Return early for non-byte-sized types, as they can't be consecutive.
16126 if (!N->getValueType(ResNo: 0).getVectorElementType().isByteSized())
16127 return SDValue();
16128
16129 bool InputsAreConsecutiveLoads = true;
16130 bool InputsAreReverseConsecutive = true;
16131 unsigned ElemSize = N->getValueType(ResNo: 0).getScalarType().getStoreSize();
16132 SDValue FirstInput = N->getOperand(Num: 0);
16133 bool IsRoundOfExtLoad = false;
16134 LoadSDNode *FirstLoad = nullptr;
16135
16136 if (FirstInput.getOpcode() == ISD::FP_ROUND &&
16137 FirstInput.getOperand(i: 0).getOpcode() == ISD::LOAD) {
16138 FirstLoad = cast<LoadSDNode>(Val: FirstInput.getOperand(i: 0));
16139 IsRoundOfExtLoad = FirstLoad->getExtensionType() == ISD::EXTLOAD;
16140 }
16141 // Not a build vector of (possibly fp_rounded) loads.
16142 if ((!IsRoundOfExtLoad && FirstInput.getOpcode() != ISD::LOAD) ||
16143 N->getNumOperands() == 1)
16144 return SDValue();
16145
16146 if (!IsRoundOfExtLoad)
16147 FirstLoad = cast<LoadSDNode>(Val&: FirstInput);
16148
16149 SmallVector<LoadSDNode *, 4> InputLoads;
16150 InputLoads.push_back(Elt: FirstLoad);
16151 for (int i = 1, e = N->getNumOperands(); i < e; ++i) {
16152 // If any inputs are fp_round(extload), they all must be.
16153 if (IsRoundOfExtLoad && N->getOperand(Num: i).getOpcode() != ISD::FP_ROUND)
16154 return SDValue();
16155
16156 SDValue NextInput = IsRoundOfExtLoad ? N->getOperand(Num: i).getOperand(i: 0) :
16157 N->getOperand(Num: i);
16158 if (NextInput.getOpcode() != ISD::LOAD)
16159 return SDValue();
16160
16161 SDValue PreviousInput =
16162 IsRoundOfExtLoad ? N->getOperand(Num: i-1).getOperand(i: 0) : N->getOperand(Num: i-1);
16163 LoadSDNode *LD1 = cast<LoadSDNode>(Val&: PreviousInput);
16164 LoadSDNode *LD2 = cast<LoadSDNode>(Val&: NextInput);
16165
16166 // If any inputs are fp_round(extload), they all must be.
16167 if (IsRoundOfExtLoad && LD2->getExtensionType() != ISD::EXTLOAD)
16168 return SDValue();
16169
16170 // We only care about regular loads. The PPC-specific load intrinsics
16171 // will not lead to a merge opportunity.
16172 if (!DAG.areNonVolatileConsecutiveLoads(LD: LD2, Base: LD1, Bytes: ElemSize, Dist: 1))
16173 InputsAreConsecutiveLoads = false;
16174 if (!DAG.areNonVolatileConsecutiveLoads(LD: LD1, Base: LD2, Bytes: ElemSize, Dist: 1))
16175 InputsAreReverseConsecutive = false;
16176
16177 // Exit early if the loads are neither consecutive nor reverse consecutive.
16178 if (!InputsAreConsecutiveLoads && !InputsAreReverseConsecutive)
16179 return SDValue();
16180 InputLoads.push_back(Elt: LD2);
16181 }
16182
16183 assert(!(InputsAreConsecutiveLoads && InputsAreReverseConsecutive) &&
16184 "The loads cannot be both consecutive and reverse consecutive.");
16185
16186 SDValue WideLoad;
16187 SDValue ReturnSDVal;
16188 if (InputsAreConsecutiveLoads) {
16189 assert(FirstLoad && "Input needs to be a LoadSDNode.");
16190 WideLoad = DAG.getLoad(VT: N->getValueType(ResNo: 0), dl, Chain: FirstLoad->getChain(),
16191 Ptr: FirstLoad->getBasePtr(), PtrInfo: FirstLoad->getPointerInfo(),
16192 Alignment: FirstLoad->getAlign());
16193 ReturnSDVal = WideLoad;
16194 } else if (InputsAreReverseConsecutive) {
16195 LoadSDNode *LastLoad = InputLoads.back();
16196 assert(LastLoad && "Input needs to be a LoadSDNode.");
16197 WideLoad = DAG.getLoad(VT: N->getValueType(ResNo: 0), dl, Chain: LastLoad->getChain(),
16198 Ptr: LastLoad->getBasePtr(), PtrInfo: LastLoad->getPointerInfo(),
16199 Alignment: LastLoad->getAlign());
16200 SmallVector<int, 16> Ops;
16201 for (int i = N->getNumOperands() - 1; i >= 0; i--)
16202 Ops.push_back(Elt: i);
16203
16204 ReturnSDVal = DAG.getVectorShuffle(VT: N->getValueType(ResNo: 0), dl, N1: WideLoad,
16205 N2: DAG.getUNDEF(VT: N->getValueType(ResNo: 0)), Mask: Ops);
16206 } else
16207 return SDValue();
16208
16209 for (auto *LD : InputLoads)
16210 DAG.makeEquivalentMemoryOrdering(OldLoad: LD, NewMemOp: WideLoad);
16211 return ReturnSDVal;
16212}
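
// Usage sketch for combineBVOfConsecutiveLoads: with i32 elements,
//   (build_vector (load p), (load p+4), (load p+8), (load p+12))
// becomes a single v4i32 load from p. If the loads are instead in descending
// address order, the wide load is taken from the last input's address and a
// reversing vector_shuffle is appended.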
16213
// This function adds the vector_shuffle needed to place the elements of the
// vector extract in the positions specified by the CorrectElems encoding.
16217static SDValue addShuffleForVecExtend(SDNode *N, SelectionDAG &DAG,
16218 SDValue Input, uint64_t Elems,
16219 uint64_t CorrectElems) {
16220 SDLoc dl(N);
16221
16222 unsigned NumElems = Input.getValueType().getVectorNumElements();
16223 SmallVector<int, 16> ShuffleMask(NumElems, -1);
16224
16225 // Knowing the element indices being extracted from the original
16226 // vector and the order in which they're being inserted, just put
16227 // them at element indices required for the instruction.
16228 for (unsigned i = 0; i < N->getNumOperands(); i++) {
16229 if (DAG.getDataLayout().isLittleEndian())
16230 ShuffleMask[CorrectElems & 0xF] = Elems & 0xF;
16231 else
16232 ShuffleMask[(CorrectElems & 0xF0) >> 4] = (Elems & 0xF0) >> 4;
16233 CorrectElems = CorrectElems >> 8;
16234 Elems = Elems >> 8;
16235 }
16236
16237 SDValue Shuffle =
16238 DAG.getVectorShuffle(VT: Input.getValueType(), dl, N1: Input,
16239 N2: DAG.getUNDEF(VT: Input.getValueType()), Mask: ShuffleMask);
16240
16241 EVT VT = N->getValueType(ResNo: 0);
16242 SDValue Conv = DAG.getBitcast(VT, V: Shuffle);
16243
16244 EVT ExtVT = EVT::getVectorVT(Context&: *DAG.getContext(),
16245 VT: Input.getValueType().getVectorElementType(),
16246 NumElements: VT.getVectorNumElements());
16247 return DAG.getNode(Opcode: ISD::SIGN_EXTEND_INREG, DL: dl, VT, N1: Conv,
16248 N2: DAG.getValueType(ExtVT));
16249}
16250
16251// Look for build vector patterns where input operands come from sign
16252// extended vector_extract elements of specific indices. If the correct indices
16253// aren't used, add a vector shuffle to fix up the indices and create
16254// SIGN_EXTEND_INREG node which selects the vector sign extend instructions
16255// during instruction selection.
16256static SDValue combineBVOfVecSExt(SDNode *N, SelectionDAG &DAG) {
16257 // This array encodes the indices that the vector sign extend instructions
16258 // extract from when extending from one type to another for both BE and LE.
  // The right nibble of each byte corresponds to the LE indices,
  // and the left nibble of each byte corresponds to the BE indices.
16261 // For example: 0x3074B8FC byte->word
16262 // For LE: the allowed indices are: 0x0,0x4,0x8,0xC
16263 // For BE: the allowed indices are: 0x3,0x7,0xB,0xF
16264 // For example: 0x000070F8 byte->double word
16265 // For LE: the allowed indices are: 0x0,0x8
16266 // For BE: the allowed indices are: 0x7,0xF
16267 uint64_t TargetElems[] = {
16268 0x3074B8FC, // b->w
16269 0x000070F8, // b->d
16270 0x10325476, // h->w
16271 0x00003074, // h->d
16272 0x00001032, // w->d
16273 };
16274
16275 uint64_t Elems = 0;
16276 int Index;
16277 SDValue Input;
16278
16279 auto isSExtOfVecExtract = [&](SDValue Op) -> bool {
16280 if (!Op)
16281 return false;
16282 if (Op.getOpcode() != ISD::SIGN_EXTEND &&
16283 Op.getOpcode() != ISD::SIGN_EXTEND_INREG)
16284 return false;
16285
16286 // A SIGN_EXTEND_INREG might be fed by an ANY_EXTEND to produce a value
16287 // of the right width.
16288 SDValue Extract = Op.getOperand(i: 0);
16289 if (Extract.getOpcode() == ISD::ANY_EXTEND)
16290 Extract = Extract.getOperand(i: 0);
16291 if (Extract.getOpcode() != ISD::EXTRACT_VECTOR_ELT)
16292 return false;
16293
16294 ConstantSDNode *ExtOp = dyn_cast<ConstantSDNode>(Val: Extract.getOperand(i: 1));
16295 if (!ExtOp)
16296 return false;
16297
16298 Index = ExtOp->getZExtValue();
16299 if (Input && Input != Extract.getOperand(i: 0))
16300 return false;
16301
16302 if (!Input)
16303 Input = Extract.getOperand(i: 0);
16304
16305 Elems = Elems << 8;
16306 Index = DAG.getDataLayout().isLittleEndian() ? Index : Index << 4;
16307 Elems |= Index;
16308
16309 return true;
16310 };
16311
  // If the build vector operands aren't sign-extended vector extracts
  // of the same input vector, then return.
16314 for (unsigned i = 0; i < N->getNumOperands(); i++) {
16315 if (!isSExtOfVecExtract(N->getOperand(Num: i))) {
16316 return SDValue();
16317 }
16318 }
16319
16320 // If the vector extract indices are not correct, add the appropriate
16321 // vector_shuffle.
16322 int TgtElemArrayIdx;
16323 int InputSize = Input.getValueType().getScalarSizeInBits();
16324 int OutputSize = N->getValueType(ResNo: 0).getScalarSizeInBits();
16325 if (InputSize + OutputSize == 40)
16326 TgtElemArrayIdx = 0;
16327 else if (InputSize + OutputSize == 72)
16328 TgtElemArrayIdx = 1;
16329 else if (InputSize + OutputSize == 48)
16330 TgtElemArrayIdx = 2;
16331 else if (InputSize + OutputSize == 80)
16332 TgtElemArrayIdx = 3;
16333 else if (InputSize + OutputSize == 96)
16334 TgtElemArrayIdx = 4;
16335 else
16336 return SDValue();
16337
16338 uint64_t CorrectElems = TargetElems[TgtElemArrayIdx];
16339 CorrectElems = DAG.getDataLayout().isLittleEndian()
16340 ? CorrectElems & 0x0F0F0F0F0F0F0F0F
16341 : CorrectElems & 0xF0F0F0F0F0F0F0F0;
16342 if (Elems != CorrectElems) {
16343 return addShuffleForVecExtend(N, DAG, Input, Elems, CorrectElems);
16344 }
16345
16346 // Regular lowering will catch cases where a shuffle is not needed.
16347 return SDValue();
16348}
16349
16350// Look for the pattern of a load from a narrow width to i128, feeding
16351// into a BUILD_VECTOR of v1i128. Replace this sequence with a PPCISD node
16352// (LXVRZX). This node represents a zero extending load that will be matched
16353// to the Load VSX Vector Rightmost instructions.
16354static SDValue combineBVZEXTLOAD(SDNode *N, SelectionDAG &DAG) {
16355 SDLoc DL(N);
16356
16357 // This combine is only eligible for a BUILD_VECTOR of v1i128.
16358 if (N->getValueType(ResNo: 0) != MVT::v1i128)
16359 return SDValue();
16360
16361 SDValue Operand = N->getOperand(Num: 0);
16362 // Proceed with the transformation if the operand to the BUILD_VECTOR
16363 // is a load instruction.
16364 if (Operand.getOpcode() != ISD::LOAD)
16365 return SDValue();
16366
16367 auto *LD = cast<LoadSDNode>(Val&: Operand);
16368 EVT MemoryType = LD->getMemoryVT();
16369
  // This transformation is only valid if we are loading either a byte,
  // halfword, word, or doubleword.
16372 bool ValidLDType = MemoryType == MVT::i8 || MemoryType == MVT::i16 ||
16373 MemoryType == MVT::i32 || MemoryType == MVT::i64;
16374
16375 // Ensure that the load from the narrow width is being zero extended to i128.
16376 if (!ValidLDType ||
16377 (LD->getExtensionType() != ISD::ZEXTLOAD &&
16378 LD->getExtensionType() != ISD::EXTLOAD))
16379 return SDValue();
16380
16381 SDValue LoadOps[] = {
16382 LD->getChain(), LD->getBasePtr(),
16383 DAG.getIntPtrConstant(Val: MemoryType.getScalarSizeInBits(), DL)};
16384
16385 return DAG.getMemIntrinsicNode(Opcode: PPCISD::LXVRZX, dl: DL,
16386 VTList: DAG.getVTList(VT1: MVT::v1i128, VT2: MVT::Other),
16387 Ops: LoadOps, MemVT: MemoryType, MMO: LD->getMemOperand());
16388}
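
// Usage sketch for combineBVZEXTLOAD: on ISA 3.1,
//   (v1i128 (build_vector (i128 (zextload i32, ptr p))))
// becomes (PPCISD::LXVRZX chain, p, 32), which is later matched to a Load VSX
// Vector Rightmost instruction.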
16389
16390SDValue PPCTargetLowering::DAGCombineBuildVector(SDNode *N,
16391 DAGCombinerInfo &DCI) const {
16392 assert(N->getOpcode() == ISD::BUILD_VECTOR &&
16393 "Should be called with a BUILD_VECTOR node");
16394
16395 SelectionDAG &DAG = DCI.DAG;
16396 SDLoc dl(N);
16397
16398 if (!Subtarget.hasVSX())
16399 return SDValue();
16400
16401 // The target independent DAG combiner will leave a build_vector of
16402 // float-to-int conversions intact. We can generate MUCH better code for
16403 // a float-to-int conversion of a vector of floats.
16404 SDValue FirstInput = N->getOperand(Num: 0);
16405 if (FirstInput.getOpcode() == PPCISD::MFVSR) {
16406 SDValue Reduced = combineElementTruncationToVectorTruncation(N, DCI);
16407 if (Reduced)
16408 return Reduced;
16409 }
16410
16411 // If we're building a vector out of consecutive loads, just load that
16412 // vector type.
16413 SDValue Reduced = combineBVOfConsecutiveLoads(N, DAG);
16414 if (Reduced)
16415 return Reduced;
16416
16417 // If we're building a vector out of extended elements from another vector
16418 // we have P9 vector integer extend instructions. The code assumes legal
16419 // input types (i.e. it can't handle things like v4i16) so do not run before
16420 // legalization.
16421 if (Subtarget.hasP9Altivec() && !DCI.isBeforeLegalize()) {
16422 Reduced = combineBVOfVecSExt(N, DAG);
16423 if (Reduced)
16424 return Reduced;
16425 }
16426
16427 // On Power10, the Load VSX Vector Rightmost instructions can be utilized
16428 // if this is a BUILD_VECTOR of v1i128, and if the operand to the BUILD_VECTOR
16429 // is a load from <valid narrow width> to i128.
16430 if (Subtarget.isISA3_1()) {
16431 SDValue BVOfZLoad = combineBVZEXTLOAD(N, DAG);
16432 if (BVOfZLoad)
16433 return BVOfZLoad;
16434 }
16435
16436 if (N->getValueType(ResNo: 0) != MVT::v2f64)
16437 return SDValue();
16438
16439 // Looking for:
16440 // (build_vector ([su]int_to_fp (extractelt 0)), [su]int_to_fp (extractelt 1))
16441 if (FirstInput.getOpcode() != ISD::SINT_TO_FP &&
16442 FirstInput.getOpcode() != ISD::UINT_TO_FP)
16443 return SDValue();
16444 if (N->getOperand(Num: 1).getOpcode() != ISD::SINT_TO_FP &&
16445 N->getOperand(Num: 1).getOpcode() != ISD::UINT_TO_FP)
16446 return SDValue();
16447 if (FirstInput.getOpcode() != N->getOperand(Num: 1).getOpcode())
16448 return SDValue();
16449
16450 SDValue Ext1 = FirstInput.getOperand(i: 0);
16451 SDValue Ext2 = N->getOperand(Num: 1).getOperand(i: 0);
16452 if(Ext1.getOpcode() != ISD::EXTRACT_VECTOR_ELT ||
16453 Ext2.getOpcode() != ISD::EXTRACT_VECTOR_ELT)
16454 return SDValue();
16455
16456 ConstantSDNode *Ext1Op = dyn_cast<ConstantSDNode>(Val: Ext1.getOperand(i: 1));
16457 ConstantSDNode *Ext2Op = dyn_cast<ConstantSDNode>(Val: Ext2.getOperand(i: 1));
16458 if (!Ext1Op || !Ext2Op)
16459 return SDValue();
16460 if (Ext1.getOperand(i: 0).getValueType() != MVT::v4i32 ||
16461 Ext1.getOperand(i: 0) != Ext2.getOperand(i: 0))
16462 return SDValue();
16463
16464 int FirstElem = Ext1Op->getZExtValue();
16465 int SecondElem = Ext2Op->getZExtValue();
16466 int SubvecIdx;
16467 if (FirstElem == 0 && SecondElem == 1)
16468 SubvecIdx = Subtarget.isLittleEndian() ? 1 : 0;
16469 else if (FirstElem == 2 && SecondElem == 3)
16470 SubvecIdx = Subtarget.isLittleEndian() ? 0 : 1;
16471 else
16472 return SDValue();
16473
16474 SDValue SrcVec = Ext1.getOperand(i: 0);
16475 auto NodeType = (N->getOperand(Num: 1).getOpcode() == ISD::SINT_TO_FP) ?
16476 PPCISD::SINT_VEC_TO_FP : PPCISD::UINT_VEC_TO_FP;
16477 return DAG.getNode(Opcode: NodeType, DL: dl, VT: MVT::v2f64,
16478 N1: SrcVec, N2: DAG.getIntPtrConstant(Val: SubvecIdx, DL: dl));
16479}
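
// Usage sketch for the v2f64 tail of DAGCombineBuildVector:
//   (v2f64 (build_vector (sint_to_fp (extractelt v4i32 %v, 0)),
//                        (sint_to_fp (extractelt v4i32 %v, 1))))
// becomes (PPCISD::SINT_VEC_TO_FP %v, SubvecIdx), where SubvecIdx selects the
// (endian-dependent) half of %v being converted.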
16480
16481SDValue PPCTargetLowering::combineFPToIntToFP(SDNode *N,
16482 DAGCombinerInfo &DCI) const {
16483 assert((N->getOpcode() == ISD::SINT_TO_FP ||
16484 N->getOpcode() == ISD::UINT_TO_FP) &&
16485 "Need an int -> FP conversion node here");
16486
16487 if (useSoftFloat() || !Subtarget.has64BitSupport())
16488 return SDValue();
16489
16490 SelectionDAG &DAG = DCI.DAG;
16491 SDLoc dl(N);
16492 SDValue Op(N, 0);
16493
  // Don't handle ppc_fp128 here, or conversions whose range the hardware
  // cannot handle.
16496 if (Op.getValueType() != MVT::f32 && Op.getValueType() != MVT::f64)
16497 return SDValue();
16498 if (!Op.getOperand(i: 0).getValueType().isSimple())
16499 return SDValue();
16500 if (Op.getOperand(i: 0).getValueType().getSimpleVT() <= MVT(MVT::i1) ||
16501 Op.getOperand(i: 0).getValueType().getSimpleVT() > MVT(MVT::i64))
16502 return SDValue();
16503
16504 SDValue FirstOperand(Op.getOperand(i: 0));
16505 bool SubWordLoad = FirstOperand.getOpcode() == ISD::LOAD &&
16506 (FirstOperand.getValueType() == MVT::i8 ||
16507 FirstOperand.getValueType() == MVT::i16);
16508 if (Subtarget.hasP9Vector() && Subtarget.hasP9Altivec() && SubWordLoad) {
16509 bool Signed = N->getOpcode() == ISD::SINT_TO_FP;
16510 bool DstDouble = Op.getValueType() == MVT::f64;
16511 unsigned ConvOp = Signed ?
16512 (DstDouble ? PPCISD::FCFID : PPCISD::FCFIDS) :
16513 (DstDouble ? PPCISD::FCFIDU : PPCISD::FCFIDUS);
16514 SDValue WidthConst =
16515 DAG.getIntPtrConstant(Val: FirstOperand.getValueType() == MVT::i8 ? 1 : 2,
16516 DL: dl, isTarget: false);
16517 LoadSDNode *LDN = cast<LoadSDNode>(Val: FirstOperand.getNode());
16518 SDValue Ops[] = { LDN->getChain(), LDN->getBasePtr(), WidthConst };
16519 SDValue Ld = DAG.getMemIntrinsicNode(Opcode: PPCISD::LXSIZX, dl,
16520 VTList: DAG.getVTList(VT1: MVT::f64, VT2: MVT::Other),
16521 Ops, MemVT: MVT::i8, MMO: LDN->getMemOperand());
16522 DAG.makeEquivalentMemoryOrdering(OldLoad: LDN, NewMemOp: Ld);
16523
16524 // For signed conversion, we need to sign-extend the value in the VSR
16525 if (Signed) {
16526 SDValue ExtOps[] = { Ld, WidthConst };
16527 SDValue Ext = DAG.getNode(Opcode: PPCISD::VEXTS, DL: dl, VT: MVT::f64, Ops: ExtOps);
16528 return DAG.getNode(Opcode: ConvOp, DL: dl, VT: DstDouble ? MVT::f64 : MVT::f32, Operand: Ext);
16529 } else
16530 return DAG.getNode(Opcode: ConvOp, DL: dl, VT: DstDouble ? MVT::f64 : MVT::f32, Operand: Ld);
16531 }
16532
16533
16534 // For i32 intermediate values, unfortunately, the conversion functions
  // leave the upper 32 bits of the value undefined. Within the set of
16536 // scalar instructions, we have no method for zero- or sign-extending the
16537 // value. Thus, we cannot handle i32 intermediate values here.
16538 if (Op.getOperand(i: 0).getValueType() == MVT::i32)
16539 return SDValue();
16540
16541 assert((Op.getOpcode() == ISD::SINT_TO_FP || Subtarget.hasFPCVT()) &&
16542 "UINT_TO_FP is supported only with FPCVT");
16543
16544 // If we have FCFIDS, then use it when converting to single-precision.
16545 // Otherwise, convert to double-precision and then round.
16546 unsigned FCFOp = (Subtarget.hasFPCVT() && Op.getValueType() == MVT::f32)
16547 ? (Op.getOpcode() == ISD::UINT_TO_FP ? PPCISD::FCFIDUS
16548 : PPCISD::FCFIDS)
16549 : (Op.getOpcode() == ISD::UINT_TO_FP ? PPCISD::FCFIDU
16550 : PPCISD::FCFID);
16551 MVT FCFTy = (Subtarget.hasFPCVT() && Op.getValueType() == MVT::f32)
16552 ? MVT::f32
16553 : MVT::f64;
16554
16555 // If we're converting from a float, to an int, and back to a float again,
16556 // then we don't need the store/load pair at all.
16557 if ((Op.getOperand(i: 0).getOpcode() == ISD::FP_TO_UINT &&
16558 Subtarget.hasFPCVT()) ||
16559 (Op.getOperand(i: 0).getOpcode() == ISD::FP_TO_SINT)) {
16560 SDValue Src = Op.getOperand(i: 0).getOperand(i: 0);
16561 if (Src.getValueType() == MVT::f32) {
16562 Src = DAG.getNode(Opcode: ISD::FP_EXTEND, DL: dl, VT: MVT::f64, Operand: Src);
16563 DCI.AddToWorklist(N: Src.getNode());
16564 } else if (Src.getValueType() != MVT::f64) {
16565 // Make sure that we don't pick up a ppc_fp128 source value.
16566 return SDValue();
16567 }
16568
16569 unsigned FCTOp =
16570 Op.getOperand(i: 0).getOpcode() == ISD::FP_TO_SINT ? PPCISD::FCTIDZ :
16571 PPCISD::FCTIDUZ;
16572
16573 SDValue Tmp = DAG.getNode(Opcode: FCTOp, DL: dl, VT: MVT::f64, Operand: Src);
16574 SDValue FP = DAG.getNode(Opcode: FCFOp, DL: dl, VT: FCFTy, Operand: Tmp);
16575
16576 if (Op.getValueType() == MVT::f32 && !Subtarget.hasFPCVT()) {
16577 FP = DAG.getNode(Opcode: ISD::FP_ROUND, DL: dl, VT: MVT::f32, N1: FP,
16578 N2: DAG.getIntPtrConstant(Val: 0, DL: dl, /*isTarget=*/true));
16579 DCI.AddToWorklist(N: FP.getNode());
16580 }
16581
16582 return FP;
16583 }
16584
16585 return SDValue();
16586}
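
// Usage sketch for combineFPToIntToFP: an fp -> int -> fp round trip such as
//   (f64 (sint_to_fp (i64 (fp_to_sint f64 %x))))
// becomes (FCFID (FCTIDZ %x)), avoiding the store/load sequence that would
// otherwise be used to move the integer value between register files.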
16587
16588// expandVSXLoadForLE - Convert VSX loads (which may be intrinsics for
16589// builtins) into loads with swaps.
16590SDValue PPCTargetLowering::expandVSXLoadForLE(SDNode *N,
16591 DAGCombinerInfo &DCI) const {
16592 // Delay VSX load for LE combine until after LegalizeOps to prioritize other
16593 // load combines.
16594 if (DCI.isBeforeLegalizeOps())
16595 return SDValue();
16596
16597 SelectionDAG &DAG = DCI.DAG;
16598 SDLoc dl(N);
16599 SDValue Chain;
16600 SDValue Base;
16601 MachineMemOperand *MMO;
16602
16603 switch (N->getOpcode()) {
16604 default:
16605 llvm_unreachable("Unexpected opcode for little endian VSX load");
16606 case ISD::LOAD: {
16607 LoadSDNode *LD = cast<LoadSDNode>(Val: N);
16608 Chain = LD->getChain();
16609 Base = LD->getBasePtr();
16610 MMO = LD->getMemOperand();
16611 // If the MMO suggests this isn't a load of a full vector, leave
16612 // things alone. For a built-in, we have to make the change for
16613 // correctness, so if there is a size problem that will be a bug.
16614 if (!MMO->getSize().hasValue() || MMO->getSize().getValue() < 16)
16615 return SDValue();
16616 break;
16617 }
16618 case ISD::INTRINSIC_W_CHAIN: {
16619 MemIntrinsicSDNode *Intrin = cast<MemIntrinsicSDNode>(Val: N);
16620 Chain = Intrin->getChain();
16621 // Similarly to the store case below, Intrin->getBasePtr() doesn't get
16622 // us what we want. Get operand 2 instead.
16623 Base = Intrin->getOperand(Num: 2);
16624 MMO = Intrin->getMemOperand();
16625 break;
16626 }
16627 }
16628
16629 MVT VecTy = N->getValueType(ResNo: 0).getSimpleVT();
16630
16631 SDValue LoadOps[] = { Chain, Base };
16632 SDValue Load = DAG.getMemIntrinsicNode(Opcode: PPCISD::LXVD2X, dl,
16633 VTList: DAG.getVTList(VT1: MVT::v2f64, VT2: MVT::Other),
16634 Ops: LoadOps, MemVT: MVT::v2f64, MMO);
16635
16636 DCI.AddToWorklist(N: Load.getNode());
16637 Chain = Load.getValue(R: 1);
16638 SDValue Swap = DAG.getNode(
16639 Opcode: PPCISD::XXSWAPD, DL: dl, VTList: DAG.getVTList(VT1: MVT::v2f64, VT2: MVT::Other), N1: Chain, N2: Load);
16640 DCI.AddToWorklist(N: Swap.getNode());
16641
16642 // Add a bitcast if the resulting load type doesn't match v2f64.
16643 if (VecTy != MVT::v2f64) {
16644 SDValue N = DAG.getNode(Opcode: ISD::BITCAST, DL: dl, VT: VecTy, Operand: Swap);
16645 DCI.AddToWorklist(N: N.getNode());
16646 // Package {bitcast value, swap's chain} to match Load's shape.
16647 return DAG.getNode(Opcode: ISD::MERGE_VALUES, DL: dl, VTList: DAG.getVTList(VT1: VecTy, VT2: MVT::Other),
16648 N1: N, N2: Swap.getValue(R: 1));
16649 }
16650
16651 return Swap;
16652}
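
// Usage sketch for expandVSXLoadForLE: on little endian,
//   (v4i32 (load p))
// becomes (bitcast (XXSWAPD (LXVD2X p))), with the chain of the new load
// packaged via MERGE_VALUES so the result matches the shape of the original
// load node.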
16653
16654// expandVSXStoreForLE - Convert VSX stores (which may be intrinsics for
16655// builtins) into stores with swaps.
16656SDValue PPCTargetLowering::expandVSXStoreForLE(SDNode *N,
16657 DAGCombinerInfo &DCI) const {
16658 // Delay VSX store for LE combine until after LegalizeOps to prioritize other
16659 // store combines.
16660 if (DCI.isBeforeLegalizeOps())
16661 return SDValue();
16662
16663 SelectionDAG &DAG = DCI.DAG;
16664 SDLoc dl(N);
16665 SDValue Chain;
16666 SDValue Base;
16667 unsigned SrcOpnd;
16668 MachineMemOperand *MMO;
16669
16670 switch (N->getOpcode()) {
16671 default:
16672 llvm_unreachable("Unexpected opcode for little endian VSX store");
16673 case ISD::STORE: {
16674 StoreSDNode *ST = cast<StoreSDNode>(Val: N);
16675 Chain = ST->getChain();
16676 Base = ST->getBasePtr();
16677 MMO = ST->getMemOperand();
16678 SrcOpnd = 1;
16679 // If the MMO suggests this isn't a store of a full vector, leave
16680 // things alone. For a built-in, we have to make the change for
16681 // correctness, so if there is a size problem that will be a bug.
16682 if (!MMO->getSize().hasValue() || MMO->getSize().getValue() < 16)
16683 return SDValue();
16684 break;
16685 }
16686 case ISD::INTRINSIC_VOID: {
16687 MemIntrinsicSDNode *Intrin = cast<MemIntrinsicSDNode>(Val: N);
16688 Chain = Intrin->getChain();
16689 // Intrin->getBasePtr() oddly does not get what we want.
16690 Base = Intrin->getOperand(Num: 3);
16691 MMO = Intrin->getMemOperand();
16692 SrcOpnd = 2;
16693 break;
16694 }
16695 }
16696
16697 SDValue Src = N->getOperand(Num: SrcOpnd);
16698 MVT VecTy = Src.getValueType().getSimpleVT();
16699
  // All stores are done as v2f64 with a possible bitcast.
16701 if (VecTy != MVT::v2f64) {
16702 Src = DAG.getNode(Opcode: ISD::BITCAST, DL: dl, VT: MVT::v2f64, Operand: Src);
16703 DCI.AddToWorklist(N: Src.getNode());
16704 }
16705
16706 SDValue Swap = DAG.getNode(Opcode: PPCISD::XXSWAPD, DL: dl,
16707 VTList: DAG.getVTList(VT1: MVT::v2f64, VT2: MVT::Other), N1: Chain, N2: Src);
16708 DCI.AddToWorklist(N: Swap.getNode());
16709 Chain = Swap.getValue(R: 1);
16710 SDValue StoreOps[] = { Chain, Swap, Base };
16711 SDValue Store = DAG.getMemIntrinsicNode(Opcode: PPCISD::STXVD2X, dl,
16712 VTList: DAG.getVTList(VT: MVT::Other),
16713 Ops: StoreOps, MemVT: VecTy, MMO);
16714 DCI.AddToWorklist(N: Store.getNode());
16715 return Store;
16716}
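
// Usage sketch for expandVSXStoreForLE: on little endian,
//   (store v4i32 %v, p)
// becomes (STXVD2X (XXSWAPD (bitcast %v to v2f64)), p).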
16717
16718// Handle DAG combine for STORE (FP_TO_INT F).
16719SDValue PPCTargetLowering::combineStoreFPToInt(SDNode *N,
16720 DAGCombinerInfo &DCI) const {
16721 SelectionDAG &DAG = DCI.DAG;
16722 SDLoc dl(N);
16723 unsigned Opcode = N->getOperand(Num: 1).getOpcode();
16724 (void)Opcode;
16725 bool Strict = N->getOperand(Num: 1)->isStrictFPOpcode();
16726
16727 assert((Opcode == ISD::FP_TO_SINT || Opcode == ISD::FP_TO_UINT ||
16728 Opcode == ISD::STRICT_FP_TO_SINT || Opcode == ISD::STRICT_FP_TO_UINT)
16729 && "Not a FP_TO_INT Instruction!");
16730
16731 SDValue Val = N->getOperand(Num: 1).getOperand(i: Strict ? 1 : 0);
16732 EVT Op1VT = N->getOperand(Num: 1).getValueType();
16733 EVT ResVT = Val.getValueType();
16734
16735 if (!Subtarget.hasVSX() || !Subtarget.hasFPCVT() || !isTypeLegal(VT: ResVT))
16736 return SDValue();
16737
16738 // Only perform combine for conversion to i64/i32 or power9 i16/i8.
16739 bool ValidTypeForStoreFltAsInt =
16740 (Op1VT == MVT::i32 || (Op1VT == MVT::i64 && Subtarget.isPPC64()) ||
16741 (Subtarget.hasP9Vector() && (Op1VT == MVT::i16 || Op1VT == MVT::i8)));
16742
16743 // TODO: Lower conversion from f128 on all VSX targets
16744 if (ResVT == MVT::ppcf128 || (ResVT == MVT::f128 && !Subtarget.hasP9Vector()))
16745 return SDValue();
16746
16747 if ((Op1VT != MVT::i64 && !Subtarget.hasP8Vector()) ||
16748 cast<StoreSDNode>(Val: N)->isTruncatingStore() || !ValidTypeForStoreFltAsInt)
16749 return SDValue();
16750
16751 Val = convertFPToInt(Op: N->getOperand(Num: 1), DAG, Subtarget);
16752
16753 // Set number of bytes being converted.
16754 unsigned ByteSize = Op1VT.getScalarSizeInBits() / 8;
16755 SDValue Ops[] = {N->getOperand(Num: 0), Val, N->getOperand(Num: 2),
16756 DAG.getIntPtrConstant(Val: ByteSize, DL: dl, isTarget: false),
16757 DAG.getValueType(Op1VT)};
16758
16759 Val = DAG.getMemIntrinsicNode(Opcode: PPCISD::ST_VSR_SCAL_INT, dl,
16760 VTList: DAG.getVTList(VT: MVT::Other), Ops,
16761 MemVT: cast<StoreSDNode>(Val: N)->getMemoryVT(),
16762 MMO: cast<StoreSDNode>(Val: N)->getMemOperand());
16763
16764 return Val;
16765}
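
// Usage sketch for combineStoreFPToInt: on subtargets where the checks above
// pass, a pattern such as
//   (store (i32 (fp_to_sint f64 %x)), p)
// becomes a ST_VSR_SCAL_INT memory intrinsic that stores the converted value
// directly from a VSR, avoiding a move to a GPR before the store.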
16766
16767static bool isAlternatingShuffMask(const ArrayRef<int> &Mask, int NumElts) {
16768 // Check that the source of the element keeps flipping
  // (i.e. Mask[i] < NumElts -> Mask[i+1] >= NumElts).
16770 bool PrevElemFromFirstVec = Mask[0] < NumElts;
16771 for (int i = 1, e = Mask.size(); i < e; i++) {
16772 if (PrevElemFromFirstVec && Mask[i] < NumElts)
16773 return false;
16774 if (!PrevElemFromFirstVec && Mask[i] >= NumElts)
16775 return false;
16776 PrevElemFromFirstVec = !PrevElemFromFirstVec;
16777 }
16778 return true;
16779}
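
// For example (sketch), with NumElts == 4 the mask <4, 1, 5, 3> alternates
// between the two source vectors and is accepted, whereas <4, 5, 1, 3> is not.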
16780
16781static bool isSplatBV(SDValue Op) {
16782 if (Op.getOpcode() != ISD::BUILD_VECTOR)
16783 return false;
16784 SDValue FirstOp;
16785
16786 // Find first non-undef input.
16787 for (int i = 0, e = Op.getNumOperands(); i < e; i++) {
16788 FirstOp = Op.getOperand(i);
16789 if (!FirstOp.isUndef())
16790 break;
16791 }
16792
16793 // All inputs are undef or the same as the first non-undef input.
16794 for (int i = 1, e = Op.getNumOperands(); i < e; i++)
16795 if (Op.getOperand(i) != FirstOp && !Op.getOperand(i).isUndef())
16796 return false;
16797 return true;
16798}
16799
16800static SDValue isScalarToVec(SDValue Op) {
16801 if (Op.getOpcode() == ISD::SCALAR_TO_VECTOR)
16802 return Op;
16803 if (Op.getOpcode() != ISD::BITCAST)
16804 return SDValue();
16805 Op = Op.getOperand(i: 0);
16806 if (Op.getOpcode() == ISD::SCALAR_TO_VECTOR)
16807 return Op;
16808 return SDValue();
16809}
16810
16811// Fix up the shuffle mask to account for the fact that the result of
16812// scalar_to_vector is not in lane zero. This just takes all values in
16813// the ranges specified by the min/max indices and adds the number of
16814// elements required to ensure each element comes from the respective
16815// position in the valid lane.
16816// On little endian, that's just the corresponding element in the other
16817// half of the vector. On big endian, it is in the same half but right
16818// justified rather than left justified in that half.
16819static void fixupShuffleMaskForPermutedSToV(
16820 SmallVectorImpl<int> &ShuffV, int LHSFirstElt, int LHSLastElt,
16821 int RHSFirstElt, int RHSLastElt, int HalfVec, unsigned LHSNumValidElts,
16822 unsigned RHSNumValidElts, const PPCSubtarget &Subtarget) {
16823 int LHSEltFixup =
16824 Subtarget.isLittleEndian() ? HalfVec : HalfVec - LHSNumValidElts;
16825 int RHSEltFixup =
16826 Subtarget.isLittleEndian() ? HalfVec : HalfVec - RHSNumValidElts;
16827 for (int I = 0, E = ShuffV.size(); I < E; ++I) {
16828 int Idx = ShuffV[I];
16829 if (Idx >= LHSFirstElt && Idx <= LHSLastElt)
16830 ShuffV[I] += LHSEltFixup;
16831 else if (Idx >= RHSFirstElt && Idx <= RHSLastElt)
16832 ShuffV[I] += RHSEltFixup;
16833 }
16834}
16835
16836// Replace a SCALAR_TO_VECTOR with a SCALAR_TO_VECTOR_PERMUTED except if
16837// the original is:
16838// (<n x Ty> (scalar_to_vector (Ty (extract_elt <n x Ty> %a, C))))
16839// In such a case, just change the shuffle mask to extract the element
16840// from the permuted index.
16841static SDValue getSToVPermuted(SDValue OrigSToV, SelectionDAG &DAG,
16842 const PPCSubtarget &Subtarget) {
16843 SDLoc dl(OrigSToV);
16844 EVT VT = OrigSToV.getValueType();
16845 assert(OrigSToV.getOpcode() == ISD::SCALAR_TO_VECTOR &&
16846 "Expecting a SCALAR_TO_VECTOR here");
16847 SDValue Input = OrigSToV.getOperand(i: 0);
16848
16849 if (Input.getOpcode() == ISD::EXTRACT_VECTOR_ELT) {
16850 ConstantSDNode *Idx = dyn_cast<ConstantSDNode>(Val: Input.getOperand(i: 1));
16851 SDValue OrigVector = Input.getOperand(i: 0);
16852
16853 // Can't handle non-const element indices or different vector types
16854 // for the input to the extract and the output of the scalar_to_vector.
16855 if (Idx && VT == OrigVector.getValueType()) {
16856 unsigned NumElts = VT.getVectorNumElements();
16857 assert(
16858 NumElts > 1 &&
16859 "Cannot produce a permuted scalar_to_vector for one element vector");
16860 SmallVector<int, 16> NewMask(NumElts, -1);
16861 unsigned ResultInElt = NumElts / 2;
16862 ResultInElt -= Subtarget.isLittleEndian() ? 0 : 1;
16863 NewMask[ResultInElt] = Idx->getZExtValue();
16864 return DAG.getVectorShuffle(VT, dl, N1: OrigVector, N2: OrigVector, Mask: NewMask);
16865 }
16866 }
16867 return DAG.getNode(Opcode: PPCISD::SCALAR_TO_VECTOR_PERMUTED, DL: dl, VT,
16868 Operand: OrigSToV.getOperand(i: 0));
16869}
16870
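// Reject shuffle masks that reference elements beyond the last defined
// element of either (possibly permuted) scalar_to_vector input; explicitly
// undefined indices (< 0) are ignored.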
16871static bool isShuffleMaskInRange(const SmallVectorImpl<int> &ShuffV,
16872 int HalfVec, int LHSLastElementDefined,
16873 int RHSLastElementDefined) {
16874 for (int Index : ShuffV) {
16875 if (Index < 0) // Skip explicitly undefined mask indices.
16876 continue;
16877 // Handle first input vector of the vector_shuffle.
16878 if ((LHSLastElementDefined >= 0) && (Index < HalfVec) &&
16879 (Index > LHSLastElementDefined))
16880 return false;
16881 // Handle second input vector of the vector_shuffle.
16882 if ((RHSLastElementDefined >= 0) &&
16883 (Index > HalfVec + RHSLastElementDefined))
16884 return false;
16885 }
16886 return true;
16887}
16888
16889static SDValue generateSToVPermutedForVecShuffle(
16890 int ScalarSize, uint64_t ShuffleEltWidth, unsigned &NumValidElts,
16891 int FirstElt, int &LastElt, SDValue VecShuffOperand, SDValue SToVNode,
16892 SelectionDAG &DAG, const PPCSubtarget &Subtarget) {
16893 EVT VecShuffOperandType = VecShuffOperand.getValueType();
16894 // Set up the values for the shuffle vector fixup.
16895 NumValidElts = ScalarSize / VecShuffOperandType.getScalarSizeInBits();
16896 // The last element depends on if the input comes from the LHS or RHS.
16897 //
16898 // For example:
16899 // (shuff (s_to_v i32), (bitcast (s_to_v i64), v4i32), ...)
16900 //
16901 // For the LHS: The last element that comes from the LHS is actually 0, not 3
16902 // because elements 1 and higher of a scalar_to_vector are undefined.
16903 // For the RHS: The last element that comes from the RHS is actually 5, not 7
16904 // because elements 1 and higher of a scalar_to_vector are undefined.
16905 // It is also not 4 because the original scalar_to_vector is wider and
16906 // actually contains two i32 elements.
16907 LastElt = (uint64_t)ScalarSize > ShuffleEltWidth
16908 ? ScalarSize / ShuffleEltWidth - 1 + FirstElt
16909 : FirstElt;
16910 SDValue SToVPermuted = getSToVPermuted(OrigSToV: SToVNode, DAG, Subtarget);
16911 if (SToVPermuted.getValueType() != VecShuffOperandType)
16912 SToVPermuted = DAG.getBitcast(VT: VecShuffOperandType, V: SToVPermuted);
16913 return SToVPermuted;
16914}
16915
16916// On little endian subtargets, combine shuffles such as:
16917// vector_shuffle<16,1,17,3,18,5,19,7,20,9,21,11,22,13,23,15>, <zero>, %b
16918// into:
16919// vector_shuffle<16,0,17,1,18,2,19,3,20,4,21,5,22,6,23,7>, <zero>, %b
16920// because the latter can be matched to a single instruction merge.
16921// Furthermore, SCALAR_TO_VECTOR on little endian always involves a permute
16922// to put the value into element zero. Adjust the shuffle mask so that the
16923// vector can remain in permuted form (to prevent a swap prior to a shuffle).
16924// On big endian targets, this is still useful for SCALAR_TO_VECTOR
16925// nodes with elements smaller than doubleword because all the ways
16926// of getting scalar data into a vector register put the value in the
16927// rightmost element of the left half of the vector.
16928SDValue PPCTargetLowering::combineVectorShuffle(ShuffleVectorSDNode *SVN,
16929 SelectionDAG &DAG) const {
16930 SDValue LHS = SVN->getOperand(Num: 0);
16931 SDValue RHS = SVN->getOperand(Num: 1);
16932 auto Mask = SVN->getMask();
16933 int NumElts = LHS.getValueType().getVectorNumElements();
16934 SDValue Res(SVN, 0);
16935 SDLoc dl(SVN);
16936 bool IsLittleEndian = Subtarget.isLittleEndian();
16937
16938 // On big endian targets this is only useful for subtargets with direct moves.
16939 // On little endian targets it would be useful for all subtargets with VSX.
16940 // However adding special handling for LE subtargets without direct moves
16941 // would be wasted effort since the minimum arch for LE is ISA 2.07 (Power8)
16942 // which includes direct moves.
16943 if (!Subtarget.hasDirectMove())
16944 return Res;
16945
16946 // If this is not a shuffle of a shuffle and the first element comes from
16947 // the second vector, canonicalize to the commuted form. This will make it
16948 // more likely to match one of the single instruction patterns.
16949 if (Mask[0] >= NumElts && LHS.getOpcode() != ISD::VECTOR_SHUFFLE &&
16950 RHS.getOpcode() != ISD::VECTOR_SHUFFLE) {
16951 std::swap(a&: LHS, b&: RHS);
16952 Res = DAG.getCommutedVectorShuffle(SV: *SVN);
16953
16954 if (!isa<ShuffleVectorSDNode>(Val: Res))
16955 return Res;
16956
16957 Mask = cast<ShuffleVectorSDNode>(Val&: Res)->getMask();
16958 }
16959
16960 // Adjust the shuffle mask if either input vector comes from a
16961 // SCALAR_TO_VECTOR and keep the respective input vector in permuted
16962 // form (to prevent the need for a swap).
16963 SmallVector<int, 16> ShuffV(Mask);
16964 SDValue SToVLHS = isScalarToVec(Op: LHS);
16965 SDValue SToVRHS = isScalarToVec(Op: RHS);
16966 if (SToVLHS || SToVRHS) {
16967 EVT VT = SVN->getValueType(ResNo: 0);
16968 uint64_t ShuffleEltWidth = VT.getVectorElementType().getSizeInBits();
16969 int ShuffleNumElts = ShuffV.size();
16970 int HalfVec = ShuffleNumElts / 2;
16971 // The width of the "valid lane" (i.e. the lane that contains the value that
16972 // is vectorized) needs to be expressed in terms of the number of elements
16973 // of the shuffle. It is thereby the ratio of the values before and after
16974 // any bitcast, which will be set later on if the LHS or RHS are
16975 // SCALAR_TO_VECTOR nodes.
16976 unsigned LHSNumValidElts = HalfVec;
16977 unsigned RHSNumValidElts = HalfVec;
16978
    // Initially assume that neither input is permuted. These will be adjusted
    // accordingly if either input is. Note that -1 means that all elements
    // are undefined.
16982 int LHSFirstElt = 0;
16983 int RHSFirstElt = ShuffleNumElts;
16984 int LHSLastElt = -1;
16985 int RHSLastElt = -1;
16986
16987 // Get the permuted scalar to vector nodes for the source(s) that come from
16988 // ISD::SCALAR_TO_VECTOR.
16989 // On big endian systems, this only makes sense for element sizes smaller
16990 // than 64 bits since for 64-bit elements, all instructions already put
16991 // the value into element zero. Since scalar size of LHS and RHS may differ
16992 // after isScalarToVec, this should be checked using their own sizes.
16993 int LHSScalarSize = 0;
16994 int RHSScalarSize = 0;
16995 if (SToVLHS) {
16996 LHSScalarSize = SToVLHS.getValueType().getScalarSizeInBits();
16997 if (!IsLittleEndian && LHSScalarSize >= 64)
16998 return Res;
16999 }
17000 if (SToVRHS) {
17001 RHSScalarSize = SToVRHS.getValueType().getScalarSizeInBits();
17002 if (!IsLittleEndian && RHSScalarSize >= 64)
17003 return Res;
17004 }
17005 if (LHSScalarSize != 0)
17006 LHS = generateSToVPermutedForVecShuffle(
17007 ScalarSize: LHSScalarSize, ShuffleEltWidth, NumValidElts&: LHSNumValidElts, FirstElt: LHSFirstElt,
17008 LastElt&: LHSLastElt, VecShuffOperand: LHS, SToVNode: SToVLHS, DAG, Subtarget);
17009 if (RHSScalarSize != 0)
17010 RHS = generateSToVPermutedForVecShuffle(
17011 ScalarSize: RHSScalarSize, ShuffleEltWidth, NumValidElts&: RHSNumValidElts, FirstElt: RHSFirstElt,
17012 LastElt&: RHSLastElt, VecShuffOperand: RHS, SToVNode: SToVRHS, DAG, Subtarget);
17013
17014 if (!isShuffleMaskInRange(ShuffV, HalfVec, LHSLastElementDefined: LHSLastElt, RHSLastElementDefined: RHSLastElt))
17015 return Res;
17016
17017 // Fix up the shuffle mask to reflect where the desired element actually is.
17018 // The minimum and maximum indices that correspond to element zero for both
17019 // the LHS and RHS are computed and will control which shuffle mask entries
17020 // are to be changed. For example, if the RHS is permuted, any shuffle mask
17021 // entries in the range [RHSFirstElt,RHSLastElt] will be adjusted.
17022 fixupShuffleMaskForPermutedSToV(
17023 ShuffV, LHSFirstElt, LHSLastElt, RHSFirstElt, RHSLastElt, HalfVec,
17024 LHSNumValidElts, RHSNumValidElts, Subtarget);
17025 Res = DAG.getVectorShuffle(VT: SVN->getValueType(ResNo: 0), dl, N1: LHS, N2: RHS, Mask: ShuffV);
17026
17027 // We may have simplified away the shuffle. We won't be able to do anything
17028 // further with it here.
17029 if (!isa<ShuffleVectorSDNode>(Val: Res))
17030 return Res;
17031 Mask = cast<ShuffleVectorSDNode>(Val&: Res)->getMask();
17032 }
17033
17034 SDValue TheSplat = IsLittleEndian ? RHS : LHS;
17035 // The common case after we commuted the shuffle is that the RHS is a splat
17036 // and we have elements coming in from the splat at indices that are not
17037 // conducive to using a merge.
17038 // Example:
17039 // vector_shuffle<0,17,1,19,2,21,3,23,4,25,5,27,6,29,7,31> t1, <zero>
17040 if (!isSplatBV(Op: TheSplat))
17041 return Res;
17042
17043 // We are looking for a mask such that all even elements are from
17044 // one vector and all odd elements from the other.
17045 if (!isAlternatingShuffMask(Mask, NumElts))
17046 return Res;
17047
17048 // Adjust the mask so we are pulling in the same index from the splat
17049 // as the index from the interesting vector in consecutive elements.
17050 if (IsLittleEndian) {
17051 // Example (even elements from first vector):
17052 // vector_shuffle<0,16,1,17,2,18,3,19,4,20,5,21,6,22,7,23> t1, <zero>
17053 if (Mask[0] < NumElts)
17054 for (int i = 1, e = Mask.size(); i < e; i += 2) {
17055 if (ShuffV[i] < 0)
17056 continue;
17057 // If element from non-splat is undef, pick first element from splat.
17058 ShuffV[i] = (ShuffV[i - 1] >= 0 ? ShuffV[i - 1] : 0) + NumElts;
17059 }
17060 // Example (odd elements from first vector):
17061 // vector_shuffle<16,0,17,1,18,2,19,3,20,4,21,5,22,6,23,7> t1, <zero>
17062 else
17063 for (int i = 0, e = Mask.size(); i < e; i += 2) {
17064 if (ShuffV[i] < 0)
17065 continue;
17066 // If element from non-splat is undef, pick first element from splat.
17067 ShuffV[i] = (ShuffV[i + 1] >= 0 ? ShuffV[i + 1] : 0) + NumElts;
17068 }
17069 } else {
17070 // Example (even elements from first vector):
17071 // vector_shuffle<0,16,1,17,2,18,3,19,4,20,5,21,6,22,7,23> <zero>, t1
17072 if (Mask[0] < NumElts)
17073 for (int i = 0, e = Mask.size(); i < e; i += 2) {
17074 if (ShuffV[i] < 0)
17075 continue;
17076 // If element from non-splat is undef, pick first element from splat.
17077 ShuffV[i] = ShuffV[i + 1] >= 0 ? ShuffV[i + 1] - NumElts : 0;
17078 }
17079 // Example (odd elements from first vector):
17080 // vector_shuffle<16,0,17,1,18,2,19,3,20,4,21,5,22,6,23,7> <zero>, t1
17081 else
17082 for (int i = 1, e = Mask.size(); i < e; i += 2) {
17083 if (ShuffV[i] < 0)
17084 continue;
17085 // If element from non-splat is undef, pick first element from splat.
17086 ShuffV[i] = ShuffV[i - 1] >= 0 ? ShuffV[i - 1] - NumElts : 0;
17087 }
17088 }
17089
17090 // If the RHS has undefs, we need to remove them since we may have created
17091 // a shuffle that adds those instead of the splat value.
17092 SDValue SplatVal =
17093 cast<BuildVectorSDNode>(Val: TheSplat.getNode())->getSplatValue();
17094 TheSplat = DAG.getSplatBuildVector(VT: TheSplat.getValueType(), DL: dl, Op: SplatVal);
17095
17096 if (IsLittleEndian)
17097 RHS = TheSplat;
17098 else
17099 LHS = TheSplat;
17100 return DAG.getVectorShuffle(VT: SVN->getValueType(ResNo: 0), dl, N1: LHS, N2: RHS, Mask: ShuffV);
17101}
17102
17103SDValue PPCTargetLowering::combineVReverseMemOP(ShuffleVectorSDNode *SVN,
17104 LSBaseSDNode *LSBase,
17105 DAGCombinerInfo &DCI) const {
17106 assert((ISD::isNormalLoad(LSBase) || ISD::isNormalStore(LSBase)) &&
17107 "Not a reverse memop pattern!");
17108
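  // A reverse memop shuffle simply reverses the element order, e.g. for a
  // v4i32 value the mask is <3, 2, 1, 0>.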
17109 auto IsElementReverse = [](const ShuffleVectorSDNode *SVN) -> bool {
17110 auto Mask = SVN->getMask();
17111 int i = 0;
17112 auto I = Mask.rbegin();
17113 auto E = Mask.rend();
17114
17115 for (; I != E; ++I) {
17116 if (*I != i)
17117 return false;
17118 i++;
17119 }
17120 return true;
17121 };
17122
17123 SelectionDAG &DAG = DCI.DAG;
17124 EVT VT = SVN->getValueType(ResNo: 0);
17125
17126 if (!isTypeLegal(VT) || !Subtarget.isLittleEndian() || !Subtarget.hasVSX())
17127 return SDValue();
17128
  // Before P9, the PPCVSXSwapRemoval pass rewrites the element order (see the
  // comment in PPCVSXSwapRemoval.cpp). This combine conflicts with that
  // optimization, so skip it on pre-P9 subtargets.
17132 if (!Subtarget.hasP9Vector())
17133 return SDValue();
17134
  if (!IsElementReverse(SVN))
17136 return SDValue();
17137
17138 if (LSBase->getOpcode() == ISD::LOAD) {
    // If result 0 of the load has any user other than the shufflevector
    // instruction, it is not profitable to replace the shufflevector with a
    // reverse load.
17142 for (SDUse &Use : LSBase->uses())
17143 if (Use.getResNo() == 0 &&
17144 Use.getUser()->getOpcode() != ISD::VECTOR_SHUFFLE)
17145 return SDValue();
17146
17147 SDLoc dl(LSBase);
17148 SDValue LoadOps[] = {LSBase->getChain(), LSBase->getBasePtr()};
17149 return DAG.getMemIntrinsicNode(
17150 Opcode: PPCISD::LOAD_VEC_BE, dl, VTList: DAG.getVTList(VT1: VT, VT2: MVT::Other), Ops: LoadOps,
17151 MemVT: LSBase->getMemoryVT(), MMO: LSBase->getMemOperand());
17152 }
17153
17154 if (LSBase->getOpcode() == ISD::STORE) {
17155 // If there are other uses of the shuffle, the swap cannot be avoided.
17156 // Forcing the use of an X-Form (since swapped stores only have
17157 // X-Forms) without removing the swap is unprofitable.
17158 if (!SVN->hasOneUse())
17159 return SDValue();
17160
17161 SDLoc dl(LSBase);
17162 SDValue StoreOps[] = {LSBase->getChain(), SVN->getOperand(Num: 0),
17163 LSBase->getBasePtr()};
17164 return DAG.getMemIntrinsicNode(
17165 Opcode: PPCISD::STORE_VEC_BE, dl, VTList: DAG.getVTList(VT: MVT::Other), Ops: StoreOps,
17166 MemVT: LSBase->getMemoryVT(), MMO: LSBase->getMemOperand());
17167 }
17168
17169 llvm_unreachable("Expected a load or store node here");
17170}
17171
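// Map a PowerPC store-conditional intrinsic (stbcx./sthcx./stwcx./stdcx.) to
// the width of the store in bytes; returns false for any other intrinsic.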
17172static bool isStoreConditional(SDValue Intrin, unsigned &StoreWidth) {
17173 unsigned IntrinsicID = Intrin.getConstantOperandVal(i: 1);
17174 if (IntrinsicID == Intrinsic::ppc_stdcx)
17175 StoreWidth = 8;
17176 else if (IntrinsicID == Intrinsic::ppc_stwcx)
17177 StoreWidth = 4;
17178 else if (IntrinsicID == Intrinsic::ppc_sthcx)
17179 StoreWidth = 2;
17180 else if (IntrinsicID == Intrinsic::ppc_stbcx)
17181 StoreWidth = 1;
17182 else
17183 return false;
17184 return true;
17185}
17186
17187static SDValue DAGCombineAddc(SDNode *N,
17188 llvm::PPCTargetLowering::DAGCombinerInfo &DCI) {
17189 if (N->getOpcode() == PPCISD::ADDC && N->hasAnyUseOfValue(Value: 1)) {
17190 // (ADDC (ADDE 0, 0, C), -1) -> C
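    // (ADDE 0, 0, C) materializes the incoming carry C as 0 or 1. Adding -1
    // to that value produces a carry out exactly when the value is 1, so the
    // carry result of this ADDC is just C.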
17191 SDValue LHS = N->getOperand(Num: 0);
17192 SDValue RHS = N->getOperand(Num: 1);
17193 if (LHS->getOpcode() == PPCISD::ADDE &&
17194 isNullConstant(V: LHS->getOperand(Num: 0)) &&
17195 isNullConstant(V: LHS->getOperand(Num: 1)) && isAllOnesConstant(V: RHS)) {
17196 return DCI.CombineTo(N, Res0: SDValue(N, 0), Res1: LHS->getOperand(Num: 2));
17197 }
17198 }
17199 return SDValue();
17200}
17201
17202SDValue PPCTargetLowering::PerformDAGCombine(SDNode *N,
17203 DAGCombinerInfo &DCI) const {
17204 SelectionDAG &DAG = DCI.DAG;
17205 SDLoc dl(N);
17206 switch (N->getOpcode()) {
17207 default: break;
17208 case ISD::ADD:
17209 return combineADD(N, DCI);
17210 case ISD::AND: {
17211 // We don't want (and (zext (shift...)), C) if C fits in the width of the
17212 // original input as that will prevent us from selecting optimal rotates.
17213 // This only matters if the input to the extend is i32 widened to i64.
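    // For illustration (hypothetical pattern): keeping
    //   (and (zext (rotl i32 %x, 5)), 255)
    // as an i64 AND would block matching the 32-bit rotate-and-mask as a
    // single rlwinm; narrowing the AND back to i32 keeps that option open.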
17214 SDValue Op1 = N->getOperand(Num: 0);
17215 SDValue Op2 = N->getOperand(Num: 1);
17216 if ((Op1.getOpcode() != ISD::ZERO_EXTEND &&
17217 Op1.getOpcode() != ISD::ANY_EXTEND) ||
17218 !isa<ConstantSDNode>(Val: Op2) || N->getValueType(ResNo: 0) != MVT::i64 ||
17219 Op1.getOperand(i: 0).getValueType() != MVT::i32)
17220 break;
17221 SDValue NarrowOp = Op1.getOperand(i: 0);
17222 if (NarrowOp.getOpcode() != ISD::SHL && NarrowOp.getOpcode() != ISD::SRL &&
17223 NarrowOp.getOpcode() != ISD::ROTL && NarrowOp.getOpcode() != ISD::ROTR)
17224 break;
17225
17226 uint64_t Imm = Op2->getAsZExtVal();
17227 // Make sure that the constant is narrow enough to fit in the narrow type.
17228 if (!isUInt<32>(x: Imm))
17229 break;
17230 SDValue ConstOp = DAG.getConstant(Val: Imm, DL: dl, VT: MVT::i32);
17231 SDValue NarrowAnd = DAG.getNode(Opcode: ISD::AND, DL: dl, VT: MVT::i32, N1: NarrowOp, N2: ConstOp);
17232 return DAG.getZExtOrTrunc(Op: NarrowAnd, DL: dl, VT: N->getValueType(ResNo: 0));
17233 }
17234 case ISD::SHL:
17235 return combineSHL(N, DCI);
17236 case ISD::SRA:
17237 return combineSRA(N, DCI);
17238 case ISD::SRL:
17239 return combineSRL(N, DCI);
17240 case ISD::MUL:
17241 return combineMUL(N, DCI);
17242 case ISD::FMA:
17243 case PPCISD::FNMSUB:
17244 return combineFMALike(N, DCI);
17245 case PPCISD::SHL:
17246 if (isNullConstant(V: N->getOperand(Num: 0))) // 0 << V -> 0.
17247 return N->getOperand(Num: 0);
17248 break;
17249 case PPCISD::SRL:
17250 if (isNullConstant(V: N->getOperand(Num: 0))) // 0 >>u V -> 0.
17251 return N->getOperand(Num: 0);
17252 break;
17253 case PPCISD::SRA:
17254 if (ConstantSDNode *C = dyn_cast<ConstantSDNode>(Val: N->getOperand(Num: 0))) {
17255 if (C->isZero() || // 0 >>s V -> 0.
17256 C->isAllOnes()) // -1 >>s V -> -1.
17257 return N->getOperand(Num: 0);
17258 }
17259 break;
17260 case ISD::SIGN_EXTEND:
17261 case ISD::ZERO_EXTEND:
17262 case ISD::ANY_EXTEND:
17263 return DAGCombineExtBoolTrunc(N, DCI);
17264 case ISD::TRUNCATE:
17265 return combineTRUNCATE(N, DCI);
17266 case ISD::SETCC:
17267 if (SDValue CSCC = combineSetCC(N, DCI))
17268 return CSCC;
17269 [[fallthrough]];
17270 case ISD::SELECT_CC:
17271 return DAGCombineTruncBoolExt(N, DCI);
17272 case ISD::SINT_TO_FP:
17273 case ISD::UINT_TO_FP:
17274 return combineFPToIntToFP(N, DCI);
17275 case ISD::VECTOR_SHUFFLE:
17276 if (ISD::isNormalLoad(N: N->getOperand(Num: 0).getNode())) {
17277 LSBaseSDNode* LSBase = cast<LSBaseSDNode>(Val: N->getOperand(Num: 0));
17278 return combineVReverseMemOP(SVN: cast<ShuffleVectorSDNode>(Val: N), LSBase, DCI);
17279 }
17280 return combineVectorShuffle(SVN: cast<ShuffleVectorSDNode>(Val: N), DAG&: DCI.DAG);
17281 case ISD::STORE: {
17282
17283 EVT Op1VT = N->getOperand(Num: 1).getValueType();
17284 unsigned Opcode = N->getOperand(Num: 1).getOpcode();
17285
17286 if (Opcode == ISD::FP_TO_SINT || Opcode == ISD::FP_TO_UINT ||
17287 Opcode == ISD::STRICT_FP_TO_SINT || Opcode == ISD::STRICT_FP_TO_UINT) {
17288 SDValue Val = combineStoreFPToInt(N, DCI);
17289 if (Val)
17290 return Val;
17291 }
17292
17293 if (Opcode == ISD::VECTOR_SHUFFLE && ISD::isNormalStore(N)) {
17294 ShuffleVectorSDNode *SVN = cast<ShuffleVectorSDNode>(Val: N->getOperand(Num: 1));
      SDValue Val = combineVReverseMemOP(SVN, LSBase: cast<LSBaseSDNode>(Val: N), DCI);
17296 if (Val)
17297 return Val;
17298 }
17299
17300 // Turn STORE (BSWAP) -> sthbrx/stwbrx.
17301 if (cast<StoreSDNode>(Val: N)->isUnindexed() && Opcode == ISD::BSWAP &&
17302 N->getOperand(Num: 1).getNode()->hasOneUse() &&
17303 (Op1VT == MVT::i32 || Op1VT == MVT::i16 ||
17304 (Subtarget.hasLDBRX() && Subtarget.isPPC64() && Op1VT == MVT::i64))) {
17305
      // STBRX can only handle simple types, and it makes no sense to store
      // fewer than two bytes in byte-reversed order.
17308 EVT mVT = cast<StoreSDNode>(Val: N)->getMemoryVT();
17309 if (mVT.isExtended() || mVT.getSizeInBits() < 16)
17310 break;
17311
17312 SDValue BSwapOp = N->getOperand(Num: 1).getOperand(i: 0);
17313 // Do an any-extend to 32-bits if this is a half-word input.
17314 if (BSwapOp.getValueType() == MVT::i16)
17315 BSwapOp = DAG.getNode(Opcode: ISD::ANY_EXTEND, DL: dl, VT: MVT::i32, Operand: BSwapOp);
17316
      // If the type of the BSWAP operand is wider than the stored memory
      // width, it needs to be shifted right before the STBRX.
17319 if (Op1VT.bitsGT(VT: mVT)) {
17320 int Shift = Op1VT.getSizeInBits() - mVT.getSizeInBits();
17321 BSwapOp = DAG.getNode(Opcode: ISD::SRL, DL: dl, VT: Op1VT, N1: BSwapOp,
17322 N2: DAG.getConstant(Val: Shift, DL: dl, VT: MVT::i32));
17323 // Need to truncate if this is a bswap of i64 stored as i32/i16.
17324 if (Op1VT == MVT::i64)
17325 BSwapOp = DAG.getNode(Opcode: ISD::TRUNCATE, DL: dl, VT: MVT::i32, Operand: BSwapOp);
17326 }
17327
17328 SDValue Ops[] = {
17329 N->getOperand(Num: 0), BSwapOp, N->getOperand(Num: 2), DAG.getValueType(mVT)
17330 };
17331 return
17332 DAG.getMemIntrinsicNode(Opcode: PPCISD::STBRX, dl, VTList: DAG.getVTList(VT: MVT::Other),
17333 Ops, MemVT: cast<StoreSDNode>(Val: N)->getMemoryVT(),
17334 MMO: cast<StoreSDNode>(Val: N)->getMemOperand());
17335 }
17336
    // STORE Constant:i32<0> -> STORE<trunc to i32> Constant:i64<0>
    // This increases the chance of CSEing the constant construction.
17339 if (Subtarget.isPPC64() && !DCI.isBeforeLegalize() &&
17340 isa<ConstantSDNode>(Val: N->getOperand(Num: 1)) && Op1VT == MVT::i32) {
      // Need to sign-extend to 64 bits to handle negative values.
17342 EVT MemVT = cast<StoreSDNode>(Val: N)->getMemoryVT();
17343 uint64_t Val64 = SignExtend64(X: N->getConstantOperandVal(Num: 1),
17344 B: MemVT.getSizeInBits());
17345 SDValue Const64 = DAG.getConstant(Val: Val64, DL: dl, VT: MVT::i64);
17346
17347 auto *ST = cast<StoreSDNode>(Val: N);
17348 SDValue NewST = DAG.getStore(Chain: ST->getChain(), dl, Val: Const64,
17349 Ptr: ST->getBasePtr(), Offset: ST->getOffset(), SVT: MemVT,
17350 MMO: ST->getMemOperand(), AM: ST->getAddressingMode(),
17351 /*IsTruncating=*/true);
17352 // Note we use CombineTo here to prevent DAGCombiner from visiting the
17353 // new store which will change the constant by removing non-demanded bits.
17354 return ST->isUnindexed()
17355 ? DCI.CombineTo(N, Res: NewST, /*AddTo=*/false)
17356 : DCI.CombineTo(N, Res0: NewST, Res1: NewST.getValue(R: 1), /*AddTo=*/false);
17357 }
17358
    // For little endian, VSX stores require generating xxswapd/stxvd2x.
17360 // Not needed on ISA 3.0 based CPUs since we have a non-permuting store.
17361 if (Op1VT.isSimple()) {
17362 MVT StoreVT = Op1VT.getSimpleVT();
17363 if (Subtarget.needsSwapsForVSXMemOps() &&
17364 (StoreVT == MVT::v2f64 || StoreVT == MVT::v2i64 ||
17365 StoreVT == MVT::v4f32 || StoreVT == MVT::v4i32))
17366 return expandVSXStoreForLE(N, DCI);
17367 }
17368 break;
17369 }
17370 case ISD::LOAD: {
17371 LoadSDNode *LD = cast<LoadSDNode>(Val: N);
17372 EVT VT = LD->getValueType(ResNo: 0);
17373
17374 // For little endian, VSX loads require generating lxvd2x/xxswapd.
17375 // Not needed on ISA 3.0 based CPUs since we have a non-permuting load.
17376 if (VT.isSimple()) {
17377 MVT LoadVT = VT.getSimpleVT();
17378 if (Subtarget.needsSwapsForVSXMemOps() &&
17379 (LoadVT == MVT::v2f64 || LoadVT == MVT::v2i64 ||
17380 LoadVT == MVT::v4f32 || LoadVT == MVT::v4i32))
17381 return expandVSXLoadForLE(N, DCI);
17382 }
17383
17384 // We sometimes end up with a 64-bit integer load, from which we extract
17385 // two single-precision floating-point numbers. This happens with
17386 // std::complex<float>, and other similar structures, because of the way we
17387 // canonicalize structure copies. However, if we lack direct moves,
17388 // then the final bitcasts from the extracted integer values to the
17389 // floating-point numbers turn into store/load pairs. Even with direct moves,
17390 // just loading the two floating-point numbers is likely better.
17391 auto ReplaceTwoFloatLoad = [&]() {
17392 if (VT != MVT::i64)
17393 return false;
17394
17395 if (LD->getExtensionType() != ISD::NON_EXTLOAD ||
17396 LD->isVolatile())
17397 return false;
17398
17399 // We're looking for a sequence like this:
17400 // t13: i64,ch = load<LD8[%ref.tmp]> t0, t6, undef:i64
17401 // t16: i64 = srl t13, Constant:i32<32>
17402 // t17: i32 = truncate t16
17403 // t18: f32 = bitcast t17
17404 // t19: i32 = truncate t13
17405 // t20: f32 = bitcast t19
17406
17407 if (!LD->hasNUsesOfValue(NUses: 2, Value: 0))
17408 return false;
17409
17410 auto UI = LD->user_begin();
17411 while (UI.getUse().getResNo() != 0) ++UI;
17412 SDNode *Trunc = *UI++;
17413 while (UI.getUse().getResNo() != 0) ++UI;
17414 SDNode *RightShift = *UI;
17415 if (Trunc->getOpcode() != ISD::TRUNCATE)
17416 std::swap(a&: Trunc, b&: RightShift);
17417
17418 if (Trunc->getOpcode() != ISD::TRUNCATE ||
17419 Trunc->getValueType(ResNo: 0) != MVT::i32 ||
17420 !Trunc->hasOneUse())
17421 return false;
17422 if (RightShift->getOpcode() != ISD::SRL ||
17423 !isa<ConstantSDNode>(Val: RightShift->getOperand(Num: 1)) ||
17424 RightShift->getConstantOperandVal(Num: 1) != 32 ||
17425 !RightShift->hasOneUse())
17426 return false;
17427
17428 SDNode *Trunc2 = *RightShift->user_begin();
17429 if (Trunc2->getOpcode() != ISD::TRUNCATE ||
17430 Trunc2->getValueType(ResNo: 0) != MVT::i32 ||
17431 !Trunc2->hasOneUse())
17432 return false;
17433
17434 SDNode *Bitcast = *Trunc->user_begin();
17435 SDNode *Bitcast2 = *Trunc2->user_begin();
17436
17437 if (Bitcast->getOpcode() != ISD::BITCAST ||
17438 Bitcast->getValueType(ResNo: 0) != MVT::f32)
17439 return false;
17440 if (Bitcast2->getOpcode() != ISD::BITCAST ||
17441 Bitcast2->getValueType(ResNo: 0) != MVT::f32)
17442 return false;
17443
17444 if (Subtarget.isLittleEndian())
17445 std::swap(a&: Bitcast, b&: Bitcast2);
17446
17447 // Bitcast has the second float (in memory-layout order) and Bitcast2
17448 // has the first one.
17449
17450 SDValue BasePtr = LD->getBasePtr();
17451 if (LD->isIndexed()) {
17452 assert(LD->getAddressingMode() == ISD::PRE_INC &&
17453 "Non-pre-inc AM on PPC?");
17454 BasePtr =
17455 DAG.getNode(Opcode: ISD::ADD, DL: dl, VT: BasePtr.getValueType(), N1: BasePtr,
17456 N2: LD->getOffset());
17457 }
17458
17459 auto MMOFlags =
17460 LD->getMemOperand()->getFlags() & ~MachineMemOperand::MOVolatile;
17461 SDValue FloatLoad = DAG.getLoad(VT: MVT::f32, dl, Chain: LD->getChain(), Ptr: BasePtr,
17462 PtrInfo: LD->getPointerInfo(), Alignment: LD->getAlign(),
17463 MMOFlags, AAInfo: LD->getAAInfo());
17464 SDValue AddPtr =
17465 DAG.getNode(Opcode: ISD::ADD, DL: dl, VT: BasePtr.getValueType(),
17466 N1: BasePtr, N2: DAG.getIntPtrConstant(Val: 4, DL: dl));
17467 SDValue FloatLoad2 = DAG.getLoad(
17468 VT: MVT::f32, dl, Chain: SDValue(FloatLoad.getNode(), 1), Ptr: AddPtr,
17469 PtrInfo: LD->getPointerInfo().getWithOffset(O: 4),
17470 Alignment: commonAlignment(A: LD->getAlign(), Offset: 4), MMOFlags, AAInfo: LD->getAAInfo());
17471
17472 if (LD->isIndexed()) {
17473 // Note that DAGCombine should re-form any pre-increment load(s) from
17474 // what is produced here if that makes sense.
17475 DAG.ReplaceAllUsesOfValueWith(From: SDValue(LD, 1), To: BasePtr);
17476 }
17477
17478 DCI.CombineTo(N: Bitcast2, Res: FloatLoad);
17479 DCI.CombineTo(N: Bitcast, Res: FloatLoad2);
17480
17481 DAG.ReplaceAllUsesOfValueWith(From: SDValue(LD, LD->isIndexed() ? 2 : 1),
17482 To: SDValue(FloatLoad2.getNode(), 1));
17483 return true;
17484 };
17485
17486 if (ReplaceTwoFloatLoad())
17487 return SDValue(N, 0);
17488
17489 EVT MemVT = LD->getMemoryVT();
17490 Type *Ty = MemVT.getTypeForEVT(Context&: *DAG.getContext());
17491 Align ABIAlignment = DAG.getDataLayout().getABITypeAlign(Ty);
17492 if (LD->isUnindexed() && VT.isVector() &&
17493 ((Subtarget.hasAltivec() && ISD::isNON_EXTLoad(N) &&
17494 // P8 and later hardware should just use LOAD.
17495 !Subtarget.hasP8Vector() &&
17496 (VT == MVT::v16i8 || VT == MVT::v8i16 || VT == MVT::v4i32 ||
17497 VT == MVT::v4f32))) &&
17498 LD->getAlign() < ABIAlignment) {
17499 // This is a type-legal unaligned Altivec load.
17500 SDValue Chain = LD->getChain();
17501 SDValue Ptr = LD->getBasePtr();
17502 bool isLittleEndian = Subtarget.isLittleEndian();
17503
17504 // This implements the loading of unaligned vectors as described in
17505 // the venerable Apple Velocity Engine overview. Specifically:
17506 // https://developer.apple.com/hardwaredrivers/ve/alignment.html
17507 // https://developer.apple.com/hardwaredrivers/ve/code_optimization.html
17508 //
17509 // The general idea is to expand a sequence of one or more unaligned
17510 // loads into an alignment-based permutation-control instruction (lvsl
17511 // or lvsr), a series of regular vector loads (which always truncate
17512 // their input address to an aligned address), and a series of
17513 // permutations. The results of these permutations are the requested
17514 // loaded values. The trick is that the last "extra" load is not taken
17515 // from the address you might suspect (sizeof(vector) bytes after the
17516 // last requested load), but rather sizeof(vector) - 1 bytes after the
17517 // last requested vector. The point of this is to avoid a page fault if
17518 // the base address happened to be aligned. This works because if the
17519 // base address is aligned, then adding less than a full vector length
17520 // will cause the last vector in the sequence to be (re)loaded.
17521 // Otherwise, the next vector will be fetched as you might suspect was
17522 // necessary.
17523
17524 // We might be able to reuse the permutation generation from
17525 // a different base address offset from this one by an aligned amount.
17526 // The INTRINSIC_WO_CHAIN DAG combine will attempt to perform this
17527 // optimization later.
17528 Intrinsic::ID Intr, IntrLD, IntrPerm;
17529 MVT PermCntlTy, PermTy, LDTy;
17530 Intr = isLittleEndian ? Intrinsic::ppc_altivec_lvsr
17531 : Intrinsic::ppc_altivec_lvsl;
17532 IntrLD = Intrinsic::ppc_altivec_lvx;
17533 IntrPerm = Intrinsic::ppc_altivec_vperm;
17534 PermCntlTy = MVT::v16i8;
17535 PermTy = MVT::v4i32;
17536 LDTy = MVT::v4i32;
17537
17538 SDValue PermCntl = BuildIntrinsicOp(IID: Intr, Op: Ptr, DAG, dl, DestVT: PermCntlTy);
17539
17540 // Create the new MMO for the new base load. It is like the original MMO,
17541 // but represents an area in memory almost twice the vector size centered
17542 // on the original address. If the address is unaligned, we might start
17543 // reading up to (sizeof(vector)-1) bytes below the address of the
17544 // original unaligned load.
17545 MachineFunction &MF = DAG.getMachineFunction();
17546 MachineMemOperand *BaseMMO =
17547 MF.getMachineMemOperand(MMO: LD->getMemOperand(),
17548 Offset: -(int64_t)MemVT.getStoreSize()+1,
17549 Size: 2*MemVT.getStoreSize()-1);
17550
17551 // Create the new base load.
17552 SDValue LDXIntID =
17553 DAG.getTargetConstant(Val: IntrLD, DL: dl, VT: getPointerTy(DL: MF.getDataLayout()));
17554 SDValue BaseLoadOps[] = { Chain, LDXIntID, Ptr };
17555 SDValue BaseLoad =
17556 DAG.getMemIntrinsicNode(Opcode: ISD::INTRINSIC_W_CHAIN, dl,
17557 VTList: DAG.getVTList(VT1: PermTy, VT2: MVT::Other),
17558 Ops: BaseLoadOps, MemVT: LDTy, MMO: BaseMMO);
17559
17560 // Note that the value of IncOffset (which is provided to the next
17561 // load's pointer info offset value, and thus used to calculate the
17562 // alignment), and the value of IncValue (which is actually used to
17563 // increment the pointer value) are different! This is because we
17564 // require the next load to appear to be aligned, even though it
17565 // is actually offset from the base pointer by a lesser amount.
17566 int IncOffset = VT.getSizeInBits() / 8;
17567 int IncValue = IncOffset;
17568
17569 // Walk (both up and down) the chain looking for another load at the real
17570 // (aligned) offset (the alignment of the other load does not matter in
17571 // this case). If found, then do not use the offset reduction trick, as
17572 // that will prevent the loads from being later combined (as they would
17573 // otherwise be duplicates).
17574 if (!findConsecutiveLoad(LD, DAG))
17575 --IncValue;
17576
17577 SDValue Increment =
17578 DAG.getConstant(Val: IncValue, DL: dl, VT: getPointerTy(DL: MF.getDataLayout()));
17579 Ptr = DAG.getNode(Opcode: ISD::ADD, DL: dl, VT: Ptr.getValueType(), N1: Ptr, N2: Increment);
17580
17581 MachineMemOperand *ExtraMMO =
17582 MF.getMachineMemOperand(MMO: LD->getMemOperand(),
17583 Offset: 1, Size: 2*MemVT.getStoreSize()-1);
17584 SDValue ExtraLoadOps[] = { Chain, LDXIntID, Ptr };
17585 SDValue ExtraLoad =
17586 DAG.getMemIntrinsicNode(Opcode: ISD::INTRINSIC_W_CHAIN, dl,
17587 VTList: DAG.getVTList(VT1: PermTy, VT2: MVT::Other),
17588 Ops: ExtraLoadOps, MemVT: LDTy, MMO: ExtraMMO);
17589
17590 SDValue TF = DAG.getNode(Opcode: ISD::TokenFactor, DL: dl, VT: MVT::Other,
17591 N1: BaseLoad.getValue(R: 1), N2: ExtraLoad.getValue(R: 1));
17592
17593 // Because vperm has a big-endian bias, we must reverse the order
17594 // of the input vectors and complement the permute control vector
17595 // when generating little endian code. We have already handled the
17596 // latter by using lvsr instead of lvsl, so just reverse BaseLoad
17597 // and ExtraLoad here.
17598 SDValue Perm;
17599 if (isLittleEndian)
17600 Perm = BuildIntrinsicOp(IID: IntrPerm,
17601 Op0: ExtraLoad, Op1: BaseLoad, Op2: PermCntl, DAG, dl);
17602 else
17603 Perm = BuildIntrinsicOp(IID: IntrPerm,
17604 Op0: BaseLoad, Op1: ExtraLoad, Op2: PermCntl, DAG, dl);
17605
17606 if (VT != PermTy)
17607 Perm = Subtarget.hasAltivec()
17608 ? DAG.getNode(Opcode: ISD::BITCAST, DL: dl, VT, Operand: Perm)
17609 : DAG.getNode(Opcode: ISD::FP_ROUND, DL: dl, VT, N1: Perm,
17610 N2: DAG.getTargetConstant(Val: 1, DL: dl, VT: MVT::i64));
17611 // second argument is 1 because this rounding
17612 // is always exact.
17613
17614 // The output of the permutation is our loaded result, the TokenFactor is
17615 // our new chain.
17616 DCI.CombineTo(N, Res0: Perm, Res1: TF);
17617 return SDValue(N, 0);
17618 }
17619 }
17620 break;
17621 case ISD::INTRINSIC_WO_CHAIN: {
17622 bool isLittleEndian = Subtarget.isLittleEndian();
17623 unsigned IID = N->getConstantOperandVal(Num: 0);
17624 Intrinsic::ID Intr = (isLittleEndian ? Intrinsic::ppc_altivec_lvsr
17625 : Intrinsic::ppc_altivec_lvsl);
17626 if (IID == Intr && N->getOperand(Num: 1)->getOpcode() == ISD::ADD) {
17627 SDValue Add = N->getOperand(Num: 1);
17628
17629 int Bits = 4 /* 16 byte alignment */;
17630
17631 if (DAG.MaskedValueIsZero(Op: Add->getOperand(Num: 1),
17632 Mask: APInt::getAllOnes(numBits: Bits /* alignment */)
17633 .zext(width: Add.getScalarValueSizeInBits()))) {
17634 SDNode *BasePtr = Add->getOperand(Num: 0).getNode();
17635 for (SDNode *U : BasePtr->users()) {
17636 if (U->getOpcode() == ISD::INTRINSIC_WO_CHAIN &&
17637 U->getConstantOperandVal(Num: 0) == IID) {
17638 // We've found another LVSL/LVSR, and this address is an aligned
17639 // multiple of that one. The results will be the same, so use the
17640 // one we've just found instead.
17641
17642 return SDValue(U, 0);
17643 }
17644 }
17645 }
17646
17647 if (isa<ConstantSDNode>(Val: Add->getOperand(Num: 1))) {
17648 SDNode *BasePtr = Add->getOperand(Num: 0).getNode();
17649 for (SDNode *U : BasePtr->users()) {
17650 if (U->getOpcode() == ISD::ADD &&
17651 isa<ConstantSDNode>(Val: U->getOperand(Num: 1)) &&
17652 (Add->getConstantOperandVal(Num: 1) - U->getConstantOperandVal(Num: 1)) %
17653 (1ULL << Bits) ==
17654 0) {
17655 SDNode *OtherAdd = U;
17656 for (SDNode *V : OtherAdd->users()) {
17657 if (V->getOpcode() == ISD::INTRINSIC_WO_CHAIN &&
17658 V->getConstantOperandVal(Num: 0) == IID) {
17659 return SDValue(V, 0);
17660 }
17661 }
17662 }
17663 }
17664 }
17665 }
17666
    // Combine vmaxsw/h/b(a, negation of a) to abs(a), exposing the
    // vabsduw/h/b opportunity to downstream combines.
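    // max(a, 0-a) is |a| element-wise, and max(x-y, y-x) is |x-y|, which
    // later combines can turn into vabsduw/h/b.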
17669 if (!DCI.isAfterLegalizeDAG() && Subtarget.hasP9Altivec() &&
17670 (IID == Intrinsic::ppc_altivec_vmaxsw ||
17671 IID == Intrinsic::ppc_altivec_vmaxsh ||
17672 IID == Intrinsic::ppc_altivec_vmaxsb)) {
17673 SDValue V1 = N->getOperand(Num: 1);
17674 SDValue V2 = N->getOperand(Num: 2);
17675 if ((V1.getSimpleValueType() == MVT::v4i32 ||
17676 V1.getSimpleValueType() == MVT::v8i16 ||
17677 V1.getSimpleValueType() == MVT::v16i8) &&
17678 V1.getSimpleValueType() == V2.getSimpleValueType()) {
17679 // (0-a, a)
17680 if (V1.getOpcode() == ISD::SUB &&
17681 ISD::isBuildVectorAllZeros(N: V1.getOperand(i: 0).getNode()) &&
17682 V1.getOperand(i: 1) == V2) {
17683 return DAG.getNode(Opcode: ISD::ABS, DL: dl, VT: V2.getValueType(), Operand: V2);
17684 }
17685 // (a, 0-a)
17686 if (V2.getOpcode() == ISD::SUB &&
17687 ISD::isBuildVectorAllZeros(N: V2.getOperand(i: 0).getNode()) &&
17688 V2.getOperand(i: 1) == V1) {
17689 return DAG.getNode(Opcode: ISD::ABS, DL: dl, VT: V1.getValueType(), Operand: V1);
17690 }
17691 // (x-y, y-x)
17692 if (V1.getOpcode() == ISD::SUB && V2.getOpcode() == ISD::SUB &&
17693 V1.getOperand(i: 0) == V2.getOperand(i: 1) &&
17694 V1.getOperand(i: 1) == V2.getOperand(i: 0)) {
17695 return DAG.getNode(Opcode: ISD::ABS, DL: dl, VT: V1.getValueType(), Operand: V1);
17696 }
17697 }
17698 }
17699 }
17700
17701 break;
17702 case ISD::INTRINSIC_W_CHAIN:
17703 switch (N->getConstantOperandVal(Num: 1)) {
17704 default:
17705 break;
17706 case Intrinsic::ppc_altivec_vsum4sbs:
17707 case Intrinsic::ppc_altivec_vsum4shs:
17708 case Intrinsic::ppc_altivec_vsum4ubs: {
17709 // These sum-across intrinsics only have a chain due to the side effect
17710 // that they may set the SAT bit. If we know the SAT bit will not be set
17711 // for some inputs, we can replace any uses of their chain with the
17712 // input chain.
17713 if (BuildVectorSDNode *BVN =
17714 dyn_cast<BuildVectorSDNode>(Val: N->getOperand(Num: 3))) {
17715 APInt APSplatBits, APSplatUndef;
17716 unsigned SplatBitSize;
17717 bool HasAnyUndefs;
17718 bool BVNIsConstantSplat = BVN->isConstantSplat(
17719 SplatValue&: APSplatBits, SplatUndef&: APSplatUndef, SplatBitSize, HasAnyUndefs, MinSplatBits: 0,
17720 isBigEndian: !Subtarget.isLittleEndian());
17721 // If the constant splat vector is 0, the SAT bit will not be set.
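        // With a zero accumulator, the partial sums of the narrow elements
        // cannot exceed the 32-bit range, so no saturation can occur.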
17722 if (BVNIsConstantSplat && APSplatBits == 0)
17723 DAG.ReplaceAllUsesOfValueWith(From: SDValue(N, 1), To: N->getOperand(Num: 0));
17724 }
17725 return SDValue();
17726 }
17727 case Intrinsic::ppc_vsx_lxvw4x:
17728 case Intrinsic::ppc_vsx_lxvd2x:
17729 // For little endian, VSX loads require generating lxvd2x/xxswapd.
17730 // Not needed on ISA 3.0 based CPUs since we have a non-permuting load.
17731 if (Subtarget.needsSwapsForVSXMemOps())
17732 return expandVSXLoadForLE(N, DCI);
17733 break;
17734 }
17735 break;
17736 case ISD::INTRINSIC_VOID:
17737 // For little endian, VSX stores require generating xxswapd/stxvd2x.
17738 // Not needed on ISA 3.0 based CPUs since we have a non-permuting store.
17739 if (Subtarget.needsSwapsForVSXMemOps()) {
17740 switch (N->getConstantOperandVal(Num: 1)) {
17741 default:
17742 break;
17743 case Intrinsic::ppc_vsx_stxvw4x:
17744 case Intrinsic::ppc_vsx_stxvd2x:
17745 return expandVSXStoreForLE(N, DCI);
17746 }
17747 }
17748 break;
17749 case ISD::BSWAP: {
17750 // Turn BSWAP (LOAD) -> lhbrx/lwbrx.
17751 // For subtargets without LDBRX, we can still do better than the default
17752 // expansion even for 64-bit BSWAP (LOAD).
17753 bool Is64BitBswapOn64BitTgt =
17754 Subtarget.isPPC64() && N->getValueType(ResNo: 0) == MVT::i64;
17755 bool IsSingleUseNormalLd = ISD::isNormalLoad(N: N->getOperand(Num: 0).getNode()) &&
17756 N->getOperand(Num: 0).hasOneUse();
17757 if (IsSingleUseNormalLd &&
17758 (N->getValueType(ResNo: 0) == MVT::i32 || N->getValueType(ResNo: 0) == MVT::i16 ||
17759 (Subtarget.hasLDBRX() && Is64BitBswapOn64BitTgt))) {
17760 SDValue Load = N->getOperand(Num: 0);
17761 LoadSDNode *LD = cast<LoadSDNode>(Val&: Load);
17762 // Create the byte-swapping load.
17763 SDValue Ops[] = {
17764 LD->getChain(), // Chain
17765 LD->getBasePtr(), // Ptr
17766 DAG.getValueType(N->getValueType(ResNo: 0)) // VT
17767 };
17768 SDValue BSLoad =
17769 DAG.getMemIntrinsicNode(Opcode: PPCISD::LBRX, dl,
17770 VTList: DAG.getVTList(VT1: N->getValueType(ResNo: 0) == MVT::i64 ?
17771 MVT::i64 : MVT::i32, VT2: MVT::Other),
17772 Ops, MemVT: LD->getMemoryVT(), MMO: LD->getMemOperand());
17773
17774 // If this is an i16 load, insert the truncate.
17775 SDValue ResVal = BSLoad;
17776 if (N->getValueType(ResNo: 0) == MVT::i16)
17777 ResVal = DAG.getNode(Opcode: ISD::TRUNCATE, DL: dl, VT: MVT::i16, Operand: BSLoad);
17778
17779 // First, combine the bswap away. This makes the value produced by the
17780 // load dead.
17781 DCI.CombineTo(N, Res: ResVal);
17782
      // Next, combine the load away; we give it a bogus result value but a
      // real chain result. The result value is dead because the bswap is dead.
17785 DCI.CombineTo(N: Load.getNode(), Res0: ResVal, Res1: BSLoad.getValue(R: 1));
17786
17787 // Return N so it doesn't get rechecked!
17788 return SDValue(N, 0);
17789 }
17790 // Convert this to two 32-bit bswap loads and a BUILD_PAIR. Do this only
17791 // before legalization so that the BUILD_PAIR is handled correctly.
17792 if (!DCI.isBeforeLegalize() || !Is64BitBswapOn64BitTgt ||
17793 !IsSingleUseNormalLd)
17794 return SDValue();
17795 LoadSDNode *LD = cast<LoadSDNode>(Val: N->getOperand(Num: 0));
17796
17797 // Can't split volatile or atomic loads.
17798 if (!LD->isSimple())
17799 return SDValue();
17800 SDValue BasePtr = LD->getBasePtr();
17801 SDValue Lo = DAG.getLoad(VT: MVT::i32, dl, Chain: LD->getChain(), Ptr: BasePtr,
17802 PtrInfo: LD->getPointerInfo(), Alignment: LD->getAlign());
17803 Lo = DAG.getNode(Opcode: ISD::BSWAP, DL: dl, VT: MVT::i32, Operand: Lo);
17804 BasePtr = DAG.getNode(Opcode: ISD::ADD, DL: dl, VT: BasePtr.getValueType(), N1: BasePtr,
17805 N2: DAG.getIntPtrConstant(Val: 4, DL: dl));
17806 MachineMemOperand *NewMMO = DAG.getMachineFunction().getMachineMemOperand(
17807 MMO: LD->getMemOperand(), Offset: 4, Size: 4);
17808 SDValue Hi = DAG.getLoad(VT: MVT::i32, dl, Chain: LD->getChain(), Ptr: BasePtr, MMO: NewMMO);
17809 Hi = DAG.getNode(Opcode: ISD::BSWAP, DL: dl, VT: MVT::i32, Operand: Hi);
17810 SDValue Res;
17811 if (Subtarget.isLittleEndian())
17812 Res = DAG.getNode(Opcode: ISD::BUILD_PAIR, DL: dl, VT: MVT::i64, N1: Hi, N2: Lo);
17813 else
17814 Res = DAG.getNode(Opcode: ISD::BUILD_PAIR, DL: dl, VT: MVT::i64, N1: Lo, N2: Hi);
17815 SDValue TF =
17816 DAG.getNode(Opcode: ISD::TokenFactor, DL: dl, VT: MVT::Other,
17817 N1: Hi.getOperand(i: 0).getValue(R: 1), N2: Lo.getOperand(i: 0).getValue(R: 1));
17818 DAG.ReplaceAllUsesOfValueWith(From: SDValue(LD, 1), To: TF);
17819 return Res;
17820 }
17821 case PPCISD::VCMP:
17822 // If a VCMP_rec node already exists with exactly the same operands as this
17823 // node, use its result instead of this node (VCMP_rec computes both a CR6
17824 // and a normal output).
17825 //
17826 if (!N->getOperand(Num: 0).hasOneUse() &&
17827 !N->getOperand(Num: 1).hasOneUse() &&
17828 !N->getOperand(Num: 2).hasOneUse()) {
17829
17830 // Scan all of the users of the LHS, looking for VCMP_rec's that match.
17831 SDNode *VCMPrecNode = nullptr;
17832
17833 SDNode *LHSN = N->getOperand(Num: 0).getNode();
17834 for (SDNode *User : LHSN->users())
17835 if (User->getOpcode() == PPCISD::VCMP_rec &&
17836 User->getOperand(Num: 1) == N->getOperand(Num: 1) &&
17837 User->getOperand(Num: 2) == N->getOperand(Num: 2) &&
17838 User->getOperand(Num: 0) == N->getOperand(Num: 0)) {
17839 VCMPrecNode = User;
17840 break;
17841 }
17842
      // If there is no VCMP_rec node, or if its flag result has no uses,
      // don't transform this.
17845 if (!VCMPrecNode || VCMPrecNode->hasNUsesOfValue(NUses: 0, Value: 1))
17846 break;
17847
17848 // Look at the (necessarily single) use of the flag value. If it has a
17849 // chain, this transformation is more complex. Note that multiple things
17850 // could use the value result, which we should ignore.
17851 SDNode *FlagUser = nullptr;
17852 for (SDNode::use_iterator UI = VCMPrecNode->use_begin();
17853 FlagUser == nullptr; ++UI) {
17854 assert(UI != VCMPrecNode->use_end() && "Didn't find user!");
17855 SDNode *User = UI->getUser();
17856 for (unsigned i = 0, e = User->getNumOperands(); i != e; ++i) {
17857 if (User->getOperand(Num: i) == SDValue(VCMPrecNode, 1)) {
17858 FlagUser = User;
17859 break;
17860 }
17861 }
17862 }
17863
17864 // If the user is a MFOCRF instruction, we know this is safe.
17865 // Otherwise we give up for right now.
17866 if (FlagUser->getOpcode() == PPCISD::MFOCRF)
17867 return SDValue(VCMPrecNode, 0);
17868 }
17869 break;
17870 case ISD::BR_CC: {
17871 // If this is a branch on an altivec predicate comparison, lower this so
17872 // that we don't have to do a MFOCRF: instead, branch directly on CR6. This
17873 // lowering is done pre-legalize, because the legalizer lowers the predicate
17874 // compare down to code that is difficult to reassemble.
17875 // This code also handles branches that depend on the result of a store
17876 // conditional.
17877 ISD::CondCode CC = cast<CondCodeSDNode>(Val: N->getOperand(Num: 1))->get();
17878 SDValue LHS = N->getOperand(Num: 2), RHS = N->getOperand(Num: 3);
17879
17880 int CompareOpc;
17881 bool isDot;
17882
17883 if (!isa<ConstantSDNode>(Val: RHS) || (CC != ISD::SETEQ && CC != ISD::SETNE))
17884 break;
17885
17886 // Since we are doing this pre-legalize, the RHS can be a constant of
17887 // arbitrary bitwidth which may cause issues when trying to get the value
17888 // from the underlying APInt.
17889 auto RHSAPInt = RHS->getAsAPIntVal();
17890 if (!RHSAPInt.isIntN(N: 64))
17891 break;
17892
17893 unsigned Val = RHSAPInt.getZExtValue();
17894 auto isImpossibleCompare = [&]() {
17895 // If this is a comparison against something other than 0/1, then we know
17896 // that the condition is never/always true.
17897 if (Val != 0 && Val != 1) {
17898 if (CC == ISD::SETEQ) // Cond never true, remove branch.
17899 return N->getOperand(Num: 0);
17900 // Always !=, turn it into an unconditional branch.
17901 return DAG.getNode(Opcode: ISD::BR, DL: dl, VT: MVT::Other,
17902 N1: N->getOperand(Num: 0), N2: N->getOperand(Num: 4));
17903 }
17904 return SDValue();
17905 };
17906 // Combine branches fed by store conditional instructions (st[bhwd]cx).
17907 unsigned StoreWidth = 0;
17908 if (LHS.getOpcode() == ISD::INTRINSIC_W_CHAIN &&
17909 isStoreConditional(Intrin: LHS, StoreWidth)) {
17910 if (SDValue Impossible = isImpossibleCompare())
17911 return Impossible;
17912 PPC::Predicate CompOpc;
17913 // eq 0 => ne
17914 // ne 0 => eq
17915 // eq 1 => eq
17916 // ne 1 => ne
17917 if (Val == 0)
17918 CompOpc = CC == ISD::SETEQ ? PPC::PRED_NE : PPC::PRED_EQ;
17919 else
17920 CompOpc = CC == ISD::SETEQ ? PPC::PRED_EQ : PPC::PRED_NE;
17921
17922 SDValue Ops[] = {LHS.getOperand(i: 0), LHS.getOperand(i: 2), LHS.getOperand(i: 3),
17923 DAG.getConstant(Val: StoreWidth, DL: dl, VT: MVT::i32)};
17924 auto *MemNode = cast<MemSDNode>(Val&: LHS);
17925 SDValue ConstSt = DAG.getMemIntrinsicNode(
17926 Opcode: PPCISD::STORE_COND, dl,
17927 VTList: DAG.getVTList(VT1: MVT::i32, VT2: MVT::Other, VT3: MVT::Glue), Ops,
17928 MemVT: MemNode->getMemoryVT(), MMO: MemNode->getMemOperand());
17929
17930 SDValue InChain;
17931 // Unchain the branch from the original store conditional.
17932 if (N->getOperand(Num: 0) == LHS.getValue(R: 1))
17933 InChain = LHS.getOperand(i: 0);
17934 else if (N->getOperand(Num: 0).getOpcode() == ISD::TokenFactor) {
17935 SmallVector<SDValue, 4> InChains;
17936 SDValue InTF = N->getOperand(Num: 0);
17937 for (int i = 0, e = InTF.getNumOperands(); i < e; i++)
17938 if (InTF.getOperand(i) != LHS.getValue(R: 1))
17939 InChains.push_back(Elt: InTF.getOperand(i));
17940 InChain = DAG.getNode(Opcode: ISD::TokenFactor, DL: dl, VT: MVT::Other, Ops: InChains);
17941 }
17942
17943 return DAG.getNode(Opcode: PPCISD::COND_BRANCH, DL: dl, VT: MVT::Other, N1: InChain,
17944 N2: DAG.getConstant(Val: CompOpc, DL: dl, VT: MVT::i32),
17945 N3: DAG.getRegister(Reg: PPC::CR0, VT: MVT::i32), N4: N->getOperand(Num: 4),
17946 N5: ConstSt.getValue(R: 2));
17947 }
17948
17949 if (LHS.getOpcode() == ISD::INTRINSIC_WO_CHAIN &&
17950 getVectorCompareInfo(Intrin: LHS, CompareOpc, isDot, Subtarget)) {
17951 assert(isDot && "Can't compare against a vector result!");
17952
17953 if (SDValue Impossible = isImpossibleCompare())
17954 return Impossible;
17955
17956 bool BranchOnWhenPredTrue = (CC == ISD::SETEQ) ^ (Val == 0);
17957 // Create the PPCISD altivec 'dot' comparison node.
17958 SDValue Ops[] = {
17959 LHS.getOperand(i: 2), // LHS of compare
17960 LHS.getOperand(i: 3), // RHS of compare
17961 DAG.getConstant(Val: CompareOpc, DL: dl, VT: MVT::i32)
17962 };
17963 EVT VTs[] = { LHS.getOperand(i: 2).getValueType(), MVT::Glue };
17964 SDValue CompNode = DAG.getNode(Opcode: PPCISD::VCMP_rec, DL: dl, ResultTys: VTs, Ops);
17965
17966 // Unpack the result based on how the target uses it.
17967 PPC::Predicate CompOpc;
17968 switch (LHS.getConstantOperandVal(i: 1)) {
17969 default: // Can't happen, don't crash on invalid number though.
17970 case 0: // Branch on the value of the EQ bit of CR6.
17971 CompOpc = BranchOnWhenPredTrue ? PPC::PRED_EQ : PPC::PRED_NE;
17972 break;
17973 case 1: // Branch on the inverted value of the EQ bit of CR6.
17974 CompOpc = BranchOnWhenPredTrue ? PPC::PRED_NE : PPC::PRED_EQ;
17975 break;
17976 case 2: // Branch on the value of the LT bit of CR6.
17977 CompOpc = BranchOnWhenPredTrue ? PPC::PRED_LT : PPC::PRED_GE;
17978 break;
17979 case 3: // Branch on the inverted value of the LT bit of CR6.
17980 CompOpc = BranchOnWhenPredTrue ? PPC::PRED_GE : PPC::PRED_LT;
17981 break;
17982 }
17983
17984 return DAG.getNode(Opcode: PPCISD::COND_BRANCH, DL: dl, VT: MVT::Other, N1: N->getOperand(Num: 0),
17985 N2: DAG.getConstant(Val: CompOpc, DL: dl, VT: MVT::i32),
17986 N3: DAG.getRegister(Reg: PPC::CR6, VT: MVT::i32),
17987 N4: N->getOperand(Num: 4), N5: CompNode.getValue(R: 1));
17988 }
17989 break;
17990 }
17991 case ISD::BUILD_VECTOR:
17992 return DAGCombineBuildVector(N, DCI);
17993 case PPCISD::ADDC:
17994 return DAGCombineAddc(N, DCI);
17995 }
17996
17997 return SDValue();
17998}
17999
18000SDValue
18001PPCTargetLowering::BuildSDIVPow2(SDNode *N, const APInt &Divisor,
18002 SelectionDAG &DAG,
18003 SmallVectorImpl<SDNode *> &Created) const {
18004 // fold (sdiv X, pow2)
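  // For illustration (hypothetical input): sdiv i32 %x, 8 becomes an
  // arithmetic shift right by 3 combined with addze (the PPCISD::SRA_ADDZE
  // node), which adds the carry left by the shift to round toward zero; a
  // divisor of -8 additionally negates the result via the ISD::SUB below.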
18005 EVT VT = N->getValueType(ResNo: 0);
18006 if (VT == MVT::i64 && !Subtarget.isPPC64())
18007 return SDValue();
18008 if ((VT != MVT::i32 && VT != MVT::i64) ||
18009 !(Divisor.isPowerOf2() || Divisor.isNegatedPowerOf2()))
18010 return SDValue();
18011
18012 SDLoc DL(N);
18013 SDValue N0 = N->getOperand(Num: 0);
18014
18015 bool IsNegPow2 = Divisor.isNegatedPowerOf2();
18016 unsigned Lg2 = (IsNegPow2 ? -Divisor : Divisor).countr_zero();
18017 SDValue ShiftAmt = DAG.getConstant(Val: Lg2, DL, VT);
18018
18019 SDValue Op = DAG.getNode(Opcode: PPCISD::SRA_ADDZE, DL, VT, N1: N0, N2: ShiftAmt);
18020 Created.push_back(Elt: Op.getNode());
18021
18022 if (IsNegPow2) {
18023 Op = DAG.getNode(Opcode: ISD::SUB, DL, VT, N1: DAG.getConstant(Val: 0, DL, VT), N2: Op);
18024 Created.push_back(Elt: Op.getNode());
18025 }
18026
18027 return Op;
18028}
18029
18030//===----------------------------------------------------------------------===//
18031// Inline Assembly Support
18032//===----------------------------------------------------------------------===//
18033
18034void PPCTargetLowering::computeKnownBitsForTargetNode(const SDValue Op,
18035 KnownBits &Known,
18036 const APInt &DemandedElts,
18037 const SelectionDAG &DAG,
18038 unsigned Depth) const {
18039 Known.resetAll();
18040 switch (Op.getOpcode()) {
18041 default: break;
18042 case PPCISD::LBRX: {
18043 // lhbrx is known to have the top bits cleared out.
18044 if (cast<VTSDNode>(Val: Op.getOperand(i: 2))->getVT() == MVT::i16)
18045 Known.Zero = 0xFFFF0000;
18046 break;
18047 }
18048 case PPCISD::ADDE: {
18049 if (Op.getResNo() == 0) {
18050 // (0|1), _ = ADDE 0, 0, CARRY
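      // ADDE 0, 0, CARRY produces either 0 or 1, so every bit above the
      // lowest one is known to be zero.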
18051 SDValue LHS = Op.getOperand(i: 0);
18052 SDValue RHS = Op.getOperand(i: 1);
18053 if (isNullConstant(V: LHS) && isNullConstant(V: RHS))
18054 Known.Zero = ~1ULL;
18055 }
18056 break;
18057 }
18058 case ISD::INTRINSIC_WO_CHAIN: {
18059 switch (Op.getConstantOperandVal(i: 0)) {
18060 default: break;
18061 case Intrinsic::ppc_altivec_vcmpbfp_p:
18062 case Intrinsic::ppc_altivec_vcmpeqfp_p:
18063 case Intrinsic::ppc_altivec_vcmpequb_p:
18064 case Intrinsic::ppc_altivec_vcmpequh_p:
18065 case Intrinsic::ppc_altivec_vcmpequw_p:
18066 case Intrinsic::ppc_altivec_vcmpequd_p:
18067 case Intrinsic::ppc_altivec_vcmpequq_p:
18068 case Intrinsic::ppc_altivec_vcmpgefp_p:
18069 case Intrinsic::ppc_altivec_vcmpgtfp_p:
18070 case Intrinsic::ppc_altivec_vcmpgtsb_p:
18071 case Intrinsic::ppc_altivec_vcmpgtsh_p:
18072 case Intrinsic::ppc_altivec_vcmpgtsw_p:
18073 case Intrinsic::ppc_altivec_vcmpgtsd_p:
18074 case Intrinsic::ppc_altivec_vcmpgtsq_p:
18075 case Intrinsic::ppc_altivec_vcmpgtub_p:
18076 case Intrinsic::ppc_altivec_vcmpgtuh_p:
18077 case Intrinsic::ppc_altivec_vcmpgtuw_p:
18078 case Intrinsic::ppc_altivec_vcmpgtud_p:
18079 case Intrinsic::ppc_altivec_vcmpgtuq_p:
18080 Known.Zero = ~1U; // All bits but the low one are known to be zero.
18081 break;
18082 }
18083 break;
18084 }
18085 case ISD::INTRINSIC_W_CHAIN: {
18086 switch (Op.getConstantOperandVal(i: 1)) {
18087 default:
18088 break;
18089 case Intrinsic::ppc_load2r:
18090 // Top bits are cleared for load2r (which is the same as lhbrx).
18091 Known.Zero = 0xFFFF0000;
18092 break;
18093 }
18094 break;
18095 }
18096 }
18097}
18098
18099Align PPCTargetLowering::getPrefLoopAlignment(MachineLoop *ML) const {
18100 switch (Subtarget.getCPUDirective()) {
18101 default: break;
18102 case PPC::DIR_970:
18103 case PPC::DIR_PWR4:
18104 case PPC::DIR_PWR5:
18105 case PPC::DIR_PWR5X:
18106 case PPC::DIR_PWR6:
18107 case PPC::DIR_PWR6X:
18108 case PPC::DIR_PWR7:
18109 case PPC::DIR_PWR8:
18110 case PPC::DIR_PWR9:
18111 case PPC::DIR_PWR10:
18112 case PPC::DIR_PWR11:
18113 case PPC::DIR_PWR_FUTURE: {
18114 if (!ML)
18115 break;
18116
18117 if (!DisableInnermostLoopAlign32) {
      // If the nested loop is an innermost loop, prefer a 32-byte alignment,
18119 // so that we can decrease cache misses and branch-prediction misses.
18120 // Actual alignment of the loop will depend on the hotness check and other
18121 // logic in alignBlocks.
18122 if (ML->getLoopDepth() > 1 && ML->getSubLoops().empty())
18123 return Align(32);
18124 }
18125
18126 const PPCInstrInfo *TII = Subtarget.getInstrInfo();
18127
18128 // For small loops (between 5 and 8 instructions), align to a 32-byte
18129 // boundary so that the entire loop fits in one instruction-cache line.
18130 uint64_t LoopSize = 0;
18131 for (auto I = ML->block_begin(), IE = ML->block_end(); I != IE; ++I)
18132 for (const MachineInstr &J : **I) {
18133 LoopSize += TII->getInstSizeInBytes(MI: J);
18134 if (LoopSize > 32)
18135 break;
18136 }
18137
18138 if (LoopSize > 16 && LoopSize <= 32)
18139 return Align(32);
18140
18141 break;
18142 }
18143 }
18144
18145 return TargetLowering::getPrefLoopAlignment(ML);
18146}
18147
18148/// getConstraintType - Given a constraint, return the type of
18149/// constraint it is for this target.
18150PPCTargetLowering::ConstraintType
18151PPCTargetLowering::getConstraintType(StringRef Constraint) const {
18152 if (Constraint.size() == 1) {
18153 switch (Constraint[0]) {
18154 default: break;
18155 case 'b':
18156 case 'r':
18157 case 'f':
18158 case 'd':
18159 case 'v':
18160 case 'y':
18161 return C_RegisterClass;
18162 case 'Z':
18163 // FIXME: While Z does indicate a memory constraint, it specifically
18164 // indicates an r+r address (used in conjunction with the 'y' modifier
18165 // in the replacement string). Currently, we're forcing the base
18166 // register to be r0 in the asm printer (which is interpreted as zero)
18167 // and forming the complete address in the second register. This is
18168 // suboptimal.
18169 return C_Memory;
18170 }
18171 } else if (Constraint == "wc") { // individual CR bits.
18172 return C_RegisterClass;
18173 } else if (Constraint == "wa" || Constraint == "wd" ||
18174 Constraint == "wf" || Constraint == "ws" ||
18175 Constraint == "wi" || Constraint == "ww") {
18176 return C_RegisterClass; // VSX registers.
18177 }
18178 return TargetLowering::getConstraintType(Constraint);
18179}
18180
18181/// Examine constraint type and operand type and determine a weight value.
18182/// This object must already have been set up with the operand type
18183/// and the current alternative constraint selected.
18184TargetLowering::ConstraintWeight
18185PPCTargetLowering::getSingleConstraintMatchWeight(
18186 AsmOperandInfo &info, const char *constraint) const {
18187 ConstraintWeight weight = CW_Invalid;
18188 Value *CallOperandVal = info.CallOperandVal;
18189 // If we don't have a value, we can't do a match,
18190 // but allow it at the lowest weight.
18191 if (!CallOperandVal)
18192 return CW_Default;
18193 Type *type = CallOperandVal->getType();
18194
18195 // Look at the constraint type.
18196 if (StringRef(constraint) == "wc" && type->isIntegerTy(Bitwidth: 1))
18197 return CW_Register; // an individual CR bit.
18198 else if ((StringRef(constraint) == "wa" ||
18199 StringRef(constraint) == "wd" ||
18200 StringRef(constraint) == "wf") &&
18201 type->isVectorTy())
18202 return CW_Register;
18203 else if (StringRef(constraint) == "wi" && type->isIntegerTy(Bitwidth: 64))
18204    return CW_Register; // a VSX register that holds 64-bit integer data.
18205 else if (StringRef(constraint) == "ws" && type->isDoubleTy())
18206 return CW_Register;
18207 else if (StringRef(constraint) == "ww" && type->isFloatTy())
18208 return CW_Register;
18209
18210 switch (*constraint) {
18211 default:
18212 weight = TargetLowering::getSingleConstraintMatchWeight(info, constraint);
18213 break;
18214 case 'b':
18215 if (type->isIntegerTy())
18216 weight = CW_Register;
18217 break;
18218 case 'f':
18219 if (type->isFloatTy())
18220 weight = CW_Register;
18221 break;
18222 case 'd':
18223 if (type->isDoubleTy())
18224 weight = CW_Register;
18225 break;
18226 case 'v':
18227 if (type->isVectorTy())
18228 weight = CW_Register;
18229 break;
18230 case 'y':
18231 weight = CW_Register;
18232 break;
18233 case 'Z':
18234 weight = CW_Memory;
18235 break;
18236 }
18237 return weight;
18238}
18239
18240std::pair<unsigned, const TargetRegisterClass *>
18241PPCTargetLowering::getRegForInlineAsmConstraint(const TargetRegisterInfo *TRI,
18242 StringRef Constraint,
18243 MVT VT) const {
18244 if (Constraint.size() == 1) {
18245 // GCC RS6000 Constraint Letters
18246 switch (Constraint[0]) {
18247 case 'b': // R1-R31
18248 if (VT == MVT::i64 && Subtarget.isPPC64())
18249 return std::make_pair(x: 0U, y: &PPC::G8RC_NOX0RegClass);
18250 return std::make_pair(x: 0U, y: &PPC::GPRC_NOR0RegClass);
18251 case 'r': // R0-R31
18252 if (VT == MVT::i64 && Subtarget.isPPC64())
18253 return std::make_pair(x: 0U, y: &PPC::G8RCRegClass);
18254 return std::make_pair(x: 0U, y: &PPC::GPRCRegClass);
18255 // 'd' and 'f' constraints are both defined to be "the floating point
18256    // registers", where one is for 32-bit values and the other for 64-bit. We
18257    // don't distinguish between them here, so give them the same reg classes.
18258 case 'd':
18259 case 'f':
18260 if (Subtarget.hasSPE()) {
18261 if (VT == MVT::f32 || VT == MVT::i32)
18262 return std::make_pair(x: 0U, y: &PPC::GPRCRegClass);
18263 if (VT == MVT::f64 || VT == MVT::i64)
18264 return std::make_pair(x: 0U, y: &PPC::SPERCRegClass);
18265 } else {
18266 if (VT == MVT::f32 || VT == MVT::i32)
18267 return std::make_pair(x: 0U, y: &PPC::F4RCRegClass);
18268 if (VT == MVT::f64 || VT == MVT::i64)
18269 return std::make_pair(x: 0U, y: &PPC::F8RCRegClass);
18270 }
18271 break;
18272 case 'v':
18273 if (Subtarget.hasAltivec() && VT.isVector())
18274 return std::make_pair(x: 0U, y: &PPC::VRRCRegClass);
18275 else if (Subtarget.hasVSX())
18276 // Scalars in Altivec registers only make sense with VSX.
18277 return std::make_pair(x: 0U, y: &PPC::VFRCRegClass);
18278 break;
18279 case 'y': // crrc
18280 return std::make_pair(x: 0U, y: &PPC::CRRCRegClass);
18281 }
18282 } else if (Constraint == "wc" && Subtarget.useCRBits()) {
18283 // An individual CR bit.
18284 return std::make_pair(x: 0U, y: &PPC::CRBITRCRegClass);
18285 } else if ((Constraint == "wa" || Constraint == "wd" ||
18286 Constraint == "wf" || Constraint == "wi") &&
18287 Subtarget.hasVSX()) {
18288 // A VSX register for either a scalar (FP) or vector. There is no
18289 // support for single precision scalars on subtargets prior to Power8.
18290 if (VT.isVector())
18291 return std::make_pair(x: 0U, y: &PPC::VSRCRegClass);
18292 if (VT == MVT::f32 && Subtarget.hasP8Vector())
18293 return std::make_pair(x: 0U, y: &PPC::VSSRCRegClass);
18294 return std::make_pair(x: 0U, y: &PPC::VSFRCRegClass);
18295 } else if ((Constraint == "ws" || Constraint == "ww") && Subtarget.hasVSX()) {
18296 if (VT == MVT::f32 && Subtarget.hasP8Vector())
18297 return std::make_pair(x: 0U, y: &PPC::VSSRCRegClass);
18298 else
18299 return std::make_pair(x: 0U, y: &PPC::VSFRCRegClass);
18300 } else if (Constraint == "lr") {
18301 if (VT == MVT::i64)
18302 return std::make_pair(x: 0U, y: &PPC::LR8RCRegClass);
18303 else
18304 return std::make_pair(x: 0U, y: &PPC::LRRCRegClass);
18305 }
18306
18307 // Handle special cases of physical registers that are not properly handled
18308 // by the base class.
18309 if (Constraint[0] == '{' && Constraint[Constraint.size() - 1] == '}') {
18310 // If we name a VSX register, we can't defer to the base class because it
18311 // will not recognize the correct register (their names will be VSL{0-31}
18312 // and V{0-31} so they won't match). So we match them here.
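    // For example, the constraint "{vs35}" selects VSX register 35, which is
    // the Altivec register V3 (VSX registers 32-63 overlap V0-V31).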
18313 if (Constraint.size() > 3 && Constraint[1] == 'v' && Constraint[2] == 's') {
18314 int VSNum = atoi(nptr: Constraint.data() + 3);
18315 assert(VSNum >= 0 && VSNum <= 63 &&
18316 "Attempted to access a vsr out of range");
18317 if (VSNum < 32)
18318 return std::make_pair(x: PPC::VSL0 + VSNum, y: &PPC::VSRCRegClass);
18319 return std::make_pair(x: PPC::V0 + VSNum - 32, y: &PPC::VSRCRegClass);
18320 }
18321
18322 // For float registers, we can't defer to the base class as it will match
18323 // the SPILLTOVSRRC class.
18324 if (Constraint.size() > 3 && Constraint[1] == 'f') {
18325 int RegNum = atoi(nptr: Constraint.data() + 2);
18326 if (RegNum > 31 || RegNum < 0)
18327 report_fatal_error(reason: "Invalid floating point register number");
18328 if (VT == MVT::f32 || VT == MVT::i32)
18329 return Subtarget.hasSPE()
18330 ? std::make_pair(x: PPC::R0 + RegNum, y: &PPC::GPRCRegClass)
18331 : std::make_pair(x: PPC::F0 + RegNum, y: &PPC::F4RCRegClass);
18332 if (VT == MVT::f64 || VT == MVT::i64)
18333 return Subtarget.hasSPE()
18334 ? std::make_pair(x: PPC::S0 + RegNum, y: &PPC::SPERCRegClass)
18335 : std::make_pair(x: PPC::F0 + RegNum, y: &PPC::F8RCRegClass);
18336 }
18337 }
18338
18339 std::pair<unsigned, const TargetRegisterClass *> R =
18340 TargetLowering::getRegForInlineAsmConstraint(TRI, Constraint, VT);
18341
18342 // r[0-9]+ are used, on PPC64, to refer to the corresponding 64-bit registers
18343 // (which we call X[0-9]+). If a 64-bit value has been requested, and a
18344 // 32-bit GPR has been selected, then 'upgrade' it to the 64-bit parent
18345 // register.
18346 // FIXME: If TargetLowering::getRegForInlineAsmConstraint could somehow use
18347 // the AsmName field from *RegisterInfo.td, then this would not be necessary.
18348 if (R.first && VT == MVT::i64 && Subtarget.isPPC64() &&
18349 PPC::GPRCRegClass.contains(Reg: R.first))
18350 return std::make_pair(x: TRI->getMatchingSuperReg(Reg: R.first,
18351 SubIdx: PPC::sub_32, RC: &PPC::G8RCRegClass),
18352 y: &PPC::G8RCRegClass);
18353
18354 // GCC accepts 'cc' as an alias for 'cr0', and we need to do the same.
18355 if (!R.second && StringRef("{cc}").equals_insensitive(RHS: Constraint)) {
18356 R.first = PPC::CR0;
18357 R.second = &PPC::CRRCRegClass;
18358 }
18359 // FIXME: This warning should ideally be emitted in the front end.
18360 const auto &TM = getTargetMachine();
18361 if (Subtarget.isAIXABI() && !TM.getAIXExtendedAltivecABI()) {
18362 if (((R.first >= PPC::V20 && R.first <= PPC::V31) ||
18363 (R.first >= PPC::VF20 && R.first <= PPC::VF31)) &&
18364 (R.second == &PPC::VSRCRegClass || R.second == &PPC::VSFRCRegClass))
18365      errs() << "warning: vector registers 20 to 31 are reserved in the "
18366                "default AIX AltiVec ABI and cannot be used\n";
18367 }
18368
18369 return R;
18370}
18371
18372/// LowerAsmOperandForConstraint - Lower the specified operand into the Ops
18373/// vector. If it is invalid, don't add anything to Ops.
18374void PPCTargetLowering::LowerAsmOperandForConstraint(SDValue Op,
18375 StringRef Constraint,
18376 std::vector<SDValue> &Ops,
18377 SelectionDAG &DAG) const {
18378 SDValue Result;
18379
18380 // Only support length 1 constraints.
18381 if (Constraint.size() > 1)
18382 return;
18383
18384 char Letter = Constraint[0];
18385 switch (Letter) {
18386 default: break;
18387 case 'I':
18388 case 'J':
18389 case 'K':
18390 case 'L':
18391 case 'M':
18392 case 'N':
18393 case 'O':
18394 case 'P': {
18395 ConstantSDNode *CST = dyn_cast<ConstantSDNode>(Val&: Op);
18396 if (!CST) return; // Must be an immediate to match.
18397 SDLoc dl(Op);
18398 int64_t Value = CST->getSExtValue();
18399 EVT TCVT = MVT::i64; // All constants taken to be 64 bits so that negative
18400 // numbers are printed as such.
18401 switch (Letter) {
18402 default: llvm_unreachable("Unknown constraint letter!");
18403 case 'I': // "I" is a signed 16-bit constant.
18404 if (isInt<16>(x: Value))
18405 Result = DAG.getTargetConstant(Val: Value, DL: dl, VT: TCVT);
18406 break;
18407 case 'J': // "J" is a constant with only the high-order 16 bits nonzero.
18408 if (isShiftedUInt<16, 16>(x: Value))
18409 Result = DAG.getTargetConstant(Val: Value, DL: dl, VT: TCVT);
18410 break;
18411 case 'L': // "L" is a signed 16-bit constant shifted left 16 bits.
18412 if (isShiftedInt<16, 16>(x: Value))
18413 Result = DAG.getTargetConstant(Val: Value, DL: dl, VT: TCVT);
18414 break;
18415 case 'K': // "K" is a constant with only the low-order 16 bits nonzero.
18416 if (isUInt<16>(x: Value))
18417 Result = DAG.getTargetConstant(Val: Value, DL: dl, VT: TCVT);
18418 break;
18419 case 'M': // "M" is a constant that is greater than 31.
18420 if (Value > 31)
18421 Result = DAG.getTargetConstant(Val: Value, DL: dl, VT: TCVT);
18422 break;
18423 case 'N': // "N" is a positive constant that is an exact power of two.
18424 if (Value > 0 && isPowerOf2_64(Value))
18425 Result = DAG.getTargetConstant(Val: Value, DL: dl, VT: TCVT);
18426 break;
18427 case 'O': // "O" is the constant zero.
18428 if (Value == 0)
18429 Result = DAG.getTargetConstant(Val: Value, DL: dl, VT: TCVT);
18430 break;
18431 case 'P': // "P" is a constant whose negation is a signed 16-bit constant.
18432 if (isInt<16>(x: -Value))
18433 Result = DAG.getTargetConstant(Val: Value, DL: dl, VT: TCVT);
18434 break;
18435 }
18436 break;
18437 }
18438 }
18439
18440 if (Result.getNode()) {
18441 Ops.push_back(x: Result);
18442 return;
18443 }
18444
18445 // Handle standard constraint letters.
18446 TargetLowering::LowerAsmOperandForConstraint(Op, Constraint, Ops, DAG);
18447}
18448
18449void PPCTargetLowering::CollectTargetIntrinsicOperands(const CallInst &I,
18450 SmallVectorImpl<SDValue> &Ops,
18451 SelectionDAG &DAG) const {
18452 if (I.getNumOperands() <= 1)
18453 return;
18454 if (!isa<ConstantSDNode>(Val: Ops[1].getNode()))
18455 return;
18456 auto IntrinsicID = Ops[1].getNode()->getAsZExtVal();
18457 if (IntrinsicID != Intrinsic::ppc_tdw && IntrinsicID != Intrinsic::ppc_tw &&
18458 IntrinsicID != Intrinsic::ppc_trapd && IntrinsicID != Intrinsic::ppc_trap)
18459 return;
18460
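  // Forward any !annotation metadata attached to the trap intrinsic as an
  // extra operand, which appears to let later lowering keep the annotation
  // associated with the generated trap.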
18461 if (MDNode *MDN = I.getMetadata(KindID: LLVMContext::MD_annotation))
18462 Ops.push_back(Elt: DAG.getMDNode(MD: MDN));
18463}
18464
18465// isLegalAddressingMode - Return true if the addressing mode represented
18466// by AM is legal for this target, for a load/store of the specified type.
18467bool PPCTargetLowering::isLegalAddressingMode(const DataLayout &DL,
18468 const AddrMode &AM, Type *Ty,
18469 unsigned AS,
18470 Instruction *I) const {
18471  // The vector r+i form is supported as the DQ form starting with Power9. We
18472  // don't check the DQ-form offset requirement (off % 16 == 0) because on
18473  // PowerPC the immediate form is preferred and the offset can be adjusted
18474  // later by the PPCLoopInstrFormPrep pass. Also, LSR uses one LSRUse's min
18475  // and max offsets to check addressing-mode legality, so be a bit aggressive
18476  // here and accept the other offsets for that LSRUse.
18477 if (Ty->isVectorTy() && AM.BaseOffs != 0 && !Subtarget.hasP9Vector())
18478 return false;
18479
18480 // PPC allows a sign-extended 16-bit immediate field.
18481 if (AM.BaseOffs <= -(1LL << 16) || AM.BaseOffs >= (1LL << 16)-1)
18482 return false;
18483
18484 // No global is ever allowed as a base.
18485 if (AM.BaseGV)
18486 return false;
18487
18488  // PPC only supports r+r or r+i addressing.
18489 switch (AM.Scale) {
18490 case 0: // "r+i" or just "i", depending on HasBaseReg.
18491 break;
18492 case 1:
18493 if (AM.HasBaseReg && AM.BaseOffs) // "r+r+i" is not allowed.
18494 return false;
18495 // Otherwise we have r+r or r+i.
18496 break;
18497 case 2:
18498 if (AM.HasBaseReg || AM.BaseOffs) // 2*r+r or 2*r+i is not allowed.
18499 return false;
18500 // Allow 2*r as r+r.
18501 break;
18502 default:
18503 // No other scales are supported.
18504 return false;
18505 }
18506
18507 return true;
18508}
18509
18510SDValue PPCTargetLowering::LowerRETURNADDR(SDValue Op,
18511 SelectionDAG &DAG) const {
18512 MachineFunction &MF = DAG.getMachineFunction();
18513 MachineFrameInfo &MFI = MF.getFrameInfo();
18514 MFI.setReturnAddressIsTaken(true);
18515
18516 SDLoc dl(Op);
18517 unsigned Depth = Op.getConstantOperandVal(i: 0);
18518
18519 // Make sure the function does not optimize away the store of the RA to
18520 // the stack.
18521 PPCFunctionInfo *FuncInfo = MF.getInfo<PPCFunctionInfo>();
18522 FuncInfo->setLRStoreRequired();
18523 auto PtrVT = getPointerTy(DL: MF.getDataLayout());
18524
18525 if (Depth > 0) {
18526 // The link register (return address) is saved in the caller's frame
18527 // not the callee's stack frame. So we must get the caller's frame
18528 // address and load the return address at the LR offset from there.
18529 SDValue FrameAddr =
18530 DAG.getLoad(VT: Op.getValueType(), dl, Chain: DAG.getEntryNode(),
18531 Ptr: LowerFRAMEADDR(Op, DAG), PtrInfo: MachinePointerInfo());
18532 SDValue Offset =
18533 DAG.getConstant(Val: Subtarget.getFrameLowering()->getReturnSaveOffset(), DL: dl,
18534 VT: Subtarget.getScalarIntVT());
18535 return DAG.getLoad(VT: PtrVT, dl, Chain: DAG.getEntryNode(),
18536 Ptr: DAG.getNode(Opcode: ISD::ADD, DL: dl, VT: PtrVT, N1: FrameAddr, N2: Offset),
18537 PtrInfo: MachinePointerInfo());
18538 }
18539
18540 // Just load the return address off the stack.
18541 SDValue RetAddrFI = getReturnAddrFrameIndex(DAG);
18542 return DAG.getLoad(VT: PtrVT, dl, Chain: DAG.getEntryNode(), Ptr: RetAddrFI,
18543 PtrInfo: MachinePointerInfo());
18544}
18545
18546SDValue PPCTargetLowering::LowerFRAMEADDR(SDValue Op,
18547 SelectionDAG &DAG) const {
18548 SDLoc dl(Op);
18549 unsigned Depth = Op.getConstantOperandVal(i: 0);
18550
18551 MachineFunction &MF = DAG.getMachineFunction();
18552 MachineFrameInfo &MFI = MF.getFrameInfo();
18553 MFI.setFrameAddressIsTaken(true);
18554
18555 EVT PtrVT = getPointerTy(DL: MF.getDataLayout());
18556 bool isPPC64 = PtrVT == MVT::i64;
18557
18558 // Naked functions never have a frame pointer, and so we use r1. For all
18559  // other functions, this decision must be delayed until PEI.
18560 unsigned FrameReg;
18561 if (MF.getFunction().hasFnAttribute(Kind: Attribute::Naked))
18562 FrameReg = isPPC64 ? PPC::X1 : PPC::R1;
18563 else
18564 FrameReg = isPPC64 ? PPC::FP8 : PPC::FP;
18565
18566 SDValue FrameAddr = DAG.getCopyFromReg(Chain: DAG.getEntryNode(), dl, Reg: FrameReg,
18567 VT: PtrVT);
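  // Each frame's back-chain word (at offset 0 of the frame) points to the
  // caller's frame, so walking up Depth frames is just Depth chained loads.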
18568 while (Depth--)
18569 FrameAddr = DAG.getLoad(VT: Op.getValueType(), dl, Chain: DAG.getEntryNode(),
18570 Ptr: FrameAddr, PtrInfo: MachinePointerInfo());
18571 return FrameAddr;
18572}
18573
18574#define GET_REGISTER_MATCHER
18575#include "PPCGenAsmMatcher.inc"
18576
18577Register PPCTargetLowering::getRegisterByName(const char *RegName, LLT VT,
18578 const MachineFunction &MF) const {
18579 bool IsPPC64 = Subtarget.isPPC64();
18580
18581 bool Is64Bit = IsPPC64 && VT == LLT::scalar(SizeInBits: 64);
18582 if (!Is64Bit && VT != LLT::scalar(SizeInBits: 32))
18583 report_fatal_error(reason: "Invalid register global variable type");
18584
18585 Register Reg = MatchRegisterName(Name: RegName);
18586 if (!Reg)
18587 return Reg;
18588
18589 // FIXME: Unable to generate code for `-O2` but okay for `-O0`.
18590  // Needs follow-up investigation as to why.
18591 if ((IsPPC64 && Reg == PPC::R2) || Reg == PPC::R0)
18592 report_fatal_error(reason: Twine("Trying to reserve an invalid register \"" +
18593 StringRef(RegName) + "\"."));
18594
18595  // Convert the GPR to the corresponding 64-bit (G8RC) register.
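  // For example, "r5" matches PPC::R5, which becomes PPC::X5 for a 64-bit
  // query.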
18596 if (Is64Bit && StringRef(RegName).starts_with_insensitive(Prefix: "r"))
18597 Reg = Reg.id() - PPC::R0 + PPC::X0;
18598
18599 return Reg;
18600}
18601
18602bool PPCTargetLowering::isAccessedAsGotIndirect(SDValue GA) const {
18603  // The 32-bit SVR4 ABI accesses everything as GOT-indirect.
18604 if (Subtarget.is32BitELFABI())
18605 return true;
18606
18607 // AIX accesses everything indirectly through the TOC, which is similar to
18608 // the GOT.
18609 if (Subtarget.isAIXABI())
18610 return true;
18611
18612 CodeModel::Model CModel = getTargetMachine().getCodeModel();
18613 // If it is small or large code model, module locals are accessed
18614 // indirectly by loading their address from .toc/.got.
18615 if (CModel == CodeModel::Small || CModel == CodeModel::Large)
18616 return true;
18617
18618 // JumpTable and BlockAddress are accessed as got-indirect.
18619 if (isa<JumpTableSDNode>(Val: GA) || isa<BlockAddressSDNode>(Val: GA))
18620 return true;
18621
18622 if (GlobalAddressSDNode *G = dyn_cast<GlobalAddressSDNode>(Val&: GA))
18623 return Subtarget.isGVIndirectSymbol(GV: G->getGlobal());
18624
18625 return false;
18626}
18627
18628bool
18629PPCTargetLowering::isOffsetFoldingLegal(const GlobalAddressSDNode *GA) const {
18630 // The PowerPC target isn't yet aware of offsets.
18631 return false;
18632}
18633
18634void PPCTargetLowering::getTgtMemIntrinsic(
18635 SmallVectorImpl<IntrinsicInfo> &Infos, const CallBase &I,
18636 MachineFunction &MF, unsigned Intrinsic) const {
18637 IntrinsicInfo Info;
18638 switch (Intrinsic) {
18639 case Intrinsic::ppc_atomicrmw_xchg_i128:
18640 case Intrinsic::ppc_atomicrmw_add_i128:
18641 case Intrinsic::ppc_atomicrmw_sub_i128:
18642 case Intrinsic::ppc_atomicrmw_nand_i128:
18643 case Intrinsic::ppc_atomicrmw_and_i128:
18644 case Intrinsic::ppc_atomicrmw_or_i128:
18645 case Intrinsic::ppc_atomicrmw_xor_i128:
18646 case Intrinsic::ppc_cmpxchg_i128:
18647 Info.opc = ISD::INTRINSIC_W_CHAIN;
18648 Info.memVT = MVT::i128;
18649 Info.ptrVal = I.getArgOperand(i: 0);
18650 Info.offset = 0;
18651 Info.align = Align(16);
18652 Info.flags = MachineMemOperand::MOLoad | MachineMemOperand::MOStore |
18653 MachineMemOperand::MOVolatile;
18654 Infos.push_back(Elt: Info);
18655 return;
18656 case Intrinsic::ppc_atomic_load_i128:
18657 Info.opc = ISD::INTRINSIC_W_CHAIN;
18658 Info.memVT = MVT::i128;
18659 Info.ptrVal = I.getArgOperand(i: 0);
18660 Info.offset = 0;
18661 Info.align = Align(16);
18662 Info.flags = MachineMemOperand::MOLoad | MachineMemOperand::MOVolatile;
18663 Infos.push_back(Elt: Info);
18664 return;
18665 case Intrinsic::ppc_atomic_store_i128:
18666 Info.opc = ISD::INTRINSIC_VOID;
18667 Info.memVT = MVT::i128;
18668 Info.ptrVal = I.getArgOperand(i: 2);
18669 Info.offset = 0;
18670 Info.align = Align(16);
18671 Info.flags = MachineMemOperand::MOStore | MachineMemOperand::MOVolatile;
18672 Infos.push_back(Elt: Info);
18673 return;
18674 case Intrinsic::ppc_altivec_lvx:
18675 case Intrinsic::ppc_altivec_lvxl:
18676 case Intrinsic::ppc_altivec_lvebx:
18677 case Intrinsic::ppc_altivec_lvehx:
18678 case Intrinsic::ppc_altivec_lvewx:
18679 case Intrinsic::ppc_vsx_lxvd2x:
18680 case Intrinsic::ppc_vsx_lxvw4x:
18681 case Intrinsic::ppc_vsx_lxvd2x_be:
18682 case Intrinsic::ppc_vsx_lxvw4x_be:
18683 case Intrinsic::ppc_vsx_lxvl:
18684 case Intrinsic::ppc_vsx_lxvll: {
18685 EVT VT;
18686 switch (Intrinsic) {
18687 case Intrinsic::ppc_altivec_lvebx:
18688 VT = MVT::i8;
18689 break;
18690 case Intrinsic::ppc_altivec_lvehx:
18691 VT = MVT::i16;
18692 break;
18693 case Intrinsic::ppc_altivec_lvewx:
18694 VT = MVT::i32;
18695 break;
18696 case Intrinsic::ppc_vsx_lxvd2x:
18697 case Intrinsic::ppc_vsx_lxvd2x_be:
18698 VT = MVT::v2f64;
18699 break;
18700 default:
18701 VT = MVT::v4i32;
18702 break;
18703 }
18704
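    // Some of these intrinsics (e.g. lvx) implicitly align the address down,
    // so the bytes actually touched may begin below the given pointer.
    // Conservatively describe a 2*StoreSize-1 byte window centered on it.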
18705 Info.opc = ISD::INTRINSIC_W_CHAIN;
18706 Info.memVT = VT;
18707 Info.ptrVal = I.getArgOperand(i: 0);
18708 Info.offset = -VT.getStoreSize()+1;
18709 Info.size = 2*VT.getStoreSize()-1;
18710 Info.align = Align(1);
18711 Info.flags = MachineMemOperand::MOLoad;
18712 Infos.push_back(Elt: Info);
18713 return;
18714 }
18715 case Intrinsic::ppc_altivec_stvx:
18716 case Intrinsic::ppc_altivec_stvxl:
18717 case Intrinsic::ppc_altivec_stvebx:
18718 case Intrinsic::ppc_altivec_stvehx:
18719 case Intrinsic::ppc_altivec_stvewx:
18720 case Intrinsic::ppc_vsx_stxvd2x:
18721 case Intrinsic::ppc_vsx_stxvw4x:
18722 case Intrinsic::ppc_vsx_stxvd2x_be:
18723 case Intrinsic::ppc_vsx_stxvw4x_be:
18724 case Intrinsic::ppc_vsx_stxvl:
18725 case Intrinsic::ppc_vsx_stxvll: {
18726 EVT VT;
18727 switch (Intrinsic) {
18728 case Intrinsic::ppc_altivec_stvebx:
18729 VT = MVT::i8;
18730 break;
18731 case Intrinsic::ppc_altivec_stvehx:
18732 VT = MVT::i16;
18733 break;
18734 case Intrinsic::ppc_altivec_stvewx:
18735 VT = MVT::i32;
18736 break;
18737 case Intrinsic::ppc_vsx_stxvd2x:
18738 case Intrinsic::ppc_vsx_stxvd2x_be:
18739 VT = MVT::v2f64;
18740 break;
18741 default:
18742 VT = MVT::v4i32;
18743 break;
18744 }
18745
18746 Info.opc = ISD::INTRINSIC_VOID;
18747 Info.memVT = VT;
18748 Info.ptrVal = I.getArgOperand(i: 1);
18749 Info.offset = -VT.getStoreSize()+1;
18750 Info.size = 2*VT.getStoreSize()-1;
18751 Info.align = Align(1);
18752 Info.flags = MachineMemOperand::MOStore;
18753 Infos.push_back(Elt: Info);
18754 return;
18755 }
18756 case Intrinsic::ppc_stdcx:
18757 case Intrinsic::ppc_stwcx:
18758 case Intrinsic::ppc_sthcx:
18759 case Intrinsic::ppc_stbcx: {
18760 EVT VT;
18761 auto Alignment = Align(8);
18762 switch (Intrinsic) {
18763 case Intrinsic::ppc_stdcx:
18764 VT = MVT::i64;
18765 break;
18766 case Intrinsic::ppc_stwcx:
18767 VT = MVT::i32;
18768 Alignment = Align(4);
18769 break;
18770 case Intrinsic::ppc_sthcx:
18771 VT = MVT::i16;
18772 Alignment = Align(2);
18773 break;
18774 case Intrinsic::ppc_stbcx:
18775 VT = MVT::i8;
18776 Alignment = Align(1);
18777 break;
18778 }
18779 Info.opc = ISD::INTRINSIC_W_CHAIN;
18780 Info.memVT = VT;
18781 Info.ptrVal = I.getArgOperand(i: 0);
18782 Info.offset = 0;
18783 Info.align = Alignment;
18784 Info.flags = MachineMemOperand::MOStore | MachineMemOperand::MOVolatile;
18785 Infos.push_back(Elt: Info);
18786 return;
18787 }
18788 default:
18789 break;
18790 }
18791}
18792
18793/// It returns EVT::Other if the type should be determined using generic
18794/// target-independent logic.
18795EVT PPCTargetLowering::getOptimalMemOpType(
18796 LLVMContext &Context, const MemOp &Op,
18797 const AttributeList &FuncAttributes) const {
18798 if (getTargetMachine().getOptLevel() != CodeGenOptLevel::None) {
18799 // We should use Altivec/VSX loads and stores when available. For unaligned
18800 // addresses, unaligned VSX loads are only fast starting with the P8.
18801 if (Subtarget.hasAltivec() && Op.size() >= 16) {
18802 if (Op.isMemset() && Subtarget.hasVSX()) {
18803 uint64_t TailSize = Op.size() % 16;
18804        // For memset lowering, EXTRACT_VECTOR_ELT tries to return a constant
18805        // element if the vector element type matches the tail store. For a tail
18806        // size of 3 or 4 the tail store is i32, so v4i32 cannot be used here.
18807 if (TailSize > 2 && TailSize <= 4) {
18808 return MVT::v8i16;
18809 }
18810 return MVT::v4i32;
18811 }
18812 if (Op.isAligned(AlignCheck: Align(16)) || Subtarget.hasP8Vector())
18813 return MVT::v4i32;
18814 }
18815 }
18816
18817 if (Subtarget.isPPC64()) {
18818 return MVT::i64;
18819 }
18820
18821 return MVT::i32;
18822}
18823
18824/// Returns true if it is beneficial to convert a load of a constant
18825/// to just the constant itself.
18826bool PPCTargetLowering::shouldConvertConstantLoadToIntImm(const APInt &Imm,
18827 Type *Ty) const {
18828 assert(Ty->isIntegerTy());
18829
18830 unsigned BitSize = Ty->getPrimitiveSizeInBits();
18831 return !(BitSize == 0 || BitSize > 64);
18832}
18833
18834bool PPCTargetLowering::isTruncateFree(Type *Ty1, Type *Ty2) const {
18835 if (!Ty1->isIntegerTy() || !Ty2->isIntegerTy())
18836 return false;
18837 unsigned NumBits1 = Ty1->getPrimitiveSizeInBits();
18838 unsigned NumBits2 = Ty2->getPrimitiveSizeInBits();
18839 return NumBits1 == 64 && NumBits2 == 32;
18840}
18841
18842bool PPCTargetLowering::isTruncateFree(EVT VT1, EVT VT2) const {
18843 if (!VT1.isInteger() || !VT2.isInteger())
18844 return false;
18845 unsigned NumBits1 = VT1.getSizeInBits();
18846 unsigned NumBits2 = VT2.getSizeInBits();
18847 return NumBits1 == 64 && NumBits2 == 32;
18848}
18849
18850bool PPCTargetLowering::isZExtFree(SDValue Val, EVT VT2) const {
18851 // Generally speaking, zexts are not free, but they are free when they can be
18852 // folded with other operations.
18853 if (LoadSDNode *LD = dyn_cast<LoadSDNode>(Val)) {
18854 EVT MemVT = LD->getMemoryVT();
18855 if ((MemVT == MVT::i1 || MemVT == MVT::i8 || MemVT == MVT::i16 ||
18856 (Subtarget.isPPC64() && MemVT == MVT::i32)) &&
18857 (LD->getExtensionType() == ISD::NON_EXTLOAD ||
18858 LD->getExtensionType() == ISD::ZEXTLOAD))
18859 return true;
18860 }
18861
18862 // FIXME: Add other cases...
18863 // - 32-bit shifts with a zext to i64
18864 // - zext after ctlz, bswap, etc.
18865 // - zext after and by a constant mask
18866
18867 return TargetLowering::isZExtFree(Val, VT2);
18868}
18869
18870bool PPCTargetLowering::isFPExtFree(EVT DestVT, EVT SrcVT) const {
18871 assert(DestVT.isFloatingPoint() && SrcVT.isFloatingPoint() &&
18872 "invalid fpext types");
18873 // Extending to float128 is not free.
18874 if (DestVT == MVT::f128)
18875 return false;
18876 return true;
18877}
18878
18879bool PPCTargetLowering::isLegalICmpImmediate(int64_t Imm) const {
18880 return isInt<16>(x: Imm) || isUInt<16>(x: Imm);
18881}
18882
18883bool PPCTargetLowering::isLegalAddImmediate(int64_t Imm) const {
18884 return isInt<16>(x: Imm) || isUInt<16>(x: Imm);
18885}
18886
18887bool PPCTargetLowering::allowsMisalignedMemoryAccesses(EVT VT, unsigned, Align,
18888 MachineMemOperand::Flags,
18889 unsigned *Fast) const {
18890 if (DisablePPCUnaligned)
18891 return false;
18892
18893 // PowerPC supports unaligned memory access for simple non-vector types.
18894 // Although accessing unaligned addresses is not as efficient as accessing
18895 // aligned addresses, it is generally more efficient than manual expansion,
18896  // and it generally only traps into software emulation when an access
18897  // crosses a page boundary.
18898
18899 if (!VT.isSimple())
18900 return false;
18901
18902 if (VT.isFloatingPoint() && !VT.isVector() &&
18903 !Subtarget.allowsUnalignedFPAccess())
18904 return false;
18905
18906 if (VT.getSimpleVT().isVector()) {
18907 if (Subtarget.hasVSX()) {
18908 if (VT != MVT::v2f64 && VT != MVT::v2i64 &&
18909 VT != MVT::v4f32 && VT != MVT::v4i32)
18910 return false;
18911 } else {
18912 return false;
18913 }
18914 }
18915
18916 if (VT == MVT::ppcf128)
18917 return false;
18918
18919 if (Fast)
18920 *Fast = 1;
18921
18922 return true;
18923}
18924
18925bool PPCTargetLowering::decomposeMulByConstant(LLVMContext &Context, EVT VT,
18926 SDValue C) const {
18927 // Check integral scalar types.
18928 if (!VT.isScalarInteger())
18929 return false;
18930 if (auto *ConstNode = dyn_cast<ConstantSDNode>(Val: C.getNode())) {
18931 if (!ConstNode->getAPIntValue().isSignedIntN(N: 64))
18932 return false;
18933    // This transformation will generate >= 2 operations. But the following
18934    // cases will generate <= 2 instructions during ISEL, so exclude them:
18935    // 1. If the constant multiplier fits in 16 bits, it can be handled by a
18936    //    single instruction, i.e. MULLI.
18937    // 2. If the multiplier fits in 16 bits after shifting out trailing zeros,
18938    //    only one extra shift is needed, i.e. MULLI and RLDICR.
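    // For example, a multiply by 65537 (2^16 + 1) cannot use a single MULLI,
    // but since 65537 - 1 is a power of two it decomposes into
    // (add (shl x, 16), x), so we return true for it.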
18939 int64_t Imm = ConstNode->getSExtValue();
18940 unsigned Shift = llvm::countr_zero<uint64_t>(Val: Imm);
18941 Imm >>= Shift;
18942 if (isInt<16>(x: Imm))
18943 return false;
18944 uint64_t UImm = static_cast<uint64_t>(Imm);
18945 if (isPowerOf2_64(Value: UImm + 1) || isPowerOf2_64(Value: UImm - 1) ||
18946 isPowerOf2_64(Value: 1 - UImm) || isPowerOf2_64(Value: -1 - UImm))
18947 return true;
18948 }
18949 return false;
18950}
18951
18952bool PPCTargetLowering::isFMAFasterThanFMulAndFAdd(const MachineFunction &MF,
18953 EVT VT) const {
18954 return isFMAFasterThanFMulAndFAdd(
18955 F: MF.getFunction(), Ty: VT.getTypeForEVT(Context&: MF.getFunction().getContext()));
18956}
18957
18958bool PPCTargetLowering::isFMAFasterThanFMulAndFAdd(const Function &F,
18959 Type *Ty) const {
18960 if (Subtarget.hasSPE() || Subtarget.useSoftFloat())
18961 return false;
18962 switch (Ty->getScalarType()->getTypeID()) {
18963 case Type::FloatTyID:
18964 case Type::DoubleTyID:
18965 return true;
18966 case Type::FP128TyID:
18967 return Subtarget.hasP9Vector();
18968 default:
18969 return false;
18970 }
18971}
18972
18973// FIXME: add more patterns which are not profitable to hoist.
18974bool PPCTargetLowering::isProfitableToHoist(Instruction *I) const {
18975 if (!I->hasOneUse())
18976 return true;
18977
18978 Instruction *User = I->user_back();
18979 assert(User && "A single use instruction with no uses.");
18980
18981 switch (I->getOpcode()) {
18982 case Instruction::FMul: {
18983 // Don't break FMA, PowerPC prefers FMA.
18984 if (User->getOpcode() != Instruction::FSub &&
18985 User->getOpcode() != Instruction::FAdd)
18986 return true;
18987
18988 const TargetOptions &Options = getTargetMachine().Options;
18989 const Function *F = I->getFunction();
18990 const DataLayout &DL = F->getDataLayout();
18991 Type *Ty = User->getOperand(i: 0)->getType();
18992 bool AllowContract = I->getFastMathFlags().allowContract() &&
18993 User->getFastMathFlags().allowContract();
18994
18995 return !(isFMAFasterThanFMulAndFAdd(F: *F, Ty) &&
18996 isOperationLegalOrCustom(Op: ISD::FMA, VT: getValueType(DL, Ty)) &&
18997 (AllowContract || Options.AllowFPOpFusion == FPOpFusion::Fast));
18998 }
18999 case Instruction::Load: {
19000    // Don't break the "store (load float*)" pattern; it will be combined into
19001    // "store (load int32)" by a later InstCombine pass (see function
19002    // combineLoadToOperationType). On PowerPC, loading a floating-point value
19003    // takes more cycles than loading a 32-bit integer.
19004 LoadInst *LI = cast<LoadInst>(Val: I);
19005    // For loads on which combineLoadToOperationType does nothing, such as
19006    // ordered loads, it should be profitable to hoist them.
19007    // A swifterror load can only have pointer-to-pointer type, so the type
19008    // check below gets rid of that case.
19009 if (!LI->isUnordered())
19010 return true;
19011
19012 if (User->getOpcode() != Instruction::Store)
19013 return true;
19014
19015 if (I->getType()->getTypeID() != Type::FloatTyID)
19016 return true;
19017
19018 return false;
19019 }
19020 default:
19021 return true;
19022 }
19023 return true;
19024}
19025
19026const MCPhysReg *
19027PPCTargetLowering::getScratchRegisters(CallingConv::ID) const {
19028 // LR is a callee-save register, but we must treat it as clobbered by any call
19029 // site. Hence we include LR in the scratch registers, which are in turn added
19030 // as implicit-defs for stackmaps and patchpoints. The same reasoning applies
19031 // to CTR, which is used by any indirect call.
19032 static const MCPhysReg ScratchRegs[] = {
19033 PPC::X12, PPC::LR8, PPC::CTR8, 0
19034 };
19035
19036 return ScratchRegs;
19037}
19038
19039Register PPCTargetLowering::getExceptionPointerRegister(
19040 const Constant *PersonalityFn) const {
19041 return Subtarget.isPPC64() ? PPC::X3 : PPC::R3;
19042}
19043
19044Register PPCTargetLowering::getExceptionSelectorRegister(
19045 const Constant *PersonalityFn) const {
19046 return Subtarget.isPPC64() ? PPC::X4 : PPC::R4;
19047}
19048
19049bool
19050PPCTargetLowering::shouldExpandBuildVectorWithShuffles(
19051                          EVT VT, unsigned DefinedValues) const {
19052 if (VT == MVT::v2i64)
19053 return Subtarget.hasDirectMove(); // Don't need stack ops with direct moves
19054
19055 if (Subtarget.hasVSX())
19056 return true;
19057
19058 return TargetLowering::shouldExpandBuildVectorWithShuffles(VT, DefinedValues);
19059}
19060
19061Sched::Preference PPCTargetLowering::getSchedulingPreference(SDNode *N) const {
19062 if (DisableILPPref || Subtarget.enableMachineScheduler())
19063 return TargetLowering::getSchedulingPreference(N);
19064
19065 return Sched::ILP;
19066}
19067
19068// Create a fast isel object.
19069FastISel *PPCTargetLowering::createFastISel(
19070 FunctionLoweringInfo &FuncInfo, const TargetLibraryInfo *LibInfo,
19071 const LibcallLoweringInfo *LibcallLowering) const {
19072 return PPC::createFastISel(FuncInfo, LibInfo, LibcallLowering);
19073}
19074
19075// 'Inverted' means the FMA opcode after negating one multiplicand.
19076// For example, (fma -a b c) = (fnmsub a b c)
19077static unsigned invertFMAOpcode(unsigned Opc) {
19078 switch (Opc) {
19079 default:
19080 llvm_unreachable("Invalid FMA opcode for PowerPC!");
19081 case ISD::FMA:
19082 return PPCISD::FNMSUB;
19083 case PPCISD::FNMSUB:
19084 return ISD::FMA;
19085 }
19086}
19087
19088SDValue PPCTargetLowering::getNegatedExpression(SDValue Op, SelectionDAG &DAG,
19089 bool LegalOps, bool OptForSize,
19090 NegatibleCost &Cost,
19091 unsigned Depth) const {
19092 if (Depth > SelectionDAG::MaxRecursionDepth)
19093 return SDValue();
19094
19095 unsigned Opc = Op.getOpcode();
19096 EVT VT = Op.getValueType();
19097 SDNodeFlags Flags = Op.getNode()->getFlags();
19098
19099 switch (Opc) {
19100 case PPCISD::FNMSUB:
19101 if (!Op.hasOneUse() || !isTypeLegal(VT))
19102 break;
19103
19104 const TargetOptions &Options = getTargetMachine().Options;
19105 SDValue N0 = Op.getOperand(i: 0);
19106 SDValue N1 = Op.getOperand(i: 1);
19107 SDValue N2 = Op.getOperand(i: 2);
19108 SDLoc Loc(Op);
19109
19110 NegatibleCost N2Cost = NegatibleCost::Expensive;
19111 SDValue NegN2 =
19112 getNegatedExpression(Op: N2, DAG, LegalOps, OptForSize, Cost&: N2Cost, Depth: Depth + 1);
19113
19114 if (!NegN2)
19115 return SDValue();
19116
19117 // (fneg (fnmsub a b c)) => (fnmsub (fneg a) b (fneg c))
19118 // (fneg (fnmsub a b c)) => (fnmsub a (fneg b) (fneg c))
19119    // These transformations may change the sign of zero. For example,
19120 // -(-ab-(-c))=-0 while -(-(ab-c))=+0 when a=b=c=1.
19121 if (Flags.hasNoSignedZeros() || Options.NoSignedZerosFPMath) {
19122 // Try and choose the cheaper one to negate.
19123 NegatibleCost N0Cost = NegatibleCost::Expensive;
19124 SDValue NegN0 = getNegatedExpression(Op: N0, DAG, LegalOps, OptForSize,
19125 Cost&: N0Cost, Depth: Depth + 1);
19126
19127 NegatibleCost N1Cost = NegatibleCost::Expensive;
19128 SDValue NegN1 = getNegatedExpression(Op: N1, DAG, LegalOps, OptForSize,
19129 Cost&: N1Cost, Depth: Depth + 1);
19130
19131 if (NegN0 && N0Cost <= N1Cost) {
19132 Cost = std::min(a: N0Cost, b: N2Cost);
19133 return DAG.getNode(Opcode: Opc, DL: Loc, VT, N1: NegN0, N2: N1, N3: NegN2, Flags);
19134 } else if (NegN1) {
19135 Cost = std::min(a: N1Cost, b: N2Cost);
19136 return DAG.getNode(Opcode: Opc, DL: Loc, VT, N1: N0, N2: NegN1, N3: NegN2, Flags);
19137 }
19138 }
19139
19140 // (fneg (fnmsub a b c)) => (fma a b (fneg c))
19141 if (isOperationLegal(Op: ISD::FMA, VT)) {
19142 Cost = N2Cost;
19143 return DAG.getNode(Opcode: ISD::FMA, DL: Loc, VT, N1: N0, N2: N1, N3: NegN2, Flags);
19144 }
19145
19146 break;
19147 }
19148
19149 return TargetLowering::getNegatedExpression(Op, DAG, LegalOps, OptForSize,
19150 Cost, Depth);
19151}
19152
19153// Override to enable LOAD_STACK_GUARD lowering on Linux.
19154bool PPCTargetLowering::useLoadStackGuardNode(const Module &M) const {
19155 if (M.getStackProtectorGuard() == "tls" || Subtarget.isTargetLinux())
19156 return true;
19157 return TargetLowering::useLoadStackGuardNode(M);
19158}
19159
19160bool PPCTargetLowering::isFPImmLegal(const APFloat &Imm, EVT VT,
19161 bool ForCodeSize) const {
19162 if (!VT.isSimple() || !Subtarget.hasVSX())
19163 return false;
19164
19165 switch(VT.getSimpleVT().SimpleTy) {
19166 default:
19167    // For FP types that are currently not supported by the PPC backend, return
19168 // false. Examples: f16, f80.
19169 return false;
19170 case MVT::f32:
19171 case MVT::f64: {
19172 if (Subtarget.hasPrefixInstrs() && Subtarget.hasP10Vector()) {
19173      // We can materialize all immediates via XXSPLTI32DX and XXSPLTIDP.
19174 return true;
19175 }
19176 bool IsExact;
19177 APSInt IntResult(16, false);
19178 // The rounding mode doesn't really matter because we only care about floats
19179 // that can be converted to integers exactly.
19180 Imm.convertToInteger(Result&: IntResult, RM: APFloat::rmTowardZero, IsExact: &IsExact);
19181 // For exact values in the range [-16, 15] we can materialize the float.
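    // (Such values can typically be synthesized with a splat immediate plus a
    // vector convert instead of a load from the constant pool.)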
19182 if (IsExact && IntResult <= 15 && IntResult >= -16)
19183 return true;
19184 return Imm.isZero();
19185 }
19186 case MVT::ppcf128:
19187 return Imm.isPosZero();
19188 }
19189}
19190
19191// For vector shift operation op, fold
19192// (op x, (and y, ((1 << numbits(x)) - 1))) -> (target op x, y)
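// For v4i32, for example, this matches vslw/vsrw/vsraw, which only use the
// low 5 bits of each element's shift amount.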
19193static SDValue stripModuloOnShift(const TargetLowering &TLI, SDNode *N,
19194 SelectionDAG &DAG) {
19195 SDValue N0 = N->getOperand(Num: 0);
19196 SDValue N1 = N->getOperand(Num: 1);
19197 EVT VT = N0.getValueType();
19198 unsigned OpSizeInBits = VT.getScalarSizeInBits();
19199 unsigned Opcode = N->getOpcode();
19200 unsigned TargetOpcode;
19201
19202 switch (Opcode) {
19203 default:
19204 llvm_unreachable("Unexpected shift operation");
19205 case ISD::SHL:
19206 TargetOpcode = PPCISD::SHL;
19207 break;
19208 case ISD::SRL:
19209 TargetOpcode = PPCISD::SRL;
19210 break;
19211 case ISD::SRA:
19212 TargetOpcode = PPCISD::SRA;
19213 break;
19214 }
19215
19216 if (VT.isVector() && TLI.isOperationLegal(Op: Opcode, VT) &&
19217 N1->getOpcode() == ISD::AND)
19218 if (ConstantSDNode *Mask = isConstOrConstSplat(N: N1->getOperand(Num: 1)))
19219 if (Mask->getZExtValue() == OpSizeInBits - 1)
19220 return DAG.getNode(Opcode: TargetOpcode, DL: SDLoc(N), VT, N1: N0, N2: N1->getOperand(Num: 0));
19221
19222 return SDValue();
19223}
19224
19225SDValue PPCTargetLowering::combineVectorShift(SDNode *N,
19226 DAGCombinerInfo &DCI) const {
19227 EVT VT = N->getValueType(ResNo: 0);
19228 assert(VT.isVector() && "Vector type expected.");
19229
19230 unsigned Opc = N->getOpcode();
19231 assert((Opc == ISD::SHL || Opc == ISD::SRL || Opc == ISD::SRA) &&
19232 "Unexpected opcode.");
19233
19234 if (!isOperationLegal(Op: Opc, VT))
19235 return SDValue();
19236
19237 EVT EltTy = VT.getScalarType();
19238 unsigned EltBits = EltTy.getSizeInBits();
19239 if (EltTy != MVT::i64 && EltTy != MVT::i32)
19240 return SDValue();
19241
19242 SDValue N1 = N->getOperand(Num: 1);
19243 uint64_t SplatBits = 0;
19244 bool AddSplatCase = false;
19245 unsigned OpcN1 = N1.getOpcode();
19246 if (OpcN1 == PPCISD::VADD_SPLAT &&
19247 N1.getConstantOperandVal(i: 1) == VT.getVectorNumElements()) {
19248 AddSplatCase = true;
19249 SplatBits = N1.getConstantOperandVal(i: 0);
19250 }
19251
19252 if (!AddSplatCase) {
19253 if (OpcN1 != ISD::BUILD_VECTOR)
19254 return SDValue();
19255
19256 unsigned SplatBitSize;
19257 bool HasAnyUndefs;
19258 APInt APSplatBits, APSplatUndef;
19259 BuildVectorSDNode *BVN = cast<BuildVectorSDNode>(Val&: N1);
19260 bool BVNIsConstantSplat =
19261 BVN->isConstantSplat(SplatValue&: APSplatBits, SplatUndef&: APSplatUndef, SplatBitSize,
19262 HasAnyUndefs, MinSplatBits: 0, isBigEndian: !Subtarget.isLittleEndian());
19263 if (!BVNIsConstantSplat || SplatBitSize != EltBits)
19264 return SDValue();
19265 SplatBits = APSplatBits.getZExtValue();
19266 }
19267
19268 SDLoc DL(N);
19269 SDValue N0 = N->getOperand(Num: 0);
19270 // PPC vector shifts by word/double look at only the low 5/6 bits of the
19271 // shift vector, which means the max value is 31/63. A shift vector of all
19272  // 1s will be truncated to 31/63, which is useful because vspltiw is limited
19273  // to the -16 to 15 range.
19274 if (SplatBits == (EltBits - 1)) {
19275 unsigned NewOpc;
19276 switch (Opc) {
19277 case ISD::SHL:
19278 NewOpc = PPCISD::SHL;
19279 break;
19280 case ISD::SRL:
19281 NewOpc = PPCISD::SRL;
19282 break;
19283 case ISD::SRA:
19284 NewOpc = PPCISD::SRA;
19285 break;
19286 }
19287 SDValue SplatOnes = getCanonicalConstSplat(Val: 255, SplatSize: 1, VT, DAG&: DCI.DAG, dl: DL);
19288 return DCI.DAG.getNode(Opcode: NewOpc, DL, VT, N1: N0, N2: SplatOnes);
19289 }
19290
19291 if (Opc != ISD::SHL || !isOperationLegal(Op: ISD::ADD, VT))
19292 return SDValue();
19293
19294 // For 64-bit there is no splat immediate so we want to catch shift by 1 here
19295 // before the BUILD_VECTOR is replaced by a load.
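  // (shl x, 1) is simply (add x, x), which needs no splat constant at all.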
19296 if (EltTy != MVT::i64 || SplatBits != 1)
19297 return SDValue();
19298
19299 return DCI.DAG.getNode(Opcode: ISD::ADD, DL: SDLoc(N), VT, N1: N0, N2: N0);
19300}
19301
19302SDValue PPCTargetLowering::combineSHL(SDNode *N, DAGCombinerInfo &DCI) const {
19303 if (auto Value = stripModuloOnShift(TLI: *this, N, DAG&: DCI.DAG))
19304 return Value;
19305
19306 if (N->getValueType(ResNo: 0).isVector())
19307 return combineVectorShift(N, DCI);
19308
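  // Try to fold (shl (sext i32 x), c) into EXTSWSLI (Extend Sign Word and
  // Shift Left Immediate), available on 64-bit ISA 3.0 (Power9) and later.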
19309 SDValue N0 = N->getOperand(Num: 0);
19310 ConstantSDNode *CN1 = dyn_cast<ConstantSDNode>(Val: N->getOperand(Num: 1));
19311 if (!Subtarget.isISA3_0() || !Subtarget.isPPC64() ||
19312 N0.getOpcode() != ISD::SIGN_EXTEND ||
19313 N0.getOperand(i: 0).getValueType() != MVT::i32 || CN1 == nullptr ||
19314 N->getValueType(ResNo: 0) != MVT::i64)
19315 return SDValue();
19316
19317 // We can't save an operation here if the value is already extended, and
19318 // the existing shift is easier to combine.
19319 SDValue ExtsSrc = N0.getOperand(i: 0);
19320 if (ExtsSrc.getOpcode() == ISD::TRUNCATE &&
19321 ExtsSrc.getOperand(i: 0).getOpcode() == ISD::AssertSext)
19322 return SDValue();
19323
19324 SDLoc DL(N0);
19325 SDValue ShiftBy = SDValue(CN1, 0);
19326 // We want the shift amount to be i32 on the extswli, but the shift could
19327 // have an i64.
19328 if (ShiftBy.getValueType() == MVT::i64)
19329 ShiftBy = DCI.DAG.getConstant(Val: CN1->getZExtValue(), DL, VT: MVT::i32);
19330
19331 return DCI.DAG.getNode(Opcode: PPCISD::EXTSWSLI, DL, VT: MVT::i64, N1: N0->getOperand(Num: 0),
19332 N2: ShiftBy);
19333}
19334
19335SDValue PPCTargetLowering::combineSRA(SDNode *N, DAGCombinerInfo &DCI) const {
19336 if (auto Value = stripModuloOnShift(TLI: *this, N, DAG&: DCI.DAG))
19337 return Value;
19338
19339 if (N->getValueType(ResNo: 0).isVector())
19340 return combineVectorShift(N, DCI);
19341
19342 return SDValue();
19343}
19344
19345SDValue PPCTargetLowering::combineSRL(SDNode *N, DAGCombinerInfo &DCI) const {
19346 if (auto Value = stripModuloOnShift(TLI: *this, N, DAG&: DCI.DAG))
19347 return Value;
19348
19349 if (N->getValueType(ResNo: 0).isVector())
19350 return combineVectorShift(N, DCI);
19351
19352 return SDValue();
19353}
19354
19355// Transform (add X, (zext(setne Z, C))) -> (addze X, (addic (addi Z, -C), -1))
19356// Transform (add X, (zext(sete Z, C))) -> (addze X, (subfic (addi Z, -C), 0))
19357// When C is zero, the equation (addi Z, -C) can be simplified to Z
19358// Requirement: -C in [-32768, 32767], X and Z are MVT::i64 types
19359static SDValue combineADDToADDZE(SDNode *N, SelectionDAG &DAG,
19360 const PPCSubtarget &Subtarget) {
19361 if (!Subtarget.isPPC64())
19362 return SDValue();
19363
19364 SDValue LHS = N->getOperand(Num: 0);
19365 SDValue RHS = N->getOperand(Num: 1);
19366
19367 auto isZextOfCompareWithConstant = [](SDValue Op) {
19368 if (Op.getOpcode() != ISD::ZERO_EXTEND || !Op.hasOneUse() ||
19369 Op.getValueType() != MVT::i64)
19370 return false;
19371
19372 SDValue Cmp = Op.getOperand(i: 0);
19373 if (Cmp.getOpcode() != ISD::SETCC || !Cmp.hasOneUse() ||
19374 Cmp.getOperand(i: 0).getValueType() != MVT::i64)
19375 return false;
19376
19377 if (auto *Constant = dyn_cast<ConstantSDNode>(Val: Cmp.getOperand(i: 1))) {
19378 int64_t NegConstant = 0 - Constant->getSExtValue();
19379 // Due to the limitations of the addi instruction,
19380      // -C is required to be in [-32768, 32767].
19381 return isInt<16>(x: NegConstant);
19382 }
19383
19384 return false;
19385 };
19386
19387 bool LHSHasPattern = isZextOfCompareWithConstant(LHS);
19388 bool RHSHasPattern = isZextOfCompareWithConstant(RHS);
19389
19390 // If there is a pattern, canonicalize a zext operand to the RHS.
19391 if (LHSHasPattern && !RHSHasPattern)
19392 std::swap(a&: LHS, b&: RHS);
19393 else if (!LHSHasPattern && !RHSHasPattern)
19394 return SDValue();
19395
19396 SDLoc DL(N);
19397 EVT CarryType = Subtarget.useCRBits() ? MVT::i1 : MVT::i32;
19398 SDVTList VTs = DAG.getVTList(VT1: MVT::i64, VT2: CarryType);
19399 SDValue Cmp = RHS.getOperand(i: 0);
19400 SDValue Z = Cmp.getOperand(i: 0);
19401 auto *Constant = cast<ConstantSDNode>(Val: Cmp.getOperand(i: 1));
19402 int64_t NegConstant = 0 - Constant->getSExtValue();
19403
19404 switch(cast<CondCodeSDNode>(Val: Cmp.getOperand(i: 2))->get()) {
19405 default: break;
19406 case ISD::SETNE: {
19407 // when C == 0
19408 // --> addze X, (addic Z, -1).carry
19409 // /
19410 // add X, (zext(setne Z, C))--
19411 // \ when -32768 <= -C <= 32767 && C != 0
19412 // --> addze X, (addic (addi Z, -C), -1).carry
19413 SDValue Add = DAG.getNode(Opcode: ISD::ADD, DL, VT: MVT::i64, N1: Z,
19414 N2: DAG.getConstant(Val: NegConstant, DL, VT: MVT::i64));
19415 SDValue AddOrZ = NegConstant != 0 ? Add : Z;
19416 SDValue Addc =
19417 DAG.getNode(Opcode: ISD::UADDO_CARRY, DL, VTList: DAG.getVTList(VT1: MVT::i64, VT2: CarryType),
19418 N1: AddOrZ, N2: DAG.getAllOnesConstant(DL, VT: MVT::i64),
19419 N3: DAG.getConstant(Val: 0, DL, VT: CarryType));
19420 return DAG.getNode(Opcode: ISD::UADDO_CARRY, DL, VTList: VTs, N1: LHS,
19421 N2: DAG.getConstant(Val: 0, DL, VT: MVT::i64),
19422 N3: SDValue(Addc.getNode(), 1));
19423 }
19424 case ISD::SETEQ: {
19425 // when C == 0
19426 // --> addze X, (subfic Z, 0).carry
19427 // /
19428 // add X, (zext(sete Z, C))--
19429 // \ when -32768 <= -C <= 32767 && C != 0
19430 // --> addze X, (subfic (addi Z, -C), 0).carry
19431 SDValue Add = DAG.getNode(Opcode: ISD::ADD, DL, VT: MVT::i64, N1: Z,
19432 N2: DAG.getConstant(Val: NegConstant, DL, VT: MVT::i64));
19433 SDValue AddOrZ = NegConstant != 0 ? Add : Z;
19434 SDValue Subc =
19435 DAG.getNode(Opcode: ISD::USUBO_CARRY, DL, VTList: DAG.getVTList(VT1: MVT::i64, VT2: CarryType),
19436 N1: DAG.getConstant(Val: 0, DL, VT: MVT::i64), N2: AddOrZ,
19437 N3: DAG.getConstant(Val: 0, DL, VT: CarryType));
19438 SDValue Invert = DAG.getNode(Opcode: ISD::XOR, DL, VT: CarryType, N1: Subc.getValue(R: 1),
19439 N2: DAG.getConstant(Val: 1UL, DL, VT: CarryType));
19440 return DAG.getNode(Opcode: ISD::UADDO_CARRY, DL, VTList: VTs, N1: LHS,
19441 N2: DAG.getConstant(Val: 0, DL, VT: MVT::i64), N3: Invert);
19442 }
19443 }
19444
19445 return SDValue();
19446}
19447
19448// Transform
19449// (add C1, (MAT_PCREL_ADDR GlobalAddr+C2)) to
19450// (MAT_PCREL_ADDR GlobalAddr+(C1+C2))
19451// In this case both C1 and C2 must be known constants.
19452// C1+C2 must fit into a 34 bit signed integer.
19453static SDValue combineADDToMAT_PCREL_ADDR(SDNode *N, SelectionDAG &DAG,
19454 const PPCSubtarget &Subtarget) {
19455 if (!Subtarget.isUsingPCRelativeCalls())
19456 return SDValue();
19457
19458 // Check both Operand 0 and Operand 1 of the ADD node for the PCRel node.
19459  // If we find that node, try to cast the Global Address and the Constant.
19460 SDValue LHS = N->getOperand(Num: 0);
19461 SDValue RHS = N->getOperand(Num: 1);
19462
19463 if (LHS.getOpcode() != PPCISD::MAT_PCREL_ADDR)
19464 std::swap(a&: LHS, b&: RHS);
19465
19466 if (LHS.getOpcode() != PPCISD::MAT_PCREL_ADDR)
19467 return SDValue();
19468
19469 // Operand zero of PPCISD::MAT_PCREL_ADDR is the GA node.
19470 GlobalAddressSDNode *GSDN = dyn_cast<GlobalAddressSDNode>(Val: LHS.getOperand(i: 0));
19471 ConstantSDNode* ConstNode = dyn_cast<ConstantSDNode>(Val&: RHS);
19472
19473 // Check that both casts succeeded.
19474 if (!GSDN || !ConstNode)
19475 return SDValue();
19476
19477 int64_t NewOffset = GSDN->getOffset() + ConstNode->getSExtValue();
19478 SDLoc DL(GSDN);
19479
19480 // The signed int offset needs to fit in 34 bits.
19481 if (!isInt<34>(x: NewOffset))
19482 return SDValue();
19483
19484 // The new global address is a copy of the old global address except
19485 // that it has the updated Offset.
19486 SDValue GA =
19487 DAG.getTargetGlobalAddress(GV: GSDN->getGlobal(), DL, VT: GSDN->getValueType(ResNo: 0),
19488 offset: NewOffset, TargetFlags: GSDN->getTargetFlags());
19489 SDValue MatPCRel =
19490 DAG.getNode(Opcode: PPCISD::MAT_PCREL_ADDR, DL, VT: GSDN->getValueType(ResNo: 0), Operand: GA);
19491 return MatPCRel;
19492}
19493
19494// Transform (add X, (build_vector (T 1), (T 1), ...)) -> (sub X, (XXLEQVOnes))
19495// XXLEQVOnes creates an all-1s vector (0xFFFFFFFF...) efficiently via xxleqv
19496// Mathematical identity: X + 1 = X - (-1)
19497// Applies to v4i32, v2i64, v8i16, v16i8 where all elements are constant 1
19498// Requirement: VSX feature for efficient xxleqv generation
19499static SDValue combineADDToSUB(SDNode *N, SelectionDAG &DAG,
19500 const PPCSubtarget &Subtarget) {
19501
19502 EVT VT = N->getValueType(ResNo: 0);
19503 if (!Subtarget.hasVSX())
19504 return SDValue();
19505
19506 // Handle v2i64, v4i32, v8i16 and v16i8 types
19507 if (!(VT == MVT::v8i16 || VT == MVT::v16i8 || VT == MVT::v4i32 ||
19508 VT == MVT::v2i64))
19509 return SDValue();
19510
19511 SDValue LHS = N->getOperand(Num: 0);
19512 SDValue RHS = N->getOperand(Num: 1);
19513
19514 // Check if RHS is BUILD_VECTOR
19515 if (RHS.getOpcode() != ISD::BUILD_VECTOR)
19516 return SDValue();
19517
19518 // Check if all the elements are 1
19519 unsigned NumOfEles = RHS.getNumOperands();
19520 for (unsigned i = 0; i < NumOfEles; ++i) {
19521 auto *CN = dyn_cast<ConstantSDNode>(Val: RHS.getOperand(i));
19522 if (!CN || CN->getSExtValue() != 1)
19523 return SDValue();
19524 }
19525 SDLoc DL(N);
19526
19527 SDValue MinusOne = DAG.getConstant(Val: APInt::getAllOnes(numBits: 32), DL, VT: MVT::i32);
19528 SmallVector<SDValue, 4> Ops(4, MinusOne);
19529 SDValue AllOnesVec = DAG.getBuildVector(VT: MVT::v4i32, DL, Ops);
19530
19531 // Bitcast to the target vector type
19532 SDValue Bitcast = DAG.getNode(Opcode: ISD::BITCAST, DL, VT, Operand: AllOnesVec);
19533
19534 return DAG.getNode(Opcode: ISD::SUB, DL, VT, N1: LHS, N2: Bitcast);
19535}
19536
19537SDValue PPCTargetLowering::combineADD(SDNode *N, DAGCombinerInfo &DCI) const {
19538 if (auto Value = combineADDToADDZE(N, DAG&: DCI.DAG, Subtarget))
19539 return Value;
19540
19541 if (auto Value = combineADDToMAT_PCREL_ADDR(N, DAG&: DCI.DAG, Subtarget))
19542 return Value;
19543
19544 if (auto Value = combineADDToSUB(N, DAG&: DCI.DAG, Subtarget))
19545 return Value;
19546 return SDValue();
19547}
19548
19549// Detect TRUNCATE operations on bitcasts of float128 values.
19550// What we are looking for here is the situation where we extract a subset
19551// of bits from a 128-bit float.
19552// This can be of two forms:
19553// 1) BITCAST of f128 feeding TRUNCATE
19554// 2) BITCAST of f128 feeding SRL (a shift) feeding TRUNCATE
19555// The reason this is required is because we do not have a legal i128 type
19556// and so we want to prevent having to store the f128 and then reload part
19557// of it.
19558SDValue PPCTargetLowering::combineTRUNCATE(SDNode *N,
19559 DAGCombinerInfo &DCI) const {
19560 // If we are using CRBits then try that first.
19561 if (Subtarget.useCRBits()) {
19562 // Check if CRBits did anything and return that if it did.
19563 if (SDValue CRTruncValue = DAGCombineTruncBoolExt(N, DCI))
19564 return CRTruncValue;
19565 }
19566
19567 SDLoc dl(N);
19568 SDValue Op0 = N->getOperand(Num: 0);
19569
19570 // Looking for a truncate of i128 to i64.
19571 if (Op0.getValueType() != MVT::i128 || N->getValueType(ResNo: 0) != MVT::i64)
19572 return SDValue();
19573
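  // Truncating i128 to i64 keeps the low doubleword, which is element 1 of
  // the corresponding v2i64 on big-endian targets and element 0 on
  // little-endian targets.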
19574 int EltToExtract = DCI.DAG.getDataLayout().isBigEndian() ? 1 : 0;
19575
19576 // SRL feeding TRUNCATE.
19577 if (Op0.getOpcode() == ISD::SRL) {
19578 ConstantSDNode *ConstNode = dyn_cast<ConstantSDNode>(Val: Op0.getOperand(i: 1));
19579 // The right shift has to be by 64 bits.
19580 if (!ConstNode || ConstNode->getZExtValue() != 64)
19581 return SDValue();
19582
19583 // Switch the element number to extract.
19584 EltToExtract = EltToExtract ? 0 : 1;
19585 // Update Op0 past the SRL.
19586 Op0 = Op0.getOperand(i: 0);
19587 }
19588
19589 // BITCAST feeding a TRUNCATE possibly via SRL.
19590 if (Op0.getOpcode() == ISD::BITCAST &&
19591 Op0.getValueType() == MVT::i128 &&
19592 Op0.getOperand(i: 0).getValueType() == MVT::f128) {
19593 SDValue Bitcast = DCI.DAG.getBitcast(VT: MVT::v2i64, V: Op0.getOperand(i: 0));
19594 return DCI.DAG.getNode(
19595 Opcode: ISD::EXTRACT_VECTOR_ELT, DL: dl, VT: MVT::i64, N1: Bitcast,
19596 N2: DCI.DAG.getTargetConstant(Val: EltToExtract, DL: dl, VT: MVT::i32));
19597 }
19598 return SDValue();
19599}
19600
19601SDValue PPCTargetLowering::combineMUL(SDNode *N, DAGCombinerInfo &DCI) const {
19602 SelectionDAG &DAG = DCI.DAG;
19603
19604 ConstantSDNode *ConstOpOrElement = isConstOrConstSplat(N: N->getOperand(Num: 1));
19605 if (!ConstOpOrElement)
19606 return SDValue();
19607
19608  // An imul is usually smaller than the alternative sequence for a legal type.
19609 if (DAG.getMachineFunction().getFunction().hasMinSize() &&
19610 isOperationLegal(Op: ISD::MUL, VT: N->getValueType(ResNo: 0)))
19611 return SDValue();
19612
19613 auto IsProfitable = [this](bool IsNeg, bool IsAddOne, EVT VT) -> bool {
19614 switch (this->Subtarget.getCPUDirective()) {
19615 default:
19616 // TODO: enhance the condition for subtarget before pwr8
19617 return false;
19618 case PPC::DIR_PWR8:
19619 // type mul add shl
19620 // scalar 4 1 1
19621 // vector 7 2 2
19622 return true;
19623 case PPC::DIR_PWR9:
19624 case PPC::DIR_PWR10:
19625 case PPC::DIR_PWR11:
19626 case PPC::DIR_PWR_FUTURE:
19627 // type mul add shl
19628 // scalar 5 2 2
19629 // vector 7 2 2
19630
19631      // The cycle ratios of the related operations are shown in the tables
19632      // above. A mul costs 5 (scalar) / 7 (vector) cycles, while add/sub/shl
19633      // each cost 2 for both scalar and vector types. A two-instruction
19634      // pattern (add/sub + shl) costs 4, which is always profitable; but the
19635      // three-instruction pattern (mul x, -(2^N + 1)) => -(add (shl x, N), x)
19636      // costs 6 (sub + add + shl), so only use it for vector types.
19637 return IsAddOne && IsNeg ? VT.isVector() : true;
19638 }
19639 };
19640
19641 EVT VT = N->getValueType(ResNo: 0);
19642 SDLoc DL(N);
19643
19644 const APInt &MulAmt = ConstOpOrElement->getAPIntValue();
19645 bool IsNeg = MulAmt.isNegative();
19646 APInt MulAmtAbs = MulAmt.abs();
19647
19648 if ((MulAmtAbs - 1).isPowerOf2()) {
19649 // (mul x, 2^N + 1) => (add (shl x, N), x)
19650 // (mul x, -(2^N + 1)) => -(add (shl x, N), x)
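    // For example, (mul x, 9) becomes (add (shl x, 3), x).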
19651
19652 if (!IsProfitable(IsNeg, true, VT))
19653 return SDValue();
19654
19655 SDValue Op0 = N->getOperand(Num: 0);
19656 SDValue Op1 =
19657 DAG.getNode(Opcode: ISD::SHL, DL, VT, N1: N->getOperand(Num: 0),
19658 N2: DAG.getConstant(Val: (MulAmtAbs - 1).logBase2(), DL, VT));
19659 SDValue Res = DAG.getNode(Opcode: ISD::ADD, DL, VT, N1: Op0, N2: Op1);
19660
19661 if (!IsNeg)
19662 return Res;
19663
19664 return DAG.getNode(Opcode: ISD::SUB, DL, VT, N1: DAG.getConstant(Val: 0, DL, VT), N2: Res);
19665 } else if ((MulAmtAbs + 1).isPowerOf2()) {
19666 // (mul x, 2^N - 1) => (sub (shl x, N), x)
19667 // (mul x, -(2^N - 1)) => (sub x, (shl x, N))
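    // For example, (mul x, 7) becomes (sub (shl x, 3), x).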
19668
19669 if (!IsProfitable(IsNeg, false, VT))
19670 return SDValue();
19671
19672 SDValue Op0 = N->getOperand(Num: 0);
19673 SDValue Op1 =
19674 DAG.getNode(Opcode: ISD::SHL, DL, VT, N1: N->getOperand(Num: 0),
19675 N2: DAG.getConstant(Val: (MulAmtAbs + 1).logBase2(), DL, VT));
19676
19677 if (!IsNeg)
19678 return DAG.getNode(Opcode: ISD::SUB, DL, VT, N1: Op1, N2: Op0);
19679 else
19680 return DAG.getNode(Opcode: ISD::SUB, DL, VT, N1: Op0, N2: Op1);
19681
19682 } else {
19683 return SDValue();
19684 }
19685}
19686
19687// Combine fma-like op (like fnmsub) with fnegs to appropriate op. Do this
19688// in combiner since we need to check SD flags and other subtarget features.
19689SDValue PPCTargetLowering::combineFMALike(SDNode *N,
19690 DAGCombinerInfo &DCI) const {
19691 SDValue N0 = N->getOperand(Num: 0);
19692 SDValue N1 = N->getOperand(Num: 1);
19693 SDValue N2 = N->getOperand(Num: 2);
19694 SDNodeFlags Flags = N->getFlags();
19695 EVT VT = N->getValueType(ResNo: 0);
19696 SelectionDAG &DAG = DCI.DAG;
19697 const TargetOptions &Options = getTargetMachine().Options;
19698 unsigned Opc = N->getOpcode();
19699 bool CodeSize = DAG.getMachineFunction().getFunction().hasOptSize();
19700 bool LegalOps = !DCI.isBeforeLegalizeOps();
19701 SDLoc Loc(N);
19702
19703 if (!isOperationLegal(Op: ISD::FMA, VT))
19704 return SDValue();
19705
19706  // Allowing the transformation to FNMSUB may change the sign of zero when
19707  // a*b-c == 0, since (fnmsub a b c) == -0 while c-a*b == +0.
19708 if (!Flags.hasNoSignedZeros() && !Options.NoSignedZerosFPMath)
19709 return SDValue();
19710
19711 // (fma (fneg a) b c) => (fnmsub a b c)
19712 // (fnmsub (fneg a) b c) => (fma a b c)
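  // For example, if operand 0 can be negated more cheaply than it can be
  // computed directly (e.g. it is already an fneg), we flip the opcode instead
  // of emitting a separate negation.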
19713 if (SDValue NegN0 = getCheaperNegatedExpression(Op: N0, DAG, LegalOps, OptForSize: CodeSize))
19714 return DAG.getNode(Opcode: invertFMAOpcode(Opc), DL: Loc, VT, N1: NegN0, N2: N1, N3: N2, Flags);
19715
19716 // (fma a (fneg b) c) => (fnmsub a b c)
19717 // (fnmsub a (fneg b) c) => (fma a b c)
19718 if (SDValue NegN1 = getCheaperNegatedExpression(Op: N1, DAG, LegalOps, OptForSize: CodeSize))
19719 return DAG.getNode(Opcode: invertFMAOpcode(Opc), DL: Loc, VT, N1: N0, N2: NegN1, N3: N2, Flags);
19720
19721 return SDValue();
19722}
19723
19724bool PPCTargetLowering::mayBeEmittedAsTailCall(const CallInst *CI) const {
19725  // Only duplicate to increase tail calls for the 64-bit SysV ABIs.
19726 if (!Subtarget.is64BitELFABI())
19727 return false;
19728
19729 // If not a tail call then no need to proceed.
19730 if (!CI->isTailCall())
19731 return false;
19732
19733  // If sibling calls have been disabled and tail calls aren't guaranteed,
19734  // there is no reason to duplicate.
19735 auto &TM = getTargetMachine();
19736 if (!TM.Options.GuaranteedTailCallOpt && DisableSCO)
19737 return false;
19738
19739 // Can't tail call a function called indirectly, or if it has variadic args.
19740 const Function *Callee = CI->getCalledFunction();
19741 if (!Callee || Callee->isVarArg())
19742 return false;
19743
19744 // Make sure the callee and caller calling conventions are eligible for tco.
19745 const Function *Caller = CI->getParent()->getParent();
19746 if (!areCallingConvEligibleForTCO_64SVR4(CallerCC: Caller->getCallingConv(),
19747 CalleeCC: CI->getCallingConv()))
19748 return false;
19749
19750  // If the function is local then we have a good chance of tail-calling it.
19751 return getTargetMachine().shouldAssumeDSOLocal(GV: Callee);
19752}
19753
19754bool PPCTargetLowering::
19755isMaskAndCmp0FoldingBeneficial(const Instruction &AndI) const {
19756 const Value *Mask = AndI.getOperand(i: 1);
19757 // If the mask is suitable for andi. or andis. we should sink the and.
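  // For example, a mask of 0x0000ffff can be matched by andi. and 0xffff0000 by
  // andis., while a mask such as 0x00ffff00 fits neither form.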
19758 if (const ConstantInt *CI = dyn_cast<ConstantInt>(Val: Mask)) {
19759 // Can't handle constants wider than 64-bits.
19760 if (CI->getBitWidth() > 64)
19761 return false;
19762 int64_t ConstVal = CI->getZExtValue();
19763 return isUInt<16>(x: ConstVal) ||
19764 (isUInt<16>(x: ConstVal >> 16) && !(ConstVal & 0xFFFF));
19765 }
19766
19767 // For non-constant masks, we can always use the record-form and.
19768 return true;
19769}
19770
19771/// getAddrModeForFlags - Based on the set of address flags, select the
19772/// optimal instruction format to match.
19773PPC::AddrMode PPCTargetLowering::getAddrModeForFlags(unsigned Flags) const {
19774 // This is not a node we should be handling here.
19775 if (Flags == PPC::MOF_None)
19776 return PPC::AM_None;
19777 // Unaligned D-Forms are tried first, followed by the aligned D-Forms.
19778 for (auto FlagSet : AddrModesMap.at(k: PPC::AM_DForm))
19779 if ((Flags & FlagSet) == FlagSet)
19780 return PPC::AM_DForm;
19781 for (auto FlagSet : AddrModesMap.at(k: PPC::AM_DSForm))
19782 if ((Flags & FlagSet) == FlagSet)
19783 return PPC::AM_DSForm;
19784 for (auto FlagSet : AddrModesMap.at(k: PPC::AM_DQForm))
19785 if ((Flags & FlagSet) == FlagSet)
19786 return PPC::AM_DQForm;
19787 for (auto FlagSet : AddrModesMap.at(k: PPC::AM_PrefixDForm))
19788 if ((Flags & FlagSet) == FlagSet)
19789 return PPC::AM_PrefixDForm;
19790 // If no other forms are selected, return an X-Form as it is the most
19791 // general addressing mode.
19792 return PPC::AM_XForm;
19793}
19794
19795/// Set alignment flags based on whether or not the Frame Index is aligned.
19796/// Used when computing the address flags for load and store instruction
19797/// selection.
19798static void setAlignFlagsForFI(SDValue N, unsigned &FlagSet,
19799 SelectionDAG &DAG) {
19800 bool IsAdd = ((N.getOpcode() == ISD::ADD) || (N.getOpcode() == ISD::OR));
19801 FrameIndexSDNode *FI = dyn_cast<FrameIndexSDNode>(Val: IsAdd ? N.getOperand(i: 0) : N);
19802 if (!FI)
19803 return;
19804 const MachineFrameInfo &MFI = DAG.getMachineFunction().getFrameInfo();
19805 unsigned FrameIndexAlign = MFI.getObjectAlign(ObjectIdx: FI->getIndex()).value();
19806 // If this is (add $FI, $S16Imm), the alignment flags are already set
19807 // based on the immediate. We just need to clear the alignment flags
19808 // if the FI alignment is weaker.
19809 if ((FrameIndexAlign % 4) != 0)
19810 FlagSet &= ~PPC::MOF_RPlusSImm16Mult4;
19811 if ((FrameIndexAlign % 16) != 0)
19812 FlagSet &= ~PPC::MOF_RPlusSImm16Mult16;
19813 // If the address is a plain FrameIndex, set alignment flags based on
19814 // FI alignment.
19815 if (!IsAdd) {
19816 if ((FrameIndexAlign % 4) == 0)
19817 FlagSet |= PPC::MOF_RPlusSImm16Mult4;
19818 if ((FrameIndexAlign % 16) == 0)
19819 FlagSet |= PPC::MOF_RPlusSImm16Mult16;
19820 }
19821}
19822
19823/// Given a node, compute flags that are used for address computation when
19824/// selecting load and store instructions. The flags computed are stored in
19825/// FlagSet. This function takes into account whether the node is a constant,
19826/// an ADD, an OR, or none of these, and computes the address flags accordingly.
19827static void computeFlagsForAddressComputation(SDValue N, unsigned &FlagSet,
19828 SelectionDAG &DAG) {
19829 // Set the alignment flags for the node depending on if the node is
19830 // 4-byte or 16-byte aligned.
19831 auto SetAlignFlagsForImm = [&](uint64_t Imm) {
19832 if ((Imm & 0x3) == 0)
19833 FlagSet |= PPC::MOF_RPlusSImm16Mult4;
19834 if ((Imm & 0xf) == 0)
19835 FlagSet |= PPC::MOF_RPlusSImm16Mult16;
19836 };
19837
19838 if (ConstantSDNode *CN = dyn_cast<ConstantSDNode>(Val&: N)) {
19839 // All 32-bit constants can be computed as LIS + Disp.
19840 const APInt &ConstImm = CN->getAPIntValue();
19841 if (ConstImm.isSignedIntN(N: 32)) { // Flag to handle 32-bit constants.
19842 FlagSet |= PPC::MOF_AddrIsSImm32;
19843 SetAlignFlagsForImm(ConstImm.getZExtValue());
19844 setAlignFlagsForFI(N, FlagSet, DAG);
19845 }
19846 if (ConstImm.isSignedIntN(N: 34)) // Flag to handle 34-bit constants.
19847 FlagSet |= PPC::MOF_RPlusSImm34;
19848 else // Let constant materialization handle large constants.
19849 FlagSet |= PPC::MOF_NotAddNorCst;
19850 } else if (N.getOpcode() == ISD::ADD || provablyDisjointOr(DAG, N)) {
19851 // This address can be represented as an addition of:
19852 // - Register + Imm16 (possibly a multiple of 4/16)
19853 // - Register + Imm34
19854 // - Register + PPCISD::Lo
19855 // - Register + Register
19856 // In any case, we won't have to match this as Base + Zero.
19857 SDValue RHS = N.getOperand(i: 1);
19858 if (ConstantSDNode *CN = dyn_cast<ConstantSDNode>(Val&: RHS)) {
19859 const APInt &ConstImm = CN->getAPIntValue();
19860 if (ConstImm.isSignedIntN(N: 16)) {
19861 FlagSet |= PPC::MOF_RPlusSImm16; // Signed 16-bit immediates.
19862 SetAlignFlagsForImm(ConstImm.getZExtValue());
19863 setAlignFlagsForFI(N, FlagSet, DAG);
19864 }
19865 if (ConstImm.isSignedIntN(N: 34))
19866 FlagSet |= PPC::MOF_RPlusSImm34; // Signed 34-bit immediates.
19867 else
19868 FlagSet |= PPC::MOF_RPlusR; // Register.
19869 } else if (RHS.getOpcode() == PPCISD::Lo && !RHS.getConstantOperandVal(i: 1))
19870 FlagSet |= PPC::MOF_RPlusLo; // PPCISD::Lo.
19871 else
19872 FlagSet |= PPC::MOF_RPlusR;
19873 } else { // The address computation is not a constant or an addition.
19874 setAlignFlagsForFI(N, FlagSet, DAG);
19875 FlagSet |= PPC::MOF_NotAddNorCst;
19876 }
19877}
19878
19879static bool isPCRelNode(SDValue N) {
19880 return (N.getOpcode() == PPCISD::MAT_PCREL_ADDR ||
19881 isValidPCRelNode<ConstantPoolSDNode>(N) ||
19882 isValidPCRelNode<GlobalAddressSDNode>(N) ||
19883 isValidPCRelNode<JumpTableSDNode>(N) ||
19884 isValidPCRelNode<BlockAddressSDNode>(N));
19885}
19886
19887/// computeMOFlags - Given a node N and its Parent (a MemSDNode), compute
19888/// the address flags of the load/store instruction that is to be matched.
19889unsigned PPCTargetLowering::computeMOFlags(const SDNode *Parent, SDValue N,
19890 SelectionDAG &DAG) const {
19891 unsigned FlagSet = PPC::MOF_None;
19892
19893 // Compute subtarget flags.
19894 if (!Subtarget.hasP9Vector())
19895 FlagSet |= PPC::MOF_SubtargetBeforeP9;
19896 else
19897 FlagSet |= PPC::MOF_SubtargetP9;
19898
19899 if (Subtarget.hasPrefixInstrs())
19900 FlagSet |= PPC::MOF_SubtargetP10;
19901
19902 if (Subtarget.hasSPE())
19903 FlagSet |= PPC::MOF_SubtargetSPE;
19904
19905 // Check if we have a PCRel node and return early.
19906 if ((FlagSet & PPC::MOF_SubtargetP10) && isPCRelNode(N))
19907 return FlagSet;
19908
19909  // If the node is one of the paired load/store intrinsics, compute the flags
19910  // for address computation and return early.
19911 unsigned ParentOp = Parent->getOpcode();
19912 if (Subtarget.isISA3_1() && ((ParentOp == ISD::INTRINSIC_W_CHAIN) ||
19913 (ParentOp == ISD::INTRINSIC_VOID))) {
19914 unsigned ID = Parent->getConstantOperandVal(Num: 1);
19915 if ((ID == Intrinsic::ppc_vsx_lxvp) || (ID == Intrinsic::ppc_vsx_stxvp)) {
19916 SDValue IntrinOp = (ID == Intrinsic::ppc_vsx_lxvp)
19917 ? Parent->getOperand(Num: 2)
19918 : Parent->getOperand(Num: 3);
19919 computeFlagsForAddressComputation(N: IntrinOp, FlagSet, DAG);
19920 FlagSet |= PPC::MOF_Vector;
19921 return FlagSet;
19922 }
19923 }
19924
19925  // Mark this as something we don't want to handle here if it is an atomic
19926  // or pre-increment instruction.
19927 if (const LSBaseSDNode *LSB = dyn_cast<LSBaseSDNode>(Val: Parent))
19928 if (LSB->isIndexed())
19929 return PPC::MOF_None;
19930
19931  // Compute in-memory type flags. This is based on whether the memory type is
19932  // a scalar integer, an integer vector, or a floating-point type.
19933 const MemSDNode *MN = dyn_cast<MemSDNode>(Val: Parent);
19934 assert(MN && "Parent should be a MemSDNode!");
19935 EVT MemVT = MN->getMemoryVT();
19936 unsigned Size = MemVT.getSizeInBits();
19937 if (MemVT.isScalarInteger()) {
19938 assert(Size <= 128 &&
19939 "Not expecting scalar integers larger than 16 bytes!");
19940 if (Size < 32)
19941 FlagSet |= PPC::MOF_SubWordInt;
19942 else if (Size == 32)
19943 FlagSet |= PPC::MOF_WordInt;
19944 else
19945 FlagSet |= PPC::MOF_DoubleWordInt;
19946 } else if (MemVT.isVector() && !MemVT.isFloatingPoint()) { // Integer vectors.
19947 if (Size == 128)
19948 FlagSet |= PPC::MOF_Vector;
19949 else if (Size == 256) {
19950 assert(Subtarget.pairedVectorMemops() &&
19951 "256-bit vectors are only available when paired vector memops is "
19952 "enabled!");
19953 FlagSet |= PPC::MOF_Vector;
19954 } else
19955 llvm_unreachable("Not expecting illegal vectors!");
19956 } else { // Floating point type: can be scalar, f128 or vector types.
19957 if (Size == 32 || Size == 64)
19958 FlagSet |= PPC::MOF_ScalarFloat;
19959 else if (MemVT == MVT::f128 || MemVT.isVector())
19960 FlagSet |= PPC::MOF_Vector;
19961 else
19962 llvm_unreachable("Not expecting illegal scalar floats!");
19963 }
19964
19965 // Compute flags for address computation.
19966 computeFlagsForAddressComputation(N, FlagSet, DAG);
19967
19968 // Compute type extension flags.
19969 if (const LoadSDNode *LN = dyn_cast<LoadSDNode>(Val: Parent)) {
19970 switch (LN->getExtensionType()) {
19971 case ISD::SEXTLOAD:
19972 FlagSet |= PPC::MOF_SExt;
19973 break;
19974 case ISD::EXTLOAD:
19975 case ISD::ZEXTLOAD:
19976 FlagSet |= PPC::MOF_ZExt;
19977 break;
19978 case ISD::NON_EXTLOAD:
19979 FlagSet |= PPC::MOF_NoExt;
19980 break;
19981 }
19982 } else
19983 FlagSet |= PPC::MOF_NoExt;
19984
19985 // For integers, no extension is the same as zero extension.
19986 // We set the extension mode to zero extension so we don't have
19987 // to add separate entries in AddrModesMap for loads and stores.
19988 if (MemVT.isScalarInteger() && (FlagSet & PPC::MOF_NoExt)) {
19989 FlagSet |= PPC::MOF_ZExt;
19990 FlagSet &= ~PPC::MOF_NoExt;
19991 }
19992
19993 // If we don't have prefixed instructions, 34-bit constants should be
19994 // treated as PPC::MOF_NotAddNorCst so they can match D-Forms.
19995 bool IsNonP1034BitConst =
19996 ((PPC::MOF_RPlusSImm34 | PPC::MOF_AddrIsSImm32 | PPC::MOF_SubtargetP10) &
19997 FlagSet) == PPC::MOF_RPlusSImm34;
19998 if (N.getOpcode() != ISD::ADD && N.getOpcode() != ISD::OR &&
19999 IsNonP1034BitConst)
20000 FlagSet |= PPC::MOF_NotAddNorCst;
20001
20002 return FlagSet;
20003}
20004
20005/// SelectForceXFormMode - Given the specified address, force it to be
20006/// represented as an indexed [r+r] operation (an XForm instruction).
20007PPC::AddrMode PPCTargetLowering::SelectForceXFormMode(SDValue N, SDValue &Disp,
20008 SDValue &Base,
20009 SelectionDAG &DAG) const {
20010
20011 PPC::AddrMode Mode = PPC::AM_XForm;
20012 int16_t ForceXFormImm = 0;
20013 if (provablyDisjointOr(DAG, N) &&
20014 !isIntS16Immediate(Op: N.getOperand(i: 1), Imm&: ForceXFormImm)) {
20015 Disp = N.getOperand(i: 0);
20016 Base = N.getOperand(i: 1);
20017 return Mode;
20018 }
20019
20020  // If the address is the result of an add, we will utilize the fact that the
20021  // address calculation includes an implicit add. However, we can reduce
20022  // register pressure if we do not materialize a constant just for use as the
20023  // index register. We only get rid of the add if it is not an add of a value
20024  // and a 16-bit signed constant where both operands have a single use.
20025 if (N.getOpcode() == ISD::ADD &&
20026 (!isIntS16Immediate(Op: N.getOperand(i: 1), Imm&: ForceXFormImm) ||
20027 !N.getOperand(i: 1).hasOneUse() || !N.getOperand(i: 0).hasOneUse())) {
20028 Disp = N.getOperand(i: 0);
20029 Base = N.getOperand(i: 1);
20030 return Mode;
20031 }
20032
20033 // Otherwise, use R0 as the base register.
20034 Disp = DAG.getRegister(Reg: Subtarget.isPPC64() ? PPC::ZERO8 : PPC::ZERO,
20035 VT: N.getValueType());
20036 Base = N;
20037
20038 return Mode;
20039}
20040
20041bool PPCTargetLowering::splitValueIntoRegisterParts(
20042 SelectionDAG &DAG, const SDLoc &DL, SDValue Val, SDValue *Parts,
20043 unsigned NumParts, MVT PartVT, std::optional<CallingConv::ID> CC) const {
20044 EVT ValVT = Val.getValueType();
20045 // If we are splitting a scalar integer into f64 parts (i.e. so they
20046 // can be placed into VFRC registers), we need to zero extend and
20047 // bitcast the values. This will ensure the value is placed into a
20048 // VSR using direct moves or stack operations as needed.
20049 if (PartVT == MVT::f64 &&
20050 (ValVT == MVT::i32 || ValVT == MVT::i16 || ValVT == MVT::i8)) {
20051 Val = DAG.getNode(Opcode: ISD::ZERO_EXTEND, DL, VT: MVT::i64, Operand: Val);
20052 Val = DAG.getNode(Opcode: ISD::BITCAST, DL, VT: MVT::f64, Operand: Val);
20053 Parts[0] = Val;
20054 return true;
20055 }
20056 return false;
20057}
20058
20059SDValue PPCTargetLowering::lowerToLibCall(const char *LibCallName, SDValue Op,
20060 SelectionDAG &DAG) const {
20061 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
20062 TargetLowering::CallLoweringInfo CLI(DAG);
20063 EVT RetVT = Op.getValueType();
20064 Type *RetTy = RetVT.getTypeForEVT(Context&: *DAG.getContext());
20065 SDValue Callee =
20066 DAG.getExternalSymbol(Sym: LibCallName, VT: TLI.getPointerTy(DL: DAG.getDataLayout()));
20067 bool SignExtend = TLI.shouldSignExtendTypeInLibCall(Ty: RetTy, IsSigned: false);
20068 TargetLowering::ArgListTy Args;
20069 for (const SDValue &N : Op->op_values()) {
20070 EVT ArgVT = N.getValueType();
20071 Type *ArgTy = ArgVT.getTypeForEVT(Context&: *DAG.getContext());
20072 TargetLowering::ArgListEntry Entry(N, ArgTy);
20073 Entry.IsSExt = TLI.shouldSignExtendTypeInLibCall(Ty: ArgTy, IsSigned: SignExtend);
20074 Entry.IsZExt = !Entry.IsSExt;
20075 Args.push_back(x: Entry);
20076 }
20077
20078 SDValue InChain = DAG.getEntryNode();
20079 SDValue TCChain = InChain;
20080 const Function &F = DAG.getMachineFunction().getFunction();
20081 bool isTailCall =
20082 TLI.isInTailCallPosition(DAG, Node: Op.getNode(), Chain&: TCChain) &&
20083 (RetTy == F.getReturnType() || F.getReturnType()->isVoidTy());
20084 if (isTailCall)
20085 InChain = TCChain;
20086 CLI.setDebugLoc(SDLoc(Op))
20087 .setChain(InChain)
20088 .setLibCallee(CC: CallingConv::C, ResultType: RetTy, Target: Callee, ArgsList: std::move(Args))
20089 .setTailCall(isTailCall)
20090 .setSExtResult(SignExtend)
20091 .setZExtResult(!SignExtend)
20092 .setIsPostTypeLegalization(true);
20093 return TLI.LowerCallTo(CLI).first;
20094}
20095
20096SDValue PPCTargetLowering::lowerLibCallBasedOnType(
20097 const char *LibCallFloatName, const char *LibCallDoubleName, SDValue Op,
20098 SelectionDAG &DAG) const {
20099 if (Op.getValueType() == MVT::f32)
20100 return lowerToLibCall(LibCallName: LibCallFloatName, Op, DAG);
20101
20102 if (Op.getValueType() == MVT::f64)
20103 return lowerToLibCall(LibCallName: LibCallDoubleName, Op, DAG);
20104
20105 return SDValue();
20106}
20107
20108bool PPCTargetLowering::isLowringToMASSFiniteSafe(SDValue Op) const {
20109 SDNodeFlags Flags = Op.getNode()->getFlags();
20110 return isLowringToMASSSafe(Op) && Flags.hasNoSignedZeros() &&
20111 Flags.hasNoNaNs() && Flags.hasNoInfs();
20112}
20113
20114bool PPCTargetLowering::isLowringToMASSSafe(SDValue Op) const {
20115 return Op.getNode()->getFlags().hasApproximateFuncs();
20116}
20117
20118bool PPCTargetLowering::isScalarMASSConversionEnabled() const {
20119 return getTargetMachine().Options.PPCGenScalarMASSEntries;
20120}
20121
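// Lower a scalar math operation to a MASS library call. The *_finite entry
// points are used only when the node's fast-math flags rule out NaNs,
// infinities and signed zeroes; otherwise the regular entry points are used.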
20122SDValue PPCTargetLowering::lowerLibCallBase(const char *LibCallDoubleName,
20123 const char *LibCallFloatName,
20124 const char *LibCallDoubleNameFinite,
20125 const char *LibCallFloatNameFinite,
20126 SDValue Op,
20127 SelectionDAG &DAG) const {
20128 if (!isScalarMASSConversionEnabled() || !isLowringToMASSSafe(Op))
20129 return SDValue();
20130
20131 if (!isLowringToMASSFiniteSafe(Op))
20132 return lowerLibCallBasedOnType(LibCallFloatName, LibCallDoubleName, Op,
20133 DAG);
20134
20135 return lowerLibCallBasedOnType(LibCallFloatName: LibCallFloatNameFinite,
20136 LibCallDoubleName: LibCallDoubleNameFinite, Op, DAG);
20137}
20138
20139SDValue PPCTargetLowering::lowerPow(SDValue Op, SelectionDAG &DAG) const {
20140 return lowerLibCallBase(LibCallDoubleName: "__xl_pow", LibCallFloatName: "__xl_powf", LibCallDoubleNameFinite: "__xl_pow_finite",
20141 LibCallFloatNameFinite: "__xl_powf_finite", Op, DAG);
20142}
20143
20144SDValue PPCTargetLowering::lowerSin(SDValue Op, SelectionDAG &DAG) const {
20145 return lowerLibCallBase(LibCallDoubleName: "__xl_sin", LibCallFloatName: "__xl_sinf", LibCallDoubleNameFinite: "__xl_sin_finite",
20146 LibCallFloatNameFinite: "__xl_sinf_finite", Op, DAG);
20147}
20148
20149SDValue PPCTargetLowering::lowerCos(SDValue Op, SelectionDAG &DAG) const {
20150 return lowerLibCallBase(LibCallDoubleName: "__xl_cos", LibCallFloatName: "__xl_cosf", LibCallDoubleNameFinite: "__xl_cos_finite",
20151 LibCallFloatNameFinite: "__xl_cosf_finite", Op, DAG);
20152}
20153
20154SDValue PPCTargetLowering::lowerLog(SDValue Op, SelectionDAG &DAG) const {
20155 return lowerLibCallBase(LibCallDoubleName: "__xl_log", LibCallFloatName: "__xl_logf", LibCallDoubleNameFinite: "__xl_log_finite",
20156 LibCallFloatNameFinite: "__xl_logf_finite", Op, DAG);
20157}
20158
20159SDValue PPCTargetLowering::lowerLog10(SDValue Op, SelectionDAG &DAG) const {
20160 return lowerLibCallBase(LibCallDoubleName: "__xl_log10", LibCallFloatName: "__xl_log10f", LibCallDoubleNameFinite: "__xl_log10_finite",
20161 LibCallFloatNameFinite: "__xl_log10f_finite", Op, DAG);
20162}
20163
20164SDValue PPCTargetLowering::lowerExp(SDValue Op, SelectionDAG &DAG) const {
20165 return lowerLibCallBase(LibCallDoubleName: "__xl_exp", LibCallFloatName: "__xl_expf", LibCallDoubleNameFinite: "__xl_exp_finite",
20166 LibCallFloatNameFinite: "__xl_expf_finite", Op, DAG);
20167}
20168
20169// If we happen to match to an aligned D-Form, check if the Frame Index is
20170// adequately aligned. If it is not, reset the mode to match to X-Form.
20171static void setXFormForUnalignedFI(SDValue N, unsigned Flags,
20172 PPC::AddrMode &Mode) {
20173 if (!isa<FrameIndexSDNode>(Val: N))
20174 return;
20175 if ((Mode == PPC::AM_DSForm && !(Flags & PPC::MOF_RPlusSImm16Mult4)) ||
20176 (Mode == PPC::AM_DQForm && !(Flags & PPC::MOF_RPlusSImm16Mult16)))
20177 Mode = PPC::AM_XForm;
20178}
20179
20180/// SelectOptimalAddrMode - Based on a node N and its Parent (a MemSDNode),
20181/// compute the address flags of the node, get the optimal address mode based
20182/// on the flags, and set the Base and Disp based on the address mode.
20183PPC::AddrMode PPCTargetLowering::SelectOptimalAddrMode(const SDNode *Parent,
20184 SDValue N, SDValue &Disp,
20185 SDValue &Base,
20186 SelectionDAG &DAG,
20187 MaybeAlign Align) const {
20188 SDLoc DL(Parent);
20189
20190 // Compute the address flags.
20191 unsigned Flags = computeMOFlags(Parent, N, DAG);
20192
20193 // Get the optimal address mode based on the Flags.
20194 PPC::AddrMode Mode = getAddrModeForFlags(Flags);
20195
20196 // If the address mode is DS-Form or DQ-Form, check if the FI is aligned.
20197 // Select an X-Form load if it is not.
20198 setXFormForUnalignedFI(N, Flags, Mode);
20199
20200  // Set the mode to PC-Relative addressing if we have a valid PC-Rel node.
20201 if ((Mode == PPC::AM_XForm) && isPCRelNode(N)) {
20202 assert(Subtarget.isUsingPCRelativeCalls() &&
20203 "Must be using PC-Relative calls when a valid PC-Relative node is "
20204 "present!");
20205 Mode = PPC::AM_PCRel;
20206 }
20207
20208 // Set Base and Disp accordingly depending on the address mode.
20209 switch (Mode) {
20210 case PPC::AM_DForm:
20211 case PPC::AM_DSForm:
20212 case PPC::AM_DQForm: {
20213    // This is a register plus a 16-bit immediate. The base will be the
20214    // register and the displacement will be the immediate, unless the
20215    // immediate is not sufficiently aligned.
20216 if (Flags & PPC::MOF_RPlusSImm16) {
20217 SDValue Op0 = N.getOperand(i: 0);
20218 SDValue Op1 = N.getOperand(i: 1);
20219 int16_t Imm = Op1->getAsZExtVal();
20220 if (!Align || isAligned(Lhs: *Align, SizeInBytes: Imm)) {
20221 Disp = DAG.getSignedTargetConstant(Val: Imm, DL, VT: N.getValueType());
20222 Base = Op0;
20223 if (FrameIndexSDNode *FI = dyn_cast<FrameIndexSDNode>(Val&: Op0)) {
20224 Base = DAG.getTargetFrameIndex(FI: FI->getIndex(), VT: N.getValueType());
20225 fixupFuncForFI(DAG, FrameIdx: FI->getIndex(), VT: N.getValueType());
20226 }
20227 break;
20228 }
20229 }
20230 // This is a register plus the @lo relocation. The base is the register
20231 // and the displacement is the global address.
20232 else if (Flags & PPC::MOF_RPlusLo) {
20233 Disp = N.getOperand(i: 1).getOperand(i: 0); // The global address.
20234 assert(Disp.getOpcode() == ISD::TargetGlobalAddress ||
20235 Disp.getOpcode() == ISD::TargetGlobalTLSAddress ||
20236 Disp.getOpcode() == ISD::TargetConstantPool ||
20237 Disp.getOpcode() == ISD::TargetJumpTable);
20238 Base = N.getOperand(i: 0);
20239 break;
20240 }
20241    // This is a constant address of at most 32 bits. The base will be
20242 // zero or load-immediate-shifted and the displacement will be
20243 // the low 16 bits of the address.
20244 else if (Flags & PPC::MOF_AddrIsSImm32) {
20245 auto *CN = cast<ConstantSDNode>(Val&: N);
20246 EVT CNType = CN->getValueType(ResNo: 0);
20247 uint64_t CNImm = CN->getZExtValue();
20248 // If this address fits entirely in a 16-bit sext immediate field, codegen
20249 // this as "d, 0".
20250 int16_t Imm;
20251 if (isIntS16Immediate(N: CN, Imm) && (!Align || isAligned(Lhs: *Align, SizeInBytes: Imm))) {
20252 Disp = DAG.getSignedTargetConstant(Val: Imm, DL, VT: CNType);
20253 Base = DAG.getRegister(Reg: Subtarget.isPPC64() ? PPC::ZERO8 : PPC::ZERO,
20254 VT: CNType);
20255 break;
20256 }
20257 // Handle 32-bit sext immediate with LIS + Addr mode.
20258 if ((CNType == MVT::i32 || isInt<32>(x: CNImm)) &&
20259 (!Align || isAligned(Lhs: *Align, SizeInBytes: CNImm))) {
20260 int32_t Addr = (int32_t)CNImm;
20261 // Otherwise, break this down into LIS + Disp.
20262 Disp = DAG.getSignedTargetConstant(Val: (int16_t)Addr, DL, VT: MVT::i32);
20263 Base = DAG.getSignedTargetConstant(Val: (Addr - (int16_t)Addr) >> 16, DL,
20264 VT: MVT::i32);
20265 uint32_t LIS = CNType == MVT::i32 ? PPC::LIS : PPC::LIS8;
20266 Base = SDValue(DAG.getMachineNode(Opcode: LIS, dl: DL, VT: CNType, Op1: Base), 0);
20267 break;
20268 }
20269 }
20270    // Otherwise, the PPC::MOF_NotAddNorCst flag is set. The load/store is not foldable.
20271 Disp = DAG.getTargetConstant(Val: 0, DL, VT: getPointerTy(DL: DAG.getDataLayout()));
20272 if (FrameIndexSDNode *FI = dyn_cast<FrameIndexSDNode>(Val&: N)) {
20273 Base = DAG.getTargetFrameIndex(FI: FI->getIndex(), VT: N.getValueType());
20274 fixupFuncForFI(DAG, FrameIdx: FI->getIndex(), VT: N.getValueType());
20275 } else
20276 Base = N;
20277 break;
20278 }
20279 case PPC::AM_PrefixDForm: {
20280 int64_t Imm34 = 0;
20281 unsigned Opcode = N.getOpcode();
20282 if (((Opcode == ISD::ADD) || (Opcode == ISD::OR)) &&
20283 (isIntS34Immediate(Op: N.getOperand(i: 1), Imm&: Imm34))) {
20284      // N is an ADD/OR node, and its second operand is a 34-bit signed immediate.
20285 Disp = DAG.getSignedTargetConstant(Val: Imm34, DL, VT: N.getValueType());
20286 if (FrameIndexSDNode *FI = dyn_cast<FrameIndexSDNode>(Val: N.getOperand(i: 0)))
20287 Base = DAG.getTargetFrameIndex(FI: FI->getIndex(), VT: N.getValueType());
20288 else
20289 Base = N.getOperand(i: 0);
20290 } else if (isIntS34Immediate(Op: N, Imm&: Imm34)) {
20291 // The address is a 34-bit signed immediate.
20292 Disp = DAG.getSignedTargetConstant(Val: Imm34, DL, VT: N.getValueType());
20293 Base = DAG.getRegister(Reg: PPC::ZERO8, VT: N.getValueType());
20294 }
20295 break;
20296 }
20297 case PPC::AM_PCRel: {
20298 // When selecting PC-Relative instructions, "Base" is not utilized as
20299 // we select the address as [PC+imm].
20300 Disp = N;
20301 break;
20302 }
20303 case PPC::AM_None:
20304 break;
20305 default: { // By default, X-Form is always available to be selected.
20306 // When a frame index is not aligned, we also match by XForm.
20307 FrameIndexSDNode *FI = dyn_cast<FrameIndexSDNode>(Val&: N);
20308 Base = FI ? N : N.getOperand(i: 1);
20309 Disp = FI ? DAG.getRegister(Reg: Subtarget.isPPC64() ? PPC::ZERO8 : PPC::ZERO,
20310 VT: N.getValueType())
20311 : N.getOperand(i: 0);
20312 break;
20313 }
20314 }
20315 return Mode;
20316}
20317
20318CCAssignFn *PPCTargetLowering::ccAssignFnForCall(CallingConv::ID CC,
20319 bool Return,
20320 bool IsVarArg) const {
20321 switch (CC) {
20322 case CallingConv::Cold:
20323 return (Return ? RetCC_PPC_Cold : CC_PPC64_ELF);
20324 default:
20325 return CC_PPC64_ELF;
20326 }
20327}
20328
20329bool PPCTargetLowering::shouldInlineQuadwordAtomics() const {
20330 return Subtarget.isPPC64() && Subtarget.hasQuadwordAtomics();
20331}
20332
20333TargetLowering::AtomicExpansionKind
20334PPCTargetLowering::shouldExpandAtomicRMWInIR(const AtomicRMWInst *AI) const {
20335 unsigned Size = AI->getType()->getPrimitiveSizeInBits();
20336 if (shouldInlineQuadwordAtomics() && Size == 128)
20337 return AtomicExpansionKind::MaskedIntrinsic;
20338
20339 switch (AI->getOperation()) {
20340 case AtomicRMWInst::UIncWrap:
20341 case AtomicRMWInst::UDecWrap:
20342 case AtomicRMWInst::USubCond:
20343 case AtomicRMWInst::USubSat:
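    // These RMW operations are not directly supported, so expand them through a
    // compare-and-swap loop.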
20344 return AtomicExpansionKind::CmpXChg;
20345 default:
20346 return TargetLowering::shouldExpandAtomicRMWInIR(RMW: AI);
20347 }
20348
20349 llvm_unreachable("unreachable atomicrmw operation");
20350}
20351
20352TargetLowering::AtomicExpansionKind
20353PPCTargetLowering::shouldExpandAtomicCmpXchgInIR(
20354 const AtomicCmpXchgInst *AI) const {
20355 unsigned Size = AI->getNewValOperand()->getType()->getPrimitiveSizeInBits();
20356 if (shouldInlineQuadwordAtomics() && Size == 128)
20357 return AtomicExpansionKind::MaskedIntrinsic;
20358 return AtomicExpansionKind::LLSC;
20359}
20360
20361static Intrinsic::ID
20362getIntrinsicForAtomicRMWBinOp128(AtomicRMWInst::BinOp BinOp) {
20363 switch (BinOp) {
20364 default:
20365 llvm_unreachable("Unexpected AtomicRMW BinOp");
20366 case AtomicRMWInst::Xchg:
20367 return Intrinsic::ppc_atomicrmw_xchg_i128;
20368 case AtomicRMWInst::Add:
20369 return Intrinsic::ppc_atomicrmw_add_i128;
20370 case AtomicRMWInst::Sub:
20371 return Intrinsic::ppc_atomicrmw_sub_i128;
20372 case AtomicRMWInst::And:
20373 return Intrinsic::ppc_atomicrmw_and_i128;
20374 case AtomicRMWInst::Or:
20375 return Intrinsic::ppc_atomicrmw_or_i128;
20376 case AtomicRMWInst::Xor:
20377 return Intrinsic::ppc_atomicrmw_xor_i128;
20378 case AtomicRMWInst::Nand:
20379 return Intrinsic::ppc_atomicrmw_nand_i128;
20380 }
20381}
20382
20383Value *PPCTargetLowering::emitMaskedAtomicRMWIntrinsic(
20384 IRBuilderBase &Builder, AtomicRMWInst *AI, Value *AlignedAddr, Value *Incr,
20385 Value *Mask, Value *ShiftAmt, AtomicOrdering Ord) const {
20386 assert(shouldInlineQuadwordAtomics() && "Only support quadword now");
20387 Module *M = Builder.GetInsertBlock()->getParent()->getParent();
20388 Type *ValTy = Incr->getType();
20389 assert(ValTy->getPrimitiveSizeInBits() == 128);
20390 Type *Int64Ty = Type::getInt64Ty(C&: M->getContext());
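  // Split the 128-bit increment into low and high 64-bit halves; the quadword
  // atomic intrinsics take these as separate i64 arguments.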
20391 Value *IncrLo = Builder.CreateTrunc(V: Incr, DestTy: Int64Ty, Name: "incr_lo");
20392 Value *IncrHi =
20393 Builder.CreateTrunc(V: Builder.CreateLShr(LHS: Incr, RHS: 64), DestTy: Int64Ty, Name: "incr_hi");
20394 Value *LoHi = Builder.CreateIntrinsic(
20395 ID: getIntrinsicForAtomicRMWBinOp128(BinOp: AI->getOperation()), Types: {},
20396 Args: {AlignedAddr, IncrLo, IncrHi});
20397 Value *Lo = Builder.CreateExtractValue(Agg: LoHi, Idxs: 0, Name: "lo");
20398 Value *Hi = Builder.CreateExtractValue(Agg: LoHi, Idxs: 1, Name: "hi");
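  // Reassemble the {lo, hi} pair returned by the intrinsic into a single
  // 128-bit result.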
20399 Lo = Builder.CreateZExt(V: Lo, DestTy: ValTy, Name: "lo64");
20400 Hi = Builder.CreateZExt(V: Hi, DestTy: ValTy, Name: "hi64");
20401 return Builder.CreateOr(
20402 LHS: Lo, RHS: Builder.CreateShl(LHS: Hi, RHS: ConstantInt::get(Ty: ValTy, V: 64)), Name: "val64");
20403}
20404
20405Value *PPCTargetLowering::emitMaskedAtomicCmpXchgIntrinsic(
20406 IRBuilderBase &Builder, AtomicCmpXchgInst *CI, Value *AlignedAddr,
20407 Value *CmpVal, Value *NewVal, Value *Mask, AtomicOrdering Ord) const {
20408 assert(shouldInlineQuadwordAtomics() && "Only support quadword now");
20409 Module *M = Builder.GetInsertBlock()->getParent()->getParent();
20410 Type *ValTy = CmpVal->getType();
20411 assert(ValTy->getPrimitiveSizeInBits() == 128);
20412 Function *IntCmpXchg =
20413 Intrinsic::getOrInsertDeclaration(M, id: Intrinsic::ppc_cmpxchg_i128);
20414 Type *Int64Ty = Type::getInt64Ty(C&: M->getContext());
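  // Split both the expected and the new 128-bit values into 64-bit halves for
  // the i128 cmpxchg intrinsic.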
20415 Value *CmpLo = Builder.CreateTrunc(V: CmpVal, DestTy: Int64Ty, Name: "cmp_lo");
20416 Value *CmpHi =
20417 Builder.CreateTrunc(V: Builder.CreateLShr(LHS: CmpVal, RHS: 64), DestTy: Int64Ty, Name: "cmp_hi");
20418 Value *NewLo = Builder.CreateTrunc(V: NewVal, DestTy: Int64Ty, Name: "new_lo");
20419 Value *NewHi =
20420 Builder.CreateTrunc(V: Builder.CreateLShr(LHS: NewVal, RHS: 64), DestTy: Int64Ty, Name: "new_hi");
20421 emitLeadingFence(Builder, Inst: CI, Ord);
20422 Value *LoHi =
20423 Builder.CreateCall(Callee: IntCmpXchg, Args: {AlignedAddr, CmpLo, CmpHi, NewLo, NewHi});
20424 emitTrailingFence(Builder, Inst: CI, Ord);
20425 Value *Lo = Builder.CreateExtractValue(Agg: LoHi, Idxs: 0, Name: "lo");
20426 Value *Hi = Builder.CreateExtractValue(Agg: LoHi, Idxs: 1, Name: "hi");
20427 Lo = Builder.CreateZExt(V: Lo, DestTy: ValTy, Name: "lo64");
20428 Hi = Builder.CreateZExt(V: Hi, DestTy: ValTy, Name: "hi64");
20429 return Builder.CreateOr(
20430 LHS: Lo, RHS: Builder.CreateShl(LHS: Hi, RHS: ConstantInt::get(Ty: ValTy, V: 64)), Name: "val64");
20431}
20432
20433bool PPCTargetLowering::hasMultipleConditionRegisters(EVT VT) const {
20434 return Subtarget.useCRBits();
20435}
20436