1//===----- RISCVCodeGenPrepare.cpp ----------------------------------------===//
2//
3// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4// See https://llvm.org/LICENSE.txt for license information.
5// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6//
7//===----------------------------------------------------------------------===//
//
// This is a RISC-V specific version of CodeGenPrepare.
// It munges the code in the input function to better prepare it for
// SelectionDAG-based code generation. This works around limitations in its
// basic-block-at-a-time approach.
//
//===----------------------------------------------------------------------===//
15
16#include "RISCV.h"
17#include "RISCVTargetMachine.h"
18#include "llvm/ADT/Statistic.h"
19#include "llvm/Analysis/ValueTracking.h"
20#include "llvm/CodeGen/TargetPassConfig.h"
21#include "llvm/IR/Dominators.h"
22#include "llvm/IR/IRBuilder.h"
23#include "llvm/IR/InstVisitor.h"
24#include "llvm/IR/Intrinsics.h"
25#include "llvm/IR/PatternMatch.h"
26#include "llvm/InitializePasses.h"
27#include "llvm/Pass.h"
28#include "llvm/Transforms/Utils/Local.h"
29
30using namespace llvm;
31
32#define DEBUG_TYPE "riscv-codegenprepare"
33#define PASS_NAME "RISC-V CodeGenPrepare"
34
35namespace {
36class RISCVCodeGenPrepare : public InstVisitor<RISCVCodeGenPrepare, bool> {
37 Function &F;
38 const DataLayout *DL;
39 const DominatorTree *DT;
40 const RISCVSubtarget *ST;
41
42public:
43 RISCVCodeGenPrepare(Function &F, const DominatorTree *DT,
44 const RISCVSubtarget *ST)
45 : F(F), DL(&F.getDataLayout()), DT(DT), ST(ST) {}
46 bool run();
47 bool visitInstruction(Instruction &I) { return false; }
48 bool visitAnd(BinaryOperator &BO);
49 bool visitIntrinsicInst(IntrinsicInst &I);
50 bool expandVPStrideLoad(IntrinsicInst &I);
51 bool widenVPMerge(Instruction *I);
52 bool visitFreezeInst(FreezeInst &BO);
53};
54} // namespace
55
56namespace {
57class RISCVCodeGenPrepareLegacyPass : public FunctionPass {
58public:
59 static char ID;
60
61 RISCVCodeGenPrepareLegacyPass() : FunctionPass(ID) {}
62
63 bool runOnFunction(Function &F) override;
64 StringRef getPassName() const override { return PASS_NAME; }
65
66 void getAnalysisUsage(AnalysisUsage &AU) const override {
67 AU.setPreservesCFG();
68 AU.addRequired<DominatorTreeWrapperPass>();
69 AU.addRequired<TargetPassConfig>();
70 }
71};
72} // namespace
73
74// Try to optimize (i64 (and (zext/sext (i32 X), C1))) if C1 has bit 31 set,
75// but bits 63:32 are zero. If we know that bit 31 of X is 0, we can fill
76// the upper 32 bits with ones.
77bool RISCVCodeGenPrepare::visitAnd(BinaryOperator &BO) {
78 if (!ST->is64Bit())
79 return false;
80
81 if (!BO.getType()->isIntegerTy(Bitwidth: 64))
82 return false;
83
84 using namespace PatternMatch;
85
86 // Left hand side should be a zext nneg.
87 Value *LHSSrc;
88 if (!match(V: BO.getOperand(i_nocapture: 0), P: m_NNegZExt(Op: m_Value(V&: LHSSrc))))
89 return false;
90
91 if (!LHSSrc->getType()->isIntegerTy(Bitwidth: 32))
92 return false;
93
94 // Right hand side should be a constant.
95 Value *RHS = BO.getOperand(i_nocapture: 1);
96
97 auto *CI = dyn_cast<ConstantInt>(Val: RHS);
98 if (!CI)
99 return false;
100 uint64_t C = CI->getZExtValue();
101
102 // Look for constants that fit in 32 bits but not simm12, and can be made
103 // into simm12 by sign extending bit 31. This will allow use of ANDI.
104 // TODO: Is worth making simm32?
105 if (!isUInt<32>(x: C) || isInt<12>(x: C) || !isInt<12>(x: SignExtend64<32>(x: C)))
106 return false;
107
108 // Sign extend the constant and replace the And operand.
109 C = SignExtend64<32>(x: C);
110 BO.setOperand(i_nocapture: 1, Val_nocapture: ConstantInt::get(Ty: RHS->getType(), V: C));
111
112 return true;
113}
114
115// With EVL tail folding, an AnyOf reduction will generate an i1 vp.merge like
116// follows:
117//
118// loop:
119// %phi = phi <vscale x 4 x i1> [zeroinitializer, %entry], [%freeze, %loop]
120// %cmp = icmp ...
121// %rec = call <vscale x 4 x i1> @llvm.vp.merge(%cmp, i1 true, %phi, %evl)
122// %freeze = freeze <vscale x 4 x i1> %rec [optional]
123// ...
124// middle:
125// %res = call i1 @llvm.vector.reduce.or(<vscale x 4 x i1> %freeze)
126//
127// However RVV doesn't have any tail undisturbed mask instructions and so we
128// need a convoluted sequence of mask instructions to lower the i1 vp.merge: see
129// llvm/test/CodeGen/RISCV/rvv/vpmerge-sdnode.ll.
130//
131// To avoid that this widens the i1 vp.merge to an i8 vp.merge, which will
132// generate a single vmerge.vim:
133//
134// loop:
135// %phi = phi <vscale x 4 x i8> [zeroinitializer, %entry], [%freeze, %loop]
136// %cmp = icmp ...
137// %rec = call <vscale x 4 x i8> @llvm.vp.merge(%cmp, i8 true, %phi, %evl)
138// %freeze = freeze <vscale x 4 x i8> %rec
139// %trunc = trunc <vscale x 4 x i8> %freeze to <vscale x 4 x i1>
140// ...
141// middle:
142// %res = call i1 @llvm.vector.reduce.or(<vscale x 4 x i1> %trunc)
143//
144// The trunc will normally be sunk outside of the loop, but even if there are
145// users inside the loop it is still profitable.
146bool RISCVCodeGenPrepare::widenVPMerge(Instruction *Root) {
147 if (!Root->getType()->getScalarType()->isIntegerTy(Bitwidth: 1))
148 return false;
149
150 Value *Mask, *True, *PhiV, *EVL;
151 using namespace PatternMatch;
152 auto m_VPMerge = m_Intrinsic<Intrinsic::vp_merge>(
153 Op0: m_Value(V&: Mask), Op1: m_Value(V&: True), Op2: m_Value(V&: PhiV), Op3: m_Value(V&: EVL));
154 if (!match(V: Root, P: m_CombineOr(L: m_VPMerge, R: m_Freeze(Op: m_VPMerge))))
155 return false;
156
157 auto *Phi = dyn_cast<PHINode>(Val: PhiV);
158 if (!Phi || !Phi->hasOneUse() || Phi->getNumIncomingValues() != 2 ||
159 !match(V: Phi->getIncomingValue(i: 0), P: m_Zero()) ||
160 Phi->getIncomingValue(i: 1) != Root)
161 return false;
162
163 Type *WideTy =
164 VectorType::get(ElementType: IntegerType::getInt8Ty(C&: Root->getContext()),
165 EC: cast<VectorType>(Val: Root->getType())->getElementCount());
166
167 IRBuilder<> Builder(Phi);
168 PHINode *WidePhi = Builder.CreatePHI(Ty: WideTy, NumReservedValues: 2);
169 WidePhi->addIncoming(V: ConstantAggregateZero::get(Ty: WideTy),
170 BB: Phi->getIncomingBlock(i: 0));
171 Builder.SetInsertPoint(Root);
172 Value *WideTrue = Builder.CreateZExt(V: True, DestTy: WideTy);
173 Value *WideMerge = Builder.CreateIntrinsic(ID: Intrinsic::vp_merge, Types: {WideTy},
174 Args: {Mask, WideTrue, WidePhi, EVL});
175 if (isa<FreezeInst>(Val: Root))
176 WideMerge = Builder.CreateFreeze(V: WideMerge);
177 WidePhi->addIncoming(V: WideMerge, BB: Phi->getIncomingBlock(i: 1));
178 Value *Trunc = Builder.CreateTrunc(V: WideMerge, DestTy: Root->getType());
179
180 Root->replaceAllUsesWith(V: Trunc);
181
182 // Break the cycle and delete the old chain.
183 Phi->setIncomingValue(i: 1, V: Phi->getIncomingValue(i: 0));
184 llvm::RecursivelyDeleteTriviallyDeadInstructions(V: Root);
185
186 return true;
187}
188
189bool RISCVCodeGenPrepare::visitFreezeInst(FreezeInst &I) {
190 if (auto *II = dyn_cast<IntrinsicInst>(Val: I.getOperand(i_nocapture: 0)))
191 if (II->getIntrinsicID() == Intrinsic::vp_merge)
192 return widenVPMerge(Root: &I);
193 return false;
194}
195
196// LLVM vector reduction intrinsics return a scalar result, but on RISC-V vector
197// reduction instructions write the result in the first element of a vector
198// register. So when a reduction in a loop uses a scalar phi, we end up with
199// unnecessary scalar moves:
200//
201// loop:
202// vfmv.s.f v10, fa0
203// vfredosum.vs v8, v8, v10
204// vfmv.f.s fa0, v8
205//
206// This mainly affects ordered fadd reductions and VP reductions that have a
207// scalar start value, since other types of reduction typically use element-wise
208// vectorisation in the loop body. This tries to vectorize any scalar phis that
209// feed into these reductions:
210//
211// loop:
212// %phi = phi <float> [ ..., %entry ], [ %acc, %loop ]
213// %acc = call float @llvm.vector.reduce.fadd.nxv2f32(float %phi,
214// <vscale x 2 x float> %vec)
215//
216// ->
217//
218// loop:
219// %phi = phi <vscale x 2 x float> [ ..., %entry ], [ %acc.vec, %loop ]
220// %phi.scalar = extractelement <vscale x 2 x float> %phi, i64 0
221// %acc = call float @llvm.vector.reduce.fadd.nxv2f32(float %x,
222// <vscale x 2 x float> %vec)
223// %acc.vec = insertelement <vscale x 2 x float> poison, float %acc.next, i64 0
224//
225// Which eliminates the scalar -> vector -> scalar crossing during instruction
226// selection.
227bool RISCVCodeGenPrepare::visitIntrinsicInst(IntrinsicInst &I) {
228 if (expandVPStrideLoad(I))
229 return true;
230
231 if (widenVPMerge(Root: &I))
232 return true;
233
234 if (I.getIntrinsicID() != Intrinsic::vector_reduce_fadd &&
235 !isa<VPReductionIntrinsic>(Val: &I))
236 return false;
237
238 auto *PHI = dyn_cast<PHINode>(Val: I.getOperand(i_nocapture: 0));
239 if (!PHI || !PHI->hasOneUse() ||
240 !llvm::is_contained(Range: PHI->incoming_values(), Element: &I))
241 return false;
242
243 Type *VecTy = I.getOperand(i_nocapture: 1)->getType();
244 IRBuilder<> Builder(PHI);
245 auto *VecPHI = Builder.CreatePHI(Ty: VecTy, NumReservedValues: PHI->getNumIncomingValues());
246
247 for (auto *BB : PHI->blocks()) {
248 Builder.SetInsertPoint(BB->getTerminator());
249 Value *InsertElt = Builder.CreateInsertElement(
250 VecTy, NewElt: PHI->getIncomingValueForBlock(BB), Idx: (uint64_t)0);
251 VecPHI->addIncoming(V: InsertElt, BB);
252 }
253
254 Builder.SetInsertPoint(&I);
255 I.setOperand(i_nocapture: 0, Val_nocapture: Builder.CreateExtractElement(Vec: VecPHI, Idx: (uint64_t)0));
256
257 PHI->eraseFromParent();
258
259 return true;
260}
261
262// Always expand zero strided loads so we match more .vx splat patterns, even if
263// we have +optimized-zero-stride-loads. RISCVDAGToDAGISel::Select will convert
264// it back to a strided load if it's optimized.
265bool RISCVCodeGenPrepare::expandVPStrideLoad(IntrinsicInst &II) {
266 Value *BasePtr, *VL;
267
268 using namespace PatternMatch;
269 if (!match(V: &II, P: m_Intrinsic<Intrinsic::experimental_vp_strided_load>(
270 Op0: m_Value(V&: BasePtr), Op1: m_Zero(), Op2: m_AllOnes(), Op3: m_Value(V&: VL))))
271 return false;
272
273 // If SEW>XLEN then a splat will get lowered as a zero strided load anyway, so
274 // avoid expanding here.
275 if (II.getType()->getScalarSizeInBits() > ST->getXLen())
276 return false;
277
278 if (!isKnownNonZero(V: VL, Q: {*DL, DT, nullptr, &II}))
279 return false;
280
281 auto *VTy = cast<VectorType>(Val: II.getType());
282
283 IRBuilder<> Builder(&II);
284 Type *STy = VTy->getElementType();
285 Value *Val = Builder.CreateLoad(Ty: STy, Ptr: BasePtr);
286 Value *Res = Builder.CreateIntrinsic(
287 ID: Intrinsic::vp_merge, Types: VTy,
288 Args: {II.getOperand(i_nocapture: 2), Builder.CreateVectorSplat(EC: VTy->getElementCount(), V: Val),
289 PoisonValue::get(T: VTy), VL});
290
291 II.replaceAllUsesWith(V: Res);
292 II.eraseFromParent();
293 return true;
294}
295
296bool RISCVCodeGenPrepare::run() {
297 bool MadeChange = false;
298 for (auto &BB : F)
299 for (Instruction &I : llvm::make_early_inc_range(Range&: BB))
300 MadeChange |= visit(I);
301
302 return MadeChange;
303}
304
305bool RISCVCodeGenPrepareLegacyPass::runOnFunction(Function &F) {
306 if (skipFunction(F))
307 return false;
308
309 auto &TPC = getAnalysis<TargetPassConfig>();
310 auto &TM = TPC.getTM<RISCVTargetMachine>();
311 auto ST = &TM.getSubtarget<RISCVSubtarget>(F);
312 auto DT = &getAnalysis<DominatorTreeWrapperPass>().getDomTree();
313
314 RISCVCodeGenPrepare RVCGP(F, DT, ST);
315 return RVCGP.run();
316}
317
318INITIALIZE_PASS_BEGIN(RISCVCodeGenPrepareLegacyPass, DEBUG_TYPE, PASS_NAME,
319 false, false)
320INITIALIZE_PASS_DEPENDENCY(TargetPassConfig)
321INITIALIZE_PASS_END(RISCVCodeGenPrepareLegacyPass, DEBUG_TYPE, PASS_NAME, false,
322 false)
323
324char RISCVCodeGenPrepareLegacyPass::ID = 0;
325
326FunctionPass *llvm::createRISCVCodeGenPrepareLegacyPass() {
327 return new RISCVCodeGenPrepareLegacyPass();
328}
329
330PreservedAnalyses RISCVCodeGenPreparePass::run(Function &F,
331 FunctionAnalysisManager &FAM) {
332 DominatorTree *DT = &FAM.getResult<DominatorTreeAnalysis>(IR&: F);
333 auto ST = &TM->getSubtarget<RISCVSubtarget>(F);
334 bool Changed = RISCVCodeGenPrepare(F, DT, ST).run();
335 if (!Changed)
336 return PreservedAnalyses::all();
337
338 PreservedAnalyses PA = PreservedAnalyses::none();
339 PA.preserveSet<CFGAnalyses>();
340 return PA;
341}
342