1 | //===- ExpandReductions.cpp - Expand reduction intrinsics -----------------===// |
2 | // |
3 | // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. |
4 | // See https://llvm.org/LICENSE.txt for license information. |
5 | // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception |
6 | // |
7 | //===----------------------------------------------------------------------===// |
8 | // |
9 | // This pass implements IR expansion for reduction intrinsics, allowing targets |
10 | // to enable the intrinsics until just before codegen. |
11 | // |
12 | //===----------------------------------------------------------------------===// |
13 | |
14 | #include "llvm/CodeGen/ExpandReductions.h" |
15 | #include "llvm/Analysis/TargetTransformInfo.h" |
16 | #include "llvm/CodeGen/Passes.h" |
17 | #include "llvm/IR/IRBuilder.h" |
18 | #include "llvm/IR/InstIterator.h" |
19 | #include "llvm/IR/IntrinsicInst.h" |
20 | #include "llvm/IR/Intrinsics.h" |
21 | #include "llvm/InitializePasses.h" |
22 | #include "llvm/Pass.h" |
23 | #include "llvm/Transforms/Utils/LoopUtils.h" |
24 | |
25 | using namespace llvm; |
26 | |
27 | namespace { |
28 | |
29 | bool expandReductions(Function &F, const TargetTransformInfo *TTI) { |
30 | bool Changed = false; |
31 | SmallVector<IntrinsicInst *, 4> Worklist; |
32 | for (auto &I : instructions(F)) { |
33 | if (auto *II = dyn_cast<IntrinsicInst>(Val: &I)) { |
34 | switch (II->getIntrinsicID()) { |
35 | default: break; |
36 | case Intrinsic::vector_reduce_fadd: |
37 | case Intrinsic::vector_reduce_fmul: |
38 | case Intrinsic::vector_reduce_add: |
39 | case Intrinsic::vector_reduce_mul: |
40 | case Intrinsic::vector_reduce_and: |
41 | case Intrinsic::vector_reduce_or: |
42 | case Intrinsic::vector_reduce_xor: |
43 | case Intrinsic::vector_reduce_smax: |
44 | case Intrinsic::vector_reduce_smin: |
45 | case Intrinsic::vector_reduce_umax: |
46 | case Intrinsic::vector_reduce_umin: |
47 | case Intrinsic::vector_reduce_fmax: |
48 | case Intrinsic::vector_reduce_fmin: |
49 | if (TTI->shouldExpandReduction(II)) |
50 | Worklist.push_back(Elt: II); |
51 | |
52 | break; |
53 | } |
54 | } |
55 | } |
56 | |
57 | for (auto *II : Worklist) { |
58 | FastMathFlags FMF = |
59 | isa<FPMathOperator>(Val: II) ? II->getFastMathFlags() : FastMathFlags{}; |
60 | Intrinsic::ID ID = II->getIntrinsicID(); |
61 | RecurKind RK = getMinMaxReductionRecurKind(RdxID: ID); |
62 | TargetTransformInfo::ReductionShuffle RS = |
63 | TTI->getPreferredExpandedReductionShuffle(II); |
64 | |
65 | Value *Rdx = nullptr; |
66 | IRBuilder<> Builder(II); |
67 | IRBuilder<>::FastMathFlagGuard FMFGuard(Builder); |
68 | Builder.setFastMathFlags(FMF); |
69 | switch (ID) { |
70 | default: llvm_unreachable("Unexpected intrinsic!" ); |
71 | case Intrinsic::vector_reduce_fadd: |
72 | case Intrinsic::vector_reduce_fmul: { |
73 | // FMFs must be attached to the call, otherwise it's an ordered reduction |
74 | // and it can't be handled by generating a shuffle sequence. |
75 | Value *Acc = II->getArgOperand(i: 0); |
76 | Value *Vec = II->getArgOperand(i: 1); |
77 | unsigned RdxOpcode = getArithmeticReductionInstruction(RdxID: ID); |
78 | if (!FMF.allowReassoc()) |
79 | Rdx = getOrderedReduction(Builder, Acc, Src: Vec, Op: RdxOpcode, MinMaxKind: RK); |
80 | else { |
81 | if (!isPowerOf2_32( |
82 | Value: cast<FixedVectorType>(Val: Vec->getType())->getNumElements())) |
83 | continue; |
84 | Rdx = getShuffleReduction(Builder, Src: Vec, Op: RdxOpcode, RS, MinMaxKind: RK); |
85 | Rdx = Builder.CreateBinOp(Opc: (Instruction::BinaryOps)RdxOpcode, LHS: Acc, RHS: Rdx, |
86 | Name: "bin.rdx" ); |
87 | } |
88 | break; |
89 | } |
90 | case Intrinsic::vector_reduce_and: |
91 | case Intrinsic::vector_reduce_or: { |
92 | // Canonicalize logical or/and reductions: |
93 | // Or reduction for i1 is represented as: |
94 | // %val = bitcast <ReduxWidth x i1> to iReduxWidth |
95 | // %res = cmp ne iReduxWidth %val, 0 |
96 | // And reduction for i1 is represented as: |
97 | // %val = bitcast <ReduxWidth x i1> to iReduxWidth |
98 | // %res = cmp eq iReduxWidth %val, 11111 |
99 | Value *Vec = II->getArgOperand(i: 0); |
100 | auto *FTy = cast<FixedVectorType>(Val: Vec->getType()); |
101 | unsigned NumElts = FTy->getNumElements(); |
102 | if (!isPowerOf2_32(Value: NumElts)) |
103 | continue; |
104 | |
105 | if (FTy->getElementType() == Builder.getInt1Ty()) { |
106 | Rdx = Builder.CreateBitCast(V: Vec, DestTy: Builder.getIntNTy(N: NumElts)); |
107 | if (ID == Intrinsic::vector_reduce_and) { |
108 | Rdx = Builder.CreateICmpEQ( |
109 | LHS: Rdx, RHS: ConstantInt::getAllOnesValue(Ty: Rdx->getType())); |
110 | } else { |
111 | assert(ID == Intrinsic::vector_reduce_or && "Expected or reduction." ); |
112 | Rdx = Builder.CreateIsNotNull(Arg: Rdx); |
113 | } |
114 | break; |
115 | } |
116 | unsigned RdxOpcode = getArithmeticReductionInstruction(RdxID: ID); |
117 | Rdx = getShuffleReduction(Builder, Src: Vec, Op: RdxOpcode, RS, MinMaxKind: RK); |
118 | break; |
119 | } |
120 | case Intrinsic::vector_reduce_add: |
121 | case Intrinsic::vector_reduce_mul: |
122 | case Intrinsic::vector_reduce_xor: |
123 | case Intrinsic::vector_reduce_smax: |
124 | case Intrinsic::vector_reduce_smin: |
125 | case Intrinsic::vector_reduce_umax: |
126 | case Intrinsic::vector_reduce_umin: { |
127 | Value *Vec = II->getArgOperand(i: 0); |
128 | if (!isPowerOf2_32( |
129 | Value: cast<FixedVectorType>(Val: Vec->getType())->getNumElements())) |
130 | continue; |
131 | unsigned RdxOpcode = getArithmeticReductionInstruction(RdxID: ID); |
132 | Rdx = getShuffleReduction(Builder, Src: Vec, Op: RdxOpcode, RS, MinMaxKind: RK); |
133 | break; |
134 | } |
135 | case Intrinsic::vector_reduce_fmax: |
136 | case Intrinsic::vector_reduce_fmin: { |
137 | // We require "nnan" to use a shuffle reduction; "nsz" is implied by the |
138 | // semantics of the reduction. |
139 | Value *Vec = II->getArgOperand(i: 0); |
140 | if (!isPowerOf2_32( |
141 | Value: cast<FixedVectorType>(Val: Vec->getType())->getNumElements()) || |
142 | !FMF.noNaNs()) |
143 | continue; |
144 | unsigned RdxOpcode = getArithmeticReductionInstruction(RdxID: ID); |
145 | Rdx = getShuffleReduction(Builder, Src: Vec, Op: RdxOpcode, RS, MinMaxKind: RK); |
146 | break; |
147 | } |
148 | } |
149 | II->replaceAllUsesWith(V: Rdx); |
150 | II->eraseFromParent(); |
151 | Changed = true; |
152 | } |
153 | return Changed; |
154 | } |
155 | |
156 | class ExpandReductions : public FunctionPass { |
157 | public: |
158 | static char ID; |
159 | ExpandReductions() : FunctionPass(ID) { |
160 | initializeExpandReductionsPass(*PassRegistry::getPassRegistry()); |
161 | } |
162 | |
163 | bool runOnFunction(Function &F) override { |
164 | const auto *TTI =&getAnalysis<TargetTransformInfoWrapperPass>().getTTI(F); |
165 | return expandReductions(F, TTI); |
166 | } |
167 | |
168 | void getAnalysisUsage(AnalysisUsage &AU) const override { |
169 | AU.addRequired<TargetTransformInfoWrapperPass>(); |
170 | AU.setPreservesCFG(); |
171 | } |
172 | }; |
173 | } |
174 | |
char ExpandReductions::ID;

// Register the legacy pass (flag name "expand-reductions") and declare its
// TargetTransformInfo analysis dependency with the PassRegistry.
INITIALIZE_PASS_BEGIN(ExpandReductions, "expand-reductions",
                      "Expand reduction intrinsics", false, false)
INITIALIZE_PASS_DEPENDENCY(TargetTransformInfoWrapperPass)
INITIALIZE_PASS_END(ExpandReductions, "expand-reductions",
                    "Expand reduction intrinsics", false, false)
181 | |
182 | FunctionPass *llvm::createExpandReductionsPass() { |
183 | return new ExpandReductions(); |
184 | } |
185 | |
186 | PreservedAnalyses ExpandReductionsPass::run(Function &F, |
187 | FunctionAnalysisManager &AM) { |
188 | const auto &TTI = AM.getResult<TargetIRAnalysis>(IR&: F); |
189 | if (!expandReductions(F, TTI: &TTI)) |
190 | return PreservedAnalyses::all(); |
191 | PreservedAnalyses PA; |
192 | PA.preserveSet<CFGAnalyses>(); |
193 | return PA; |
194 | } |
195 | |