1//===----- SVEIntrinsicOpts - SVE ACLE Intrinsics Opts --------------------===//
2//
3// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4// See https://llvm.org/LICENSE.txt for license information.
5// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6//
7//===----------------------------------------------------------------------===//
8//
9// Performs general IR level optimizations on SVE intrinsics.
10//
11// This pass performs the following optimizations:
12//
13// - removes unnecessary ptrue intrinsics (llvm.aarch64.sve.ptrue), e.g:
14// %1 = @llvm.aarch64.sve.ptrue.nxv4i1(i32 31)
15// %2 = @llvm.aarch64.sve.ptrue.nxv8i1(i32 31)
16// ; (%1 can be replaced with a reinterpret of %2)
17//
18// - optimizes ptest intrinsics where the operands are being needlessly
19// converted to and from svbool_t.
20//
21//===----------------------------------------------------------------------===//
22
23#include "AArch64.h"
24#include "Utils/AArch64BaseInfo.h"
25#include "llvm/ADT/PostOrderIterator.h"
26#include "llvm/ADT/SetVector.h"
27#include "llvm/IR/Constants.h"
28#include "llvm/IR/Dominators.h"
29#include "llvm/IR/IRBuilder.h"
30#include "llvm/IR/Instructions.h"
31#include "llvm/IR/IntrinsicInst.h"
32#include "llvm/IR/IntrinsicsAArch64.h"
33#include "llvm/IR/LLVMContext.h"
34#include "llvm/IR/Module.h"
35#include "llvm/IR/PatternMatch.h"
36#include "llvm/InitializePasses.h"
37#include <optional>
38
39using namespace llvm;
40using namespace llvm::PatternMatch;
41
42#define DEBUG_TYPE "aarch64-sve-intrinsic-opts"
43
44namespace {
45struct SVEIntrinsicOpts : public ModulePass {
46 static char ID; // Pass identification, replacement for typeid
47 SVEIntrinsicOpts() : ModulePass(ID) {}
48
49 bool runOnModule(Module &M) override;
50 void getAnalysisUsage(AnalysisUsage &AU) const override;
51
52private:
53 bool coalescePTrueIntrinsicCalls(BasicBlock &BB,
54 SmallSetVector<IntrinsicInst *, 4> &PTrues);
55 bool optimizePTrueIntrinsicCalls(SmallSetVector<Function *, 4> &Functions);
56};
57} // end anonymous namespace
58
59void SVEIntrinsicOpts::getAnalysisUsage(AnalysisUsage &AU) const {
60 AU.addRequired<DominatorTreeWrapperPass>();
61 AU.setPreservesCFG();
62}
63
64char SVEIntrinsicOpts::ID = 0;
65static const char *name = "SVE intrinsics optimizations";
66INITIALIZE_PASS_BEGIN(SVEIntrinsicOpts, DEBUG_TYPE, name, false, false)
67INITIALIZE_PASS_DEPENDENCY(DominatorTreeWrapperPass);
68INITIALIZE_PASS_END(SVEIntrinsicOpts, DEBUG_TYPE, name, false, false)
69
70ModulePass *llvm::createSVEIntrinsicOptsPass() {
71 return new SVEIntrinsicOpts();
72}
73
74/// Checks if a ptrue intrinsic call is promoted. The act of promoting a
75/// ptrue will introduce zeroing. For example:
76///
77/// %1 = <vscale x 4 x i1> call @llvm.aarch64.sve.ptrue.nxv4i1(i32 31)
78/// %2 = <vscale x 16 x i1> call @llvm.aarch64.sve.convert.to.svbool.nxv4i1(<vscale x 4 x i1> %1)
79/// %3 = <vscale x 8 x i1> call @llvm.aarch64.sve.convert.from.svbool.nxv8i1(<vscale x 16 x i1> %2)
80///
81/// %1 is promoted, because it is converted:
82///
83/// <vscale x 4 x i1> => <vscale x 16 x i1> => <vscale x 8 x i1>
84///
85/// via a sequence of the SVE reinterpret intrinsics convert.{to,from}.svbool.
86static bool isPTruePromoted(IntrinsicInst *PTrue) {
87 // Find all users of this intrinsic that are calls to convert-to-svbool
88 // reinterpret intrinsics.
89 SmallVector<IntrinsicInst *, 4> ConvertToUses;
90 for (User *User : PTrue->users()) {
91 if (match(V: User, P: m_Intrinsic<Intrinsic::aarch64_sve_convert_to_svbool>())) {
92 ConvertToUses.push_back(Elt: cast<IntrinsicInst>(Val: User));
93 }
94 }
95
96 // If no such calls were found, this is ptrue is not promoted.
97 if (ConvertToUses.empty())
98 return false;
99
100 // Otherwise, try to find users of the convert-to-svbool intrinsics that are
101 // calls to the convert-from-svbool intrinsic, and would result in some lanes
102 // being zeroed.
103 const auto *PTrueVTy = cast<ScalableVectorType>(Val: PTrue->getType());
104 for (IntrinsicInst *ConvertToUse : ConvertToUses) {
105 for (User *User : ConvertToUse->users()) {
106 auto *IntrUser = dyn_cast<IntrinsicInst>(Val: User);
107 if (IntrUser && IntrUser->getIntrinsicID() ==
108 Intrinsic::aarch64_sve_convert_from_svbool) {
109 const auto *IntrUserVTy = cast<ScalableVectorType>(Val: IntrUser->getType());
110
111 // Would some lanes become zeroed by the conversion?
112 if (IntrUserVTy->getElementCount().getKnownMinValue() >
113 PTrueVTy->getElementCount().getKnownMinValue())
114 // This is a promoted ptrue.
115 return true;
116 }
117 }
118 }
119
120 // If no matching calls were found, this is not a promoted ptrue.
121 return false;
122}
123
124/// Attempts to coalesce ptrues in a basic block.
125bool SVEIntrinsicOpts::coalescePTrueIntrinsicCalls(
126 BasicBlock &BB, SmallSetVector<IntrinsicInst *, 4> &PTrues) {
127 if (PTrues.size() <= 1)
128 return false;
129
130 // Find the ptrue with the most lanes.
131 auto *MostEncompassingPTrue =
132 *llvm::max_element(Range&: PTrues, C: [](auto *PTrue1, auto *PTrue2) {
133 auto *PTrue1VTy = cast<ScalableVectorType>(PTrue1->getType());
134 auto *PTrue2VTy = cast<ScalableVectorType>(PTrue2->getType());
135 return PTrue1VTy->getElementCount().getKnownMinValue() <
136 PTrue2VTy->getElementCount().getKnownMinValue();
137 });
138
139 // Remove the most encompassing ptrue, as well as any promoted ptrues, leaving
140 // behind only the ptrues to be coalesced.
141 PTrues.remove(X: MostEncompassingPTrue);
142 PTrues.remove_if(P: isPTruePromoted);
143
144 // Hoist MostEncompassingPTrue to the start of the basic block. It is always
145 // safe to do this, since ptrue intrinsic calls are guaranteed to have no
146 // predecessors.
147 MostEncompassingPTrue->moveBefore(BB, I: BB.getFirstInsertionPt());
148
149 LLVMContext &Ctx = BB.getContext();
150 IRBuilder<> Builder(Ctx);
151 Builder.SetInsertPoint(TheBB: &BB, IP: ++MostEncompassingPTrue->getIterator());
152
153 auto *MostEncompassingPTrueVTy =
154 cast<VectorType>(Val: MostEncompassingPTrue->getType());
155 auto *ConvertToSVBool = Builder.CreateIntrinsicWithoutFolding(
156 ID: Intrinsic::aarch64_sve_convert_to_svbool, OverloadTypes: {MostEncompassingPTrueVTy},
157 Args: {MostEncompassingPTrue});
158
159 bool ConvertFromCreated = false;
160 for (auto *PTrue : PTrues) {
161 auto *PTrueVTy = cast<VectorType>(Val: PTrue->getType());
162
163 // Only create the converts if the types are not already the same, otherwise
164 // just use the most encompassing ptrue.
165 if (MostEncompassingPTrueVTy != PTrueVTy) {
166 ConvertFromCreated = true;
167
168 Builder.SetInsertPoint(TheBB: &BB, IP: ++ConvertToSVBool->getIterator());
169 auto *ConvertFromSVBool =
170 Builder.CreateIntrinsic(ID: Intrinsic::aarch64_sve_convert_from_svbool,
171 OverloadTypes: {PTrueVTy}, Args: {ConvertToSVBool});
172 PTrue->replaceAllUsesWith(V: ConvertFromSVBool);
173 } else
174 PTrue->replaceAllUsesWith(V: MostEncompassingPTrue);
175
176 PTrue->eraseFromParent();
177 }
178
179 // We never used the ConvertTo so remove it
180 if (!ConvertFromCreated)
181 ConvertToSVBool->eraseFromParent();
182
183 return true;
184}
185
186/// The goal of this function is to remove redundant calls to the SVE ptrue
187/// intrinsic in each basic block within the given functions.
188///
189/// SVE ptrues have two representations in LLVM IR:
190/// - a logical representation -- an arbitrary-width scalable vector of i1s,
191/// i.e. <vscale x N x i1>.
192/// - a physical representation (svbool, <vscale x 16 x i1>) -- a 16-element
193/// scalable vector of i1s, i.e. <vscale x 16 x i1>.
194///
195/// The SVE ptrue intrinsic is used to create a logical representation of an SVE
196/// predicate. Suppose that we have two SVE ptrue intrinsic calls: P1 and P2. If
197/// P1 creates a logical SVE predicate that is at least as wide as the logical
198/// SVE predicate created by P2, then all of the bits that are true in the
199/// physical representation of P2 are necessarily also true in the physical
200/// representation of P1. P1 'encompasses' P2, therefore, the intrinsic call to
201/// P2 is redundant and can be replaced by an SVE reinterpret of P1 via
202/// convert.{to,from}.svbool.
203///
204/// Currently, this pass only coalesces calls to SVE ptrue intrinsics
205/// if they match the following conditions:
206///
207/// - the call to the intrinsic uses either the SV_ALL or SV_POW2 patterns.
208/// SV_ALL indicates that all bits of the predicate vector are to be set to
209/// true. SV_POW2 indicates that all bits of the predicate vector up to the
210/// largest power-of-two are to be set to true.
211/// - the result of the call to the intrinsic is not promoted to a wider
212/// predicate. In this case, keeping the extra ptrue leads to better codegen
213/// -- coalescing here would create an irreducible chain of SVE reinterprets
214/// via convert.{to,from}.svbool.
215///
216/// EXAMPLE:
217///
218/// %1 = <vscale x 8 x i1> ptrue(i32 SV_ALL)
219/// ; Logical: <1, 1, 1, 1, 1, 1, 1, 1>
220/// ; Physical: <1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0>
221/// ...
222///
223/// %2 = <vscale x 4 x i1> ptrue(i32 SV_ALL)
224/// ; Logical: <1, 1, 1, 1>
225/// ; Physical: <1, 0, 0, 0, 1, 0, 0, 0, 1, 0, 0, 0, 1, 0, 0, 0>
226/// ...
227///
228/// Here, %2 can be replaced by an SVE reinterpret of %1, giving, for instance:
229///
230/// %1 = <vscale x 8 x i1> ptrue(i32 i31)
231/// %2 = <vscale x 16 x i1> convert.to.svbool(<vscale x 8 x i1> %1)
232/// %3 = <vscale x 4 x i1> convert.from.svbool(<vscale x 16 x i1> %2)
233///
234bool SVEIntrinsicOpts::optimizePTrueIntrinsicCalls(
235 SmallSetVector<Function *, 4> &Functions) {
236 bool Changed = false;
237
238 for (auto *F : Functions) {
239 for (auto &BB : *F) {
240 SmallSetVector<IntrinsicInst *, 4> SVAllPTrues;
241 SmallSetVector<IntrinsicInst *, 4> SVPow2PTrues;
242
243 // For each basic block, collect the used ptrues and try to coalesce them.
244 for (Instruction &I : BB) {
245 if (I.use_empty())
246 continue;
247
248 auto *IntrI = dyn_cast<IntrinsicInst>(Val: &I);
249 if (!IntrI || IntrI->getIntrinsicID() != Intrinsic::aarch64_sve_ptrue)
250 continue;
251
252 const auto PTruePattern =
253 cast<ConstantInt>(Val: IntrI->getOperand(i_nocapture: 0))->getZExtValue();
254
255 if (PTruePattern == AArch64SVEPredPattern::all)
256 SVAllPTrues.insert(X: IntrI);
257 if (PTruePattern == AArch64SVEPredPattern::pow2)
258 SVPow2PTrues.insert(X: IntrI);
259 }
260
261 Changed |= coalescePTrueIntrinsicCalls(BB, PTrues&: SVAllPTrues);
262 Changed |= coalescePTrueIntrinsicCalls(BB, PTrues&: SVPow2PTrues);
263 }
264 }
265
266 return Changed;
267}
268
269bool SVEIntrinsicOpts::runOnModule(Module &M) {
270 bool Changed = false;
271 SmallSetVector<Function *, 4> Functions;
272
273 // Check for SVE intrinsic declarations first so that we only iterate over
274 // relevant functions. Where an appropriate declaration is found, store the
275 // function(s) where it is used so we can target these only.
276 for (auto &F : M.getFunctionList()) {
277 if (!F.isDeclaration())
278 continue;
279
280 switch (F.getIntrinsicID()) {
281 case Intrinsic::aarch64_sve_ptrue:
282 for (User *U : F.users())
283 Functions.insert(X: cast<Instruction>(Val: U)->getFunction());
284 break;
285 default:
286 break;
287 }
288 }
289
290 if (!Functions.empty())
291 Changed |= optimizePTrueIntrinsicCalls(Functions);
292
293 return Changed;
294}
295