//===----- SVEIntrinsicOpts - SVE ACLE Intrinsics Opts --------------------===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
//
// Performs general IR level optimizations on SVE intrinsics.
//
// This pass performs the following optimizations:
//
// - removes unnecessary ptrue intrinsics (llvm.aarch64.sve.ptrue), e.g.:
//     %1 = @llvm.aarch64.sve.ptrue.nxv4i1(i32 31)
//     %2 = @llvm.aarch64.sve.ptrue.nxv8i1(i32 31)
//     ; (%1 can be replaced with a reinterpret of %2)
//
// - optimizes ptest intrinsics where the operands are being needlessly
//   converted to and from svbool_t.
//
//===----------------------------------------------------------------------===//

#include "AArch64.h"
#include "Utils/AArch64BaseInfo.h"
#include "llvm/ADT/PostOrderIterator.h"
#include "llvm/ADT/SetVector.h"
#include "llvm/IR/Constants.h"
#include "llvm/IR/Dominators.h"
#include "llvm/IR/IRBuilder.h"
#include "llvm/IR/Instructions.h"
#include "llvm/IR/IntrinsicInst.h"
#include "llvm/IR/IntrinsicsAArch64.h"
#include "llvm/IR/LLVMContext.h"
#include "llvm/IR/Module.h"
#include "llvm/IR/PatternMatch.h"
#include "llvm/InitializePasses.h"
#include <optional>

using namespace llvm;
using namespace llvm::PatternMatch;

#define DEBUG_TYPE "aarch64-sve-intrinsic-opts"

namespace {
struct SVEIntrinsicOpts : public ModulePass {
  static char ID; // Pass identification, replacement for typeid
  SVEIntrinsicOpts() : ModulePass(ID) {}

  bool runOnModule(Module &M) override;
  void getAnalysisUsage(AnalysisUsage &AU) const override;

private:
  bool coalescePTrueIntrinsicCalls(BasicBlock &BB,
                                   SmallSetVector<IntrinsicInst *, 4> &PTrues);
  bool optimizePTrueIntrinsicCalls(SmallSetVector<Function *, 4> &Functions);
  bool optimizePredicateStore(Instruction *I);
  bool optimizePredicateLoad(Instruction *I);

  bool optimizeInstructions(SmallSetVector<Function *, 4> &Functions);

  /// Operates at function scope, i.e. optimizations are applied locally,
  /// within the functions themselves.
  bool optimizeFunctions(SmallSetVector<Function *, 4> &Functions);
};
} // end anonymous namespace

void SVEIntrinsicOpts::getAnalysisUsage(AnalysisUsage &AU) const {
  AU.addRequired<DominatorTreeWrapperPass>();
  AU.setPreservesCFG();
}

char SVEIntrinsicOpts::ID = 0;
static const char *name = "SVE intrinsics optimizations";
INITIALIZE_PASS_BEGIN(SVEIntrinsicOpts, DEBUG_TYPE, name, false, false)
INITIALIZE_PASS_DEPENDENCY(DominatorTreeWrapperPass)
INITIALIZE_PASS_END(SVEIntrinsicOpts, DEBUG_TYPE, name, false, false)

ModulePass *llvm::createSVEIntrinsicOptsPass() {
  return new SVEIntrinsicOpts();
}

/// Checks if a ptrue intrinsic call is promoted. The act of promoting a
/// ptrue will introduce zeroing. For example:
///
///   %1 = <vscale x 4 x i1> call @llvm.aarch64.sve.ptrue.nxv4i1(i32 31)
///   %2 = <vscale x 16 x i1> call @llvm.aarch64.sve.convert.to.svbool.nxv4i1(<vscale x 4 x i1> %1)
///   %3 = <vscale x 8 x i1> call @llvm.aarch64.sve.convert.from.svbool.nxv8i1(<vscale x 16 x i1> %2)
///
/// %1 is promoted, because it is converted:
///
///   <vscale x 4 x i1> => <vscale x 16 x i1> => <vscale x 8 x i1>
///
/// via a sequence of the SVE reinterpret intrinsics convert.{to,from}.svbool.
static bool isPTruePromoted(IntrinsicInst *PTrue) {
  // Find all users of this intrinsic that are calls to convert-to-svbool
  // reinterpret intrinsics.
  SmallVector<IntrinsicInst *, 4> ConvertToUses;
  for (User *User : PTrue->users()) {
    if (match(User, m_Intrinsic<Intrinsic::aarch64_sve_convert_to_svbool>())) {
      ConvertToUses.push_back(cast<IntrinsicInst>(User));
    }
  }

  // If no such calls were found, this ptrue is not promoted.
  if (ConvertToUses.empty())
    return false;

  // Otherwise, try to find users of the convert-to-svbool intrinsics that are
  // calls to the convert-from-svbool intrinsic, and would result in some lanes
  // being zeroed.
  const auto *PTrueVTy = cast<ScalableVectorType>(PTrue->getType());
  for (IntrinsicInst *ConvertToUse : ConvertToUses) {
    for (User *User : ConvertToUse->users()) {
      auto *IntrUser = dyn_cast<IntrinsicInst>(User);
      if (IntrUser && IntrUser->getIntrinsicID() ==
                          Intrinsic::aarch64_sve_convert_from_svbool) {
        const auto *IntrUserVTy = cast<ScalableVectorType>(IntrUser->getType());

        // Would some lanes become zeroed by the conversion?
        if (IntrUserVTy->getElementCount().getKnownMinValue() >
            PTrueVTy->getElementCount().getKnownMinValue())
          // This is a promoted ptrue.
          return true;
      }
    }
  }

  // If no matching calls were found, this is not a promoted ptrue.
  return false;
}

/// Attempts to coalesce ptrues in a basic block.
bool SVEIntrinsicOpts::coalescePTrueIntrinsicCalls(
    BasicBlock &BB, SmallSetVector<IntrinsicInst *, 4> &PTrues) {
  if (PTrues.size() <= 1)
    return false;

  // Find the ptrue with the most lanes.
  auto *MostEncompassingPTrue =
      *llvm::max_element(PTrues, [](auto *PTrue1, auto *PTrue2) {
        auto *PTrue1VTy = cast<ScalableVectorType>(PTrue1->getType());
        auto *PTrue2VTy = cast<ScalableVectorType>(PTrue2->getType());
        return PTrue1VTy->getElementCount().getKnownMinValue() <
               PTrue2VTy->getElementCount().getKnownMinValue();
      });

  // Remove the most encompassing ptrue, as well as any promoted ptrues, leaving
  // behind only the ptrues to be coalesced.
  PTrues.remove(MostEncompassingPTrue);
  PTrues.remove_if(isPTruePromoted);
  // Hoist MostEncompassingPTrue to the start of the basic block. It is always
  // safe to do this, since a ptrue intrinsic call has no operands defined by
  // other instructions; its only operand is an immediate pattern.
  MostEncompassingPTrue->moveBefore(BB, BB.getFirstInsertionPt());

  LLVMContext &Ctx = BB.getContext();
  IRBuilder<> Builder(Ctx);
  Builder.SetInsertPoint(&BB, ++MostEncompassingPTrue->getIterator());

  auto *MostEncompassingPTrueVTy =
      cast<VectorType>(MostEncompassingPTrue->getType());
  auto *ConvertToSVBool = Builder.CreateIntrinsic(
      Intrinsic::aarch64_sve_convert_to_svbool, {MostEncompassingPTrueVTy},
      {MostEncompassingPTrue});

  bool ConvertFromCreated = false;
  for (auto *PTrue : PTrues) {
    auto *PTrueVTy = cast<VectorType>(PTrue->getType());

    // Only create the converts if the types are not already the same, otherwise
    // just use the most encompassing ptrue.
    if (MostEncompassingPTrueVTy != PTrueVTy) {
      ConvertFromCreated = true;

      Builder.SetInsertPoint(&BB, ++ConvertToSVBool->getIterator());
      auto *ConvertFromSVBool =
          Builder.CreateIntrinsic(Intrinsic::aarch64_sve_convert_from_svbool,
                                  {PTrueVTy}, {ConvertToSVBool});
      PTrue->replaceAllUsesWith(ConvertFromSVBool);
    } else
      PTrue->replaceAllUsesWith(MostEncompassingPTrue);

    PTrue->eraseFromParent();
  }

  // We never used the ConvertTo so remove it
  if (!ConvertFromCreated)
    ConvertToSVBool->eraseFromParent();

  return true;
}

/// The goal of this function is to remove redundant calls to the SVE ptrue
/// intrinsic in each basic block within the given functions.
///
/// SVE ptrues have two representations in LLVM IR:
/// - a logical representation -- an arbitrary-width scalable vector of i1s,
///   i.e. <vscale x N x i1>.
/// - a physical representation (svbool, <vscale x 16 x i1>) -- a 16-element
///   scalable vector of i1s, i.e. <vscale x 16 x i1>.
///
/// The SVE ptrue intrinsic is used to create a logical representation of an SVE
/// predicate. Suppose that we have two SVE ptrue intrinsic calls: P1 and P2. If
/// P1 creates a logical SVE predicate that is at least as wide as the logical
/// SVE predicate created by P2, then all of the bits that are true in the
/// physical representation of P2 are necessarily also true in the physical
/// representation of P1. P1 'encompasses' P2; therefore, the intrinsic call to
/// P2 is redundant and can be replaced by an SVE reinterpret of P1 via
/// convert.{to,from}.svbool.
///
/// Currently, this pass only coalesces calls to SVE ptrue intrinsics
/// if they match the following conditions:
///
/// - the call to the intrinsic uses either the SV_ALL or SV_POW2 patterns.
///   SV_ALL indicates that all bits of the predicate vector are to be set to
///   true. SV_POW2 indicates that all bits of the predicate vector up to the
///   largest power-of-two are to be set to true.
/// - the result of the call to the intrinsic is not promoted to a wider
///   predicate. In this case, keeping the extra ptrue leads to better codegen
///   -- coalescing here would create an irreducible chain of SVE reinterprets
///   via convert.{to,from}.svbool.
///
/// EXAMPLE:
///
///   %1 = <vscale x 8 x i1> ptrue(i32 SV_ALL)
///   ; Logical:  <1, 1, 1, 1, 1, 1, 1, 1>
///   ; Physical: <1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0>
///   ...
///
///   %2 = <vscale x 4 x i1> ptrue(i32 SV_ALL)
///   ; Logical:  <1, 1, 1, 1>
///   ; Physical: <1, 0, 0, 0, 1, 0, 0, 0, 1, 0, 0, 0, 1, 0, 0, 0>
///   ...
///
/// Here, %2 can be replaced by an SVE reinterpret of %1, giving, for instance:
///
///   %1 = <vscale x 8 x i1> ptrue(i32 31)
///   %2 = <vscale x 16 x i1> convert.to.svbool(<vscale x 8 x i1> %1)
///   %3 = <vscale x 4 x i1> convert.from.svbool(<vscale x 16 x i1> %2)
///
bool SVEIntrinsicOpts::optimizePTrueIntrinsicCalls(
    SmallSetVector<Function *, 4> &Functions) {
  bool Changed = false;

  for (auto *F : Functions) {
    for (auto &BB : *F) {
      SmallSetVector<IntrinsicInst *, 4> SVAllPTrues;
      SmallSetVector<IntrinsicInst *, 4> SVPow2PTrues;

      // For each basic block, collect the used ptrues and try to coalesce them.
      for (Instruction &I : BB) {
        if (I.use_empty())
          continue;

        auto *IntrI = dyn_cast<IntrinsicInst>(&I);
        if (!IntrI || IntrI->getIntrinsicID() != Intrinsic::aarch64_sve_ptrue)
          continue;

        const auto PTruePattern =
            cast<ConstantInt>(IntrI->getOperand(0))->getZExtValue();

        if (PTruePattern == AArch64SVEPredPattern::all)
          SVAllPTrues.insert(IntrI);
        if (PTruePattern == AArch64SVEPredPattern::pow2)
          SVPow2PTrues.insert(IntrI);
      }

      Changed |= coalescePTrueIntrinsicCalls(BB, SVAllPTrues);
      Changed |= coalescePTrueIntrinsicCalls(BB, SVPow2PTrues);
    }
  }

  return Changed;
}

// This is done in SVEIntrinsicOpts rather than InstCombine so that we introduce
// scalable stores as late as possible
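//
// For illustration only (a sketch, not lifted from a specific test), the kind
// of IR this recognises, assuming the function carries vscale_range(2,2) so
// that FixedPredType below is <4 x i8>:
//
//   %bc  = bitcast <vscale x 16 x i1> %pred to <vscale x 2 x i8>
//   %ext = call <4 x i8> @llvm.vector.extract.v4i8.nxv2i8(<vscale x 2 x i8> %bc, i64 0)
//   store <4 x i8> %ext, ptr %addr
//
// is rewritten into a single store of the predicate itself:
//
//   store <vscale x 16 x i1> %pred, ptr %addr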
bool SVEIntrinsicOpts::optimizePredicateStore(Instruction *I) {
  auto *F = I->getFunction();
  auto Attr = F->getFnAttribute(Attribute::VScaleRange);
  if (!Attr.isValid())
    return false;

  unsigned MinVScale = Attr.getVScaleRangeMin();
  std::optional<unsigned> MaxVScale = Attr.getVScaleRangeMax();
  // The transform needs to know the exact runtime length of scalable vectors
  if (!MaxVScale || MinVScale != MaxVScale)
    return false;

  auto *PredType =
      ScalableVectorType::get(Type::getInt1Ty(I->getContext()), 16);
  auto *FixedPredType =
      FixedVectorType::get(Type::getInt8Ty(I->getContext()), MinVScale * 2);

  // If we have a store..
  auto *Store = dyn_cast<StoreInst>(I);
  if (!Store || !Store->isSimple())
    return false;

  // ..that is storing a predicate register's worth of bits..
  if (Store->getOperand(0)->getType() != FixedPredType)
    return false;

  // ..where the value stored comes from a vector extract..
  auto *IntrI = dyn_cast<IntrinsicInst>(Store->getOperand(0));
  if (!IntrI || IntrI->getIntrinsicID() != Intrinsic::vector_extract)
    return false;

  // ..that is extracting from index 0..
  if (!cast<ConstantInt>(IntrI->getOperand(1))->isZero())
    return false;

  // ..where the value being extracted comes from a bitcast..
  auto *BitCast = dyn_cast<BitCastInst>(IntrI->getOperand(0));
  if (!BitCast)
    return false;

  // ..and the bitcast is casting from predicate type
  if (BitCast->getOperand(0)->getType() != PredType)
    return false;

  IRBuilder<> Builder(I->getContext());
  Builder.SetInsertPoint(I);

  Builder.CreateStore(BitCast->getOperand(0), Store->getPointerOperand());

  Store->eraseFromParent();
  if (IntrI->use_empty())
    IntrI->eraseFromParent();
  if (BitCast->use_empty())
    BitCast->eraseFromParent();

  return true;
}

// This is done in SVEIntrinsicOpts rather than InstCombine so that we introduce
// scalable loads as late as possible
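//
// Again for illustration only (same vscale_range(2,2) assumption), a sketch
// of the pattern handled here:
//
//   %ld  = load <4 x i8>, ptr %addr
//   %ins = call <vscale x 2 x i8> @llvm.vector.insert.nxv2i8.v4i8(<vscale x 2 x i8> undef, <4 x i8> %ld, i64 0)
//   %bc  = bitcast <vscale x 2 x i8> %ins to <vscale x 16 x i1>
//
// is rewritten into a single scalable load of the predicate:
//
//   %pred = load <vscale x 16 x i1>, ptr %addr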
bool SVEIntrinsicOpts::optimizePredicateLoad(Instruction *I) {
  auto *F = I->getFunction();
  auto Attr = F->getFnAttribute(Attribute::VScaleRange);
  if (!Attr.isValid())
    return false;

  unsigned MinVScale = Attr.getVScaleRangeMin();
  std::optional<unsigned> MaxVScale = Attr.getVScaleRangeMax();
  // The transform needs to know the exact runtime length of scalable vectors
  if (!MaxVScale || MinVScale != MaxVScale)
    return false;

  auto *PredType =
      ScalableVectorType::get(Type::getInt1Ty(I->getContext()), 16);
  auto *FixedPredType =
      FixedVectorType::get(Type::getInt8Ty(I->getContext()), MinVScale * 2);

  // If we have a bitcast..
  auto *BitCast = dyn_cast<BitCastInst>(I);
  if (!BitCast || BitCast->getType() != PredType)
    return false;

  // ..whose operand is a vector_insert..
  auto *IntrI = dyn_cast<IntrinsicInst>(BitCast->getOperand(0));
  if (!IntrI || IntrI->getIntrinsicID() != Intrinsic::vector_insert)
    return false;

  // ..that is inserting into index zero of an undef vector..
  if (!isa<UndefValue>(IntrI->getOperand(0)) ||
      !cast<ConstantInt>(IntrI->getOperand(2))->isZero())
    return false;

  // ..where the value inserted comes from a load..
  auto *Load = dyn_cast<LoadInst>(IntrI->getOperand(1));
  if (!Load || !Load->isSimple())
    return false;

  // ..that is loading a predicate register's worth of bits..
  if (Load->getType() != FixedPredType)
    return false;

  IRBuilder<> Builder(I->getContext());
  Builder.SetInsertPoint(Load);

  auto *LoadPred = Builder.CreateLoad(PredType, Load->getPointerOperand());

  BitCast->replaceAllUsesWith(LoadPred);
  BitCast->eraseFromParent();
  if (IntrI->use_empty())
    IntrI->eraseFromParent();
  if (Load->use_empty())
    Load->eraseFromParent();

  return true;
}

bool SVEIntrinsicOpts::optimizeInstructions(
    SmallSetVector<Function *, 4> &Functions) {
  bool Changed = false;

  for (auto *F : Functions) {
    DominatorTree *DT = &getAnalysis<DominatorTreeWrapperPass>(*F).getDomTree();

    // Traverse the DT with an rpo walk so we see defs before uses, allowing
    // simplification to be done incrementally.
    BasicBlock *Root = DT->getRoot();
    ReversePostOrderTraversal<BasicBlock *> RPOT(Root);
    for (auto *BB : RPOT) {
      for (Instruction &I : make_early_inc_range(*BB)) {
        switch (I.getOpcode()) {
        case Instruction::Store:
          Changed |= optimizePredicateStore(&I);
          break;
        case Instruction::BitCast:
          Changed |= optimizePredicateLoad(&I);
          break;
        }
      }
    }
  }

  return Changed;
}

bool SVEIntrinsicOpts::optimizeFunctions(
    SmallSetVector<Function *, 4> &Functions) {
  bool Changed = false;

  Changed |= optimizePTrueIntrinsicCalls(Functions);
  Changed |= optimizeInstructions(Functions);

  return Changed;
}

bool SVEIntrinsicOpts::runOnModule(Module &M) {
  bool Changed = false;
  SmallSetVector<Function *, 4> Functions;

  // Check for SVE intrinsic declarations first so that we only iterate over
  // relevant functions. Where an appropriate declaration is found, store the
  // function(s) where it is used so we can target these only.
  for (auto &F : M.getFunctionList()) {
    if (!F.isDeclaration())
      continue;

    switch (F.getIntrinsicID()) {
    case Intrinsic::vector_extract:
    case Intrinsic::vector_insert:
    case Intrinsic::aarch64_sve_ptrue:
      for (User *U : F.users())
        Functions.insert(cast<Instruction>(U)->getFunction());
      break;
    default:
      break;
    }
  }

  if (!Functions.empty())
    Changed |= optimizeFunctions(Functions);

  return Changed;
}