ARMParallelDSP.cpp source code [llvm_projects/llvm/lib/Target/ARM/ARMParallelDSP.cpp]

1	//===- ARMParallelDSP.cpp - Parallel DSP Pass -----------------------------===//
2	//
3	// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4	// See https://llvm.org/LICENSE.txt for license information.
5	// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6	//
7	//===----------------------------------------------------------------------===//
8	//
9	/// \file
10	/// Armv6 introduced instructions to perform 32-bit SIMD operations. The
11	/// purpose of this pass is do some IR pattern matching to create ACLE
12	/// DSP intrinsics, which map on these 32-bit SIMD operations.
13	/// This pass runs only when unaligned accesses is supported/enabled.
14	//
15	//===----------------------------------------------------------------------===//
16
17	#include "ARM.h"
18	#include "ARMSubtarget.h"
19	#include "llvm/ADT/SmallPtrSet.h"
20	#include "llvm/ADT/Statistic.h"
21	#include "llvm/Analysis/AliasAnalysis.h"
22	#include "llvm/Analysis/AssumptionCache.h"
23	#include "llvm/Analysis/GlobalsModRef.h"
24	#include "llvm/Analysis/LoopAccessAnalysis.h"
25	#include "llvm/Analysis/TargetLibraryInfo.h"
26	#include "llvm/CodeGen/TargetPassConfig.h"
27	#include "llvm/IR/IRBuilder.h"
28	#include "llvm/IR/Instructions.h"
29	#include "llvm/IR/IntrinsicsARM.h"
30	#include "llvm/IR/Module.h"
31	#include "llvm/IR/NoFolder.h"
32	#include "llvm/IR/PatternMatch.h"
33	#include "llvm/Pass.h"
34	#include "llvm/Support/Debug.h"
35	#include "llvm/Transforms/Scalar.h"
36	#include "llvm/Transforms/Utils/BasicBlockUtils.h"
37
38	using namespace llvm;
39	using namespace PatternMatch;
40
41	#define DEBUG_TYPE "arm-parallel-dsp"
42
43	STATISTIC(NumSMLAD , "Number of smlad instructions generated");
44
45	static cl::opt<bool>
46	DisableParallelDSP("disable-arm-parallel-dsp", cl::Hidden, cl::init(Val: false),
47	cl::desc ("Disable the ARM Parallel DSP pass"));
48
49	static cl::opt<unsigned>
50	NumLoadLimit("arm-parallel-dsp-load-limit", cl::Hidden, cl::init(Val: `16`),
51	cl::desc ("Limit the number of loads analysed"));
52
53	namespace {
54	struct MulCandidate;
55	class Reduction;
56
57	using MulCandList = SmallVector<std::unique_ptr<MulCandidate>, `8`>;
58	using MemInstList = SmallVectorImpl<LoadInst*>;
59	using MulPairList = SmallVector<std::pair<MulCandidate, MulCandidate>, `8`>;
60
61	// 'MulCandidate' holds the multiplication instructions that are candidates
62	// for parallel execution.
63	struct MulCandidate {
64	Instruction *Root;
65	Value* LHS;
66	Value* RHS;
67	bool Exchange = false;
68	bool Paired = false;
69	SmallVector<LoadInst, `2`> VecLd; // Container for loads to widen.*
70
71	MulCandidate(Instruction I, Value lhs, Value *rhs) :
72	Root(I), LHS(lhs), RHS(rhs) { }
73
74	bool HasTwoLoadInputs() const {
75	return isa<LoadInst>(Val: LHS) && isa<LoadInst>(Val: RHS);
76	}
77
78	LoadInst getBaseLoad() const* {
79	return VecLd.front();
80	}
81	};
82
83	/// Represent a sequence of multiply-accumulate operations with the aim to
84	/// perform the multiplications in parallel.
85	class Reduction {
86	Instruction Root = nullptr*;
87	Value Acc = nullptr*;
88	MulCandList Muls;
89	MulPairList MulPairs;
90	SetVector<Instruction*> Adds;
91
92	public:
93	Reduction() = delete;
94
95	Reduction (Instruction *Add) : Root(Add) { }
96
97	/// Record an Add instruction that is a part of the this reduction.
98	void InsertAdd(Instruction *I) { Adds.insert(X: I); }
99
100	/// Create MulCandidates, each rooted at a Mul instruction, that is a part
101	/// of this reduction.
102	void InsertMuls() {
103	auto GetMulOperand = [](Value V) -> Instruction {
104	if (auto *SExt = dyn_cast<SExtInst>(Val: V)) {
105	if (auto *I = dyn_cast<Instruction>(Val: SExt->getOperand(i_nocapture: `0`)))
106	if (I->getOpcode() == Instruction::Mul)
107	return I;
108	} else if (auto *I = dyn_cast<Instruction>(Val: V)) {
109	if (I->getOpcode() == Instruction::Mul)
110	return I;
111	}
112	return nullptr;
113	};
114
115	auto InsertMul = [this](Instruction *I) {
116	Value *LHS = cast<Instruction>(Val: I->getOperand(i: `0`))->getOperand(i: `0`);
117	Value *RHS = cast<Instruction>(Val: I->getOperand(i: `1`))->getOperand(i: `0`);
118	Muls.push_back(Elt: std::make_unique<MulCandidate>(args&: I, args&: LHS, args&: RHS));
119	};
120
121	for (auto *Add : Adds) {
122	if (Add == Acc)
123	continue;
124	if (auto *Mul = GetMulOperand(Add->getOperand(i: `0`)))
125	InsertMul(Mul);
126	if (auto *Mul = GetMulOperand(Add->getOperand(i: `1`)))
127	InsertMul(Mul);
128	}
129	}
130
131	/// Add the incoming accumulator value, returns true if a value had not
132	/// already been added. Returning false signals to the user that this
133	/// reduction already has a value to initialise the accumulator.
134	bool InsertAcc(Value *V) {
135	if (Acc)
136	return false;
137	Acc = V;
138	return true;
139	}
140
141	/// Set two MulCandidates, rooted at muls, that can be executed as a single
142	/// parallel operation.
143	void AddMulPair(MulCandidate Mul0, MulCandidate Mul1,
144	bool Exchange = false) {
145	LLVM_DEBUG(dbgs() << "Pairing:\n"
146	<< *Mul0->Root << "\n"
147	<< *Mul1->Root << "\n");
148	Mul0->Paired = true;
149	Mul1->Paired = true;
150	if (Exchange)
151	Mul1->Exchange = true;
152	MulPairs.push_back(Elt: std::make_pair(x&: Mul0, y&: Mul1));
153	}
154
155	/// Return the add instruction which is the root of the reduction.
156	Instruction getRoot() { return* Root; }
157
158	bool is64Bit() const { return Root->getType()->isIntegerTy(Bitwidth: `64`); }
159
160	Type getType() const* { return Root->getType(); }
161
162	/// Return the incoming value to be accumulated. This maybe null.
163	Value getAccumulator() { return* Acc; }
164
165	/// Return the set of adds that comprise the reduction.
166	SetVector<Instruction> &getAdds() { return* Adds; }
167
168	/// Return the MulCandidate, rooted at mul instruction, that comprise the
169	/// the reduction.
170	MulCandList &getMuls() { return Muls; }
171
172	/// Return the MulCandidate, rooted at mul instructions, that have been
173	/// paired for parallel execution.
174	MulPairList &getMulPairs() { return MulPairs; }
175
176	/// To finalise, replace the uses of the root with the intrinsic call.
177	void UpdateRoot(Instruction *SMLAD) {
178	Root->replaceAllUsesWith(V: SMLAD);
179	}
180
181	void dump() {
182	LLVM_DEBUG(dbgs() << "Reduction:\n";
183	for (auto *Add : Adds)
184	LLVM_DEBUG(dbgs() << *Add << "\n");
185	for (auto &Mul : Muls)
186	LLVM_DEBUG(dbgs() << *Mul->Root << "\n"
187	<< " " << *Mul->LHS << "\n"
188	<< " " << *Mul->RHS << "\n");
189	LLVM_DEBUG(if (Acc) dbgs() << "Acc in: " << *Acc << "\n")
190	);
191	}
192	};
193
194	class WidenedLoad {
195	LoadInst NewLd = nullptr*;
196	SmallVector<LoadInst*, `4`> Loads;
197
198	public:
199	WidenedLoad(SmallVectorImpl<LoadInst> &Lds, LoadInst Wide)
200	: NewLd(Wide) {
201	append_range(C&: Loads, R&: Lds);
202	}
203	LoadInst *getLoad() {
204	return NewLd;
205	}
206	};
207
208	class ARMParallelDSP : public FunctionPass {
209	ScalarEvolution *SE;
210	AliasAnalysis *AA;
211	TargetLibraryInfo *TLI;
212	DominatorTree *DT;
213	const DataLayout *DL;
214	Module *M;
215	std::map<LoadInst, LoadInst> LoadPairs;
216	SmallPtrSet<LoadInst*, `4`> OffsetLoads;
217	std::map<LoadInst*, std::unique_ptr<WidenedLoad>> WideLoads;
218
219	template<unsigned>
220	bool IsNarrowSequence(Value *V);
221	bool Search(Value V, BasicBlock BB, Reduction &R);
222	bool RecordMemoryOps(BasicBlock *BB);
223	void InsertParallelMACs(Reduction &Reduction);
224	bool AreSequentialLoads(LoadInst Ld0, LoadInst Ld1, MemInstList &VecMem);
225	LoadInst* CreateWideLoad(MemInstList &Loads, IntegerType *LoadTy);
226	bool CreateParallelPairs(Reduction &R);
227
228	/// Try to match and generate: SMLAD, SMLADX - Signed Multiply Accumulate
229	/// Dual performs two signed 16x16-bit multiplications. It adds the
230	/// products to a 32-bit accumulate operand. Optionally, the instruction can
231	/// exchange the halfwords of the second operand before performing the
232	/// arithmetic.
233	bool MatchSMLAD(Function &F);
234
235	public:
236	static char ID;
237
238	ARMParallelDSP() : FunctionPass (ID) { }
239
240	void getAnalysisUsage(AnalysisUsage &AU) const override {
241	FunctionPass::getAnalysisUsage(AU);
242	AU.addRequired<AssumptionCacheTracker>();
243	AU.addRequired<ScalarEvolutionWrapperPass>();
244	AU.addRequired<AAResultsWrapperPass>();
245	AU.addRequired<TargetLibraryInfoWrapperPass>();
246	AU.addRequired<DominatorTreeWrapperPass>();
247	AU.addRequired<TargetPassConfig>();
248	AU.addPreserved<ScalarEvolutionWrapperPass>();
249	AU.addPreserved<GlobalsAAWrapperPass>();
250	AU.setPreservesCFG();
251	}
252
253	bool runOnFunction(Function &F) override {
254	if (DisableParallelDSP)
255	return false;
256	if (skipFunction(F))
257	return false;
258
259	SE = &getAnalysis<ScalarEvolutionWrapperPass>().getSE();
260	AA = &getAnalysis<AAResultsWrapperPass>().getAAResults();
261	TLI = &getAnalysis<TargetLibraryInfoWrapperPass>().getTLI(F);
262	DT = &getAnalysis<DominatorTreeWrapperPass>().getDomTree();
263	auto &TPC = getAnalysis<TargetPassConfig>();
264
265	M = F.getParent();
266	DL = &M->getDataLayout();
267
268	auto &TM = TPC.getTM<TargetMachine>();
269	auto *ST = &TM.getSubtarget<ARMSubtarget>(F);
270
271	if (!ST->allowsUnalignedMem()) {
272	LLVM_DEBUG(dbgs() << "Unaligned memory access not supported: not "
273	"running pass ARMParallelDSP\n");
274	return false;
275	}
276
277	if (!ST->hasDSP()) {
278	LLVM_DEBUG(dbgs() << "DSP extension not enabled: not running pass "
279	"ARMParallelDSP\n");
280	return false;
281	}
282
283	if (!ST->isLittle()) {
284	LLVM_DEBUG(dbgs() << "Only supporting little endian: not running pass "
285	<< "ARMParallelDSP\n");
286	return false;
287	}
288
289	LLVM_DEBUG(dbgs() << "\n== Parallel DSP pass ==\n");
290	LLVM_DEBUG(dbgs() << " - " << F.getName() << "\n\n");
291
292	bool Changes = MatchSMLAD(F);
293	return Changes;
294	}
295	};
296	}
297
298	bool ARMParallelDSP::AreSequentialLoads(LoadInst Ld0, LoadInst Ld1,
299	MemInstList &VecMem) {
300	if (!Ld0 \|\| !Ld1)
301	return false;
302
303	auto It = LoadPairs.find(x: Ld0);
304	if (It == LoadPairs.end() \|\| It ->second != Ld1)
305	return false;
306
307	LLVM_DEBUG(dbgs() << "Loads are sequential and valid:\n";
308	dbgs() << "Ld0:"; Ld0->dump();
309	dbgs() << "Ld1:"; Ld1->dump();
310	);
311
312	VecMem.clear();
313	VecMem.push_back(Elt: Ld0);
314	VecMem.push_back(Elt: Ld1);
315	return true;
316	}
317
318	// MaxBitwidth: the maximum supported bitwidth of the elements in the DSP
319	// instructions, which is set to 16. So here we should collect all i8 and i16
320	// narrow operations.
321	// TODO: we currently only collect i16, and will support i8 later, so that's
322	// why we check that types are equal to MaxBitWidth, and not <= MaxBitWidth.
323	template<unsigned MaxBitWidth>
324	bool ARMParallelDSP::IsNarrowSequence(Value *V) {
325	if (auto *SExt = dyn_cast<SExtInst>(Val: V)) {
326	if (SExt->getSrcTy()->getIntegerBitWidth() != MaxBitWidth)
327	return false;
328
329	if (auto *Ld = dyn_cast<LoadInst>(Val: SExt->getOperand(i_nocapture: `0`))) {
330	// Check that this load could be paired.
331	return LoadPairs.count(x: Ld) \|\| OffsetLoads.count(Ptr: Ld);
332	}
333	}
334	return false;
335	}
336
337	/// Iterate through the block and record base, offset pairs of loads which can
338	/// be widened into a single load.
339	bool ARMParallelDSP::RecordMemoryOps(BasicBlock *BB) {
340	SmallVector<LoadInst*, `8`> Loads;
341	SmallVector<Instruction*, `8`> Writes;
342	LoadPairs.clear();
343	WideLoads.clear();
344
345	// Collect loads and instruction that may write to memory. For now we only
346	// record loads which are simple, sign-extended and have a single user.
347	// TODO: Allow zero-extended loads.
348	for (auto &I : *BB) {
349	if (I.mayWriteToMemory())
350	Writes.push_back(Elt: &I);
351	auto *Ld = dyn_cast<LoadInst>(Val: &I);
352	if (!Ld \|\| !Ld->isSimple() \|\|
353	!Ld->hasOneUse() \|\| !isa<SExtInst>(Val: Ld->user_back()))
354	continue;
355	Loads.push_back(Elt: Ld);
356	}
357
358	if (Loads.empty() \|\| Loads.size() > NumLoadLimit)
359	return false;
360
361	using InstSet = std::set<Instruction*>;
362	using DepMap = std::map<Instruction*, InstSet>;
363	DepMap RAWDeps;
364
365	// Record any writes that may alias a load.
366	const auto Size = LocationSize::beforeOrAfterPointer();
367	for (auto *Write : Writes) {
368	for (auto *Read : Loads) {
369	MemoryLocation ReadLoc =
370	MemoryLocation (Read->getPointerOperand(), Size);
371
372	if (!isModOrRefSet(MRI: AA->getModRefInfo(I: Write, OptLoc: ReadLoc)))
373	continue;
374	if (Write->comesBefore(Other: Read))
375	RAWDeps [Read].insert(x: Write);
376	}
377	}
378
379	// Check whether there's not a write between the two loads which would
380	// prevent them from being safely merged.
381	auto SafeToPair = [&](LoadInst Base, LoadInst Offset) {
382	bool BaseFirst = Base->comesBefore(Other: Offset);
383	LoadInst *Dominator = BaseFirst ? Base : Offset;
384	LoadInst *Dominated = BaseFirst ? Offset : Base;
385
386	if (auto It = RAWDeps.find(x: Dominated); It != RAWDeps.end()) {
387	InstSet &WritesBefore = It ->second;
388
389	for (auto *Before : WritesBefore) {
390	// We can't move the second load backward, past a write, to merge
391	// with the first load.
392	if (Dominator->comesBefore(Other: Before))
393	return false;
394	}
395	}
396	return true;
397	};
398
399	// Record base, offset load pairs.
400	for (auto *Base : Loads) {
401	for (auto *Offset : Loads) {
402	if (Base == Offset \|\| OffsetLoads.count(Ptr: Offset))
403	continue;
404
405	if (isConsecutiveAccess(A: Base, B: Offset, DL: DL, SE&: SE) &&
406	SafeToPair (Base, Offset)) {
407	LoadPairs [Base] = Offset;
408	OffsetLoads.insert(Ptr: Offset);
409	break;
410	}
411	}
412	}
413
414	LLVM_DEBUG(if (!LoadPairs.empty()) {
415	dbgs() << "Consecutive load pairs:\n";
416	for (auto &MapIt : LoadPairs) {
417	LLVM_DEBUG(dbgs() << *MapIt.first << ", "
418	<< *MapIt.second << "\n");
419	}
420	});
421	return LoadPairs.size() > `1`;
422	}
423
424	// Search recursively back through the operands to find a tree of values that
425	// form a multiply-accumulate chain. The search records the Add and Mul
426	// instructions that form the reduction and allows us to find a single value
427	// to be used as the initial input to the accumlator.
428	bool ARMParallelDSP::Search(Value V, BasicBlock BB, Reduction &R) {
429	// If we find a non-instruction, try to use it as the initial accumulator
430	// value. This may have already been found during the search in which case
431	// this function will return false, signaling a search fail.
432	auto *I = dyn_cast<Instruction>(Val: V);
433	if (!I)
434	return R.InsertAcc(V);
435
436	if (I->getParent() != BB)
437	return false;
438
439	switch (I->getOpcode()) {
440	default:
441	break;
442	case Instruction::PHI:
443	// Could be the accumulator value.
444	return R.InsertAcc(V);
445	case Instruction::Add: {
446	// Adds should be adding together two muls, or another add and a mul to
447	// be within the mac chain. One of the operands may also be the
448	// accumulator value at which point we should stop searching.
449	R.InsertAdd(I);
450	Value *LHS = I->getOperand(i: `0`);
451	Value *RHS = I->getOperand(i: `1`);
452	bool ValidLHS = Search(V: LHS, BB, R);
453	bool ValidRHS = Search(V: RHS, BB, R);
454
455	if (ValidLHS && ValidRHS)
456	return true;
457
458	// Ensure we don't add the root as the incoming accumulator.
459	if (R.getRoot() == I)
460	return false;
461
462	return R.InsertAcc(V: I);
463	}
464	case Instruction::Mul: {
465	Value *MulOp0 = I->getOperand(i: `0`);
466	Value *MulOp1 = I->getOperand(i: `1`);
467	return IsNarrowSequence<`16`>(V: MulOp0) && IsNarrowSequence<`16`>(V: MulOp1);
468	}
469	case Instruction::SExt:
470	return Search(V: I->getOperand(i: `0`), BB, R);
471	}
472	return false;
473	}
474
475	// The pass needs to identify integer add/sub reductions of 16-bit vector
476	// multiplications.
477	// To use SMLAD:
478	// 1) we first need to find integer add then look for this pattern:
479	//
480	// acc0 = ...
481	// ld0 = load i16
482	// sext0 = sext i16 %ld0 to i32
483	// ld1 = load i16
484	// sext1 = sext i16 %ld1 to i32
485	// mul0 = mul %sext0, %sext1
486	// ld2 = load i16
487	// sext2 = sext i16 %ld2 to i32
488	// ld3 = load i16
489	// sext3 = sext i16 %ld3 to i32
490	// mul1 = mul i32 %sext2, %sext3
491	// add0 = add i32 %mul0, %acc0
492	// acc1 = add i32 %add0, %mul1
493	//
494	// Which can be selected to:
495	//
496	// ldr r0
497	// ldr r1
498	// smlad r2, r0, r1, r2
499	//
500	// If constants are used instead of loads, these will need to be hoisted
501	// out and into a register.
502	//
503	// If loop invariants are used instead of loads, these need to be packed
504	// before the loop begins.
505	//
506	bool ARMParallelDSP::MatchSMLAD(Function &F) {
507	bool Changed = false;
508
509	for (auto &BB : F) {
510	SmallPtrSet<Instruction*, `4`> AllAdds;
511	if (!RecordMemoryOps(BB: &BB))
512	continue;
513
514	for (Instruction &I : reverse(C&: BB)) {
515	if (I.getOpcode() != Instruction::Add)
516	continue;
517
518	if (AllAdds.count(Ptr: &I))
519	continue;
520
521	const auto *Ty = I.getType();
522	if (!Ty->isIntegerTy(Bitwidth: `32`) && !Ty->isIntegerTy(Bitwidth: `64`))
523	continue;
524
525	Reduction R(&I);
526	if (!Search(V: &I, BB: &BB, R))
527	continue;
528
529	R.InsertMuls();
530	LLVM_DEBUG(dbgs() << "After search, Reduction:\n"; R.dump());
531
532	if (!CreateParallelPairs(R))
533	continue;
534
535	InsertParallelMACs(Reduction&: R);
536	Changed = true;
537	AllAdds.insert_range(R&: R.getAdds());
538	LLVM_DEBUG(dbgs() << "BB after inserting parallel MACs:\n" << BB);
539	}
540	}
541
542	return Changed;
543	}
544
545	bool ARMParallelDSP::CreateParallelPairs(Reduction &R) {
546
547	// Not enough mul operations to make a pair.
548	if (R.getMuls().size() < `2`)
549	return false;
550
551	// Check that the muls operate directly upon sign extended loads.
552	for (auto &MulCand : R.getMuls()) {
553	if (!MulCand ->HasTwoLoadInputs())
554	return false;
555	}
556
557	auto CanPair = [&](Reduction &R, MulCandidate PMul0, MulCandidate PMul1) {
558	// The first elements of each vector should be loads with sexts. If we
559	// find that its two pairs of consecutive loads, then these can be
560	// transformed into two wider loads and the users can be replaced with
561	// DSP intrinsics.
562	auto Ld0 = static_cast<LoadInst*>(PMul0->LHS);
563	auto Ld1 = static_cast<LoadInst*>(PMul1->LHS);
564	auto Ld2 = static_cast<LoadInst*>(PMul0->RHS);
565	auto Ld3 = static_cast<LoadInst*>(PMul1->RHS);
566
567	// Check that each mul is operating on two different loads.
568	if (Ld0 == Ld2 \|\| Ld1 == Ld3)
569	return false;
570
571	if (AreSequentialLoads(Ld0, Ld1, VecMem&: PMul0->VecLd)) {
572	if (AreSequentialLoads(Ld0: Ld2, Ld1: Ld3, VecMem&: PMul1->VecLd)) {
573	LLVM_DEBUG(dbgs() << "OK: found two pairs of parallel loads!\n");
574	R.AddMulPair(Mul0: PMul0, Mul1: PMul1);
575	return true;
576	} else if (AreSequentialLoads(Ld0: Ld3, Ld1: Ld2, VecMem&: PMul1->VecLd)) {
577	LLVM_DEBUG(dbgs() << "OK: found two pairs of parallel loads!\n");
578	LLVM_DEBUG(dbgs() << " exchanging Ld2 and Ld3\n");
579	R.AddMulPair(Mul0: PMul0, Mul1: PMul1, Exchange: true);
580	return true;
581	}
582	} else if (AreSequentialLoads(Ld0: Ld1, Ld1: Ld0, VecMem&: PMul0->VecLd) &&
583	AreSequentialLoads(Ld0: Ld2, Ld1: Ld3, VecMem&: PMul1->VecLd)) {
584	LLVM_DEBUG(dbgs() << "OK: found two pairs of parallel loads!\n");
585	LLVM_DEBUG(dbgs() << " exchanging Ld0 and Ld1\n");
586	LLVM_DEBUG(dbgs() << " and swapping muls\n");
587	// Only the second operand can be exchanged, so swap the muls.
588	R.AddMulPair(Mul0: PMul1, Mul1: PMul0, Exchange: true);
589	return true;
590	}
591	return false;
592	};
593
594	MulCandList &Muls = R.getMuls();
595	const unsigned Elems = Muls.size();
596	for (unsigned i = `0`; i < Elems; ++i) {
597	MulCandidate PMul0 = static_cast<MulCandidate>(Muls [i].get());
598	if (PMul0->Paired)
599	continue;
600
601	for (unsigned j = `0`; j < Elems; ++j) {
602	if (i == j)
603	continue;
604
605	MulCandidate PMul1 = static_cast<MulCandidate>(Muls [j].get());
606	if (PMul1->Paired)
607	continue;
608
609	const Instruction *Mul0 = PMul0->Root;
610	const Instruction *Mul1 = PMul1->Root;
611	if (Mul0 == Mul1)
612	continue;
613
614	assert(PMul0 != PMul1 && "expected different chains");
615
616	if (CanPair (R, PMul0, PMul1))
617	break;
618	}
619	}
620	return !R.getMulPairs().empty();
621	}
622
623	void ARMParallelDSP::InsertParallelMACs(Reduction &R) {
624
625	auto CreateSMLAD = [&](LoadInst* WideLd0, LoadInst *WideLd1,
626	Value Acc, bool* Exchange,
627	Instruction *InsertAfter) {
628	// Replace the reduction chain with an intrinsic call
629
630	Value* Args[] = { WideLd0, WideLd1, Acc };
631	Function SMLAD = nullptr*;
632	if (Exchange)
633	SMLAD =
634	Acc->getType()->isIntegerTy(Bitwidth: `32`)
635	? Intrinsic::getOrInsertDeclaration(M, id: Intrinsic::arm_smladx)
636	: Intrinsic::getOrInsertDeclaration(M, id: Intrinsic::arm_smlaldx);
637	else
638	SMLAD = Acc->getType()->isIntegerTy(Bitwidth: `32`)
639	? Intrinsic::getOrInsertDeclaration(M, id: Intrinsic::arm_smlad)
640	: Intrinsic::getOrInsertDeclaration(M, id: Intrinsic::arm_smlald);
641
642	IRBuilder<NoFolder> Builder(InsertAfter->getParent(),
643	BasicBlock::iterator (InsertAfter));
644	Instruction *Call = Builder.CreateCall(Callee: SMLAD, Args);
645	NumSMLAD ++;
646	return Call;
647	};
648
649	// Return the instruction after the dominated instruction.
650	auto GetInsertPoint = [this](Value A, Value B) {
651	assert((isa<Instruction>(A) \|\| isa<Instruction>(B)) &&
652	"expected at least one instruction");
653
654	Value V = nullptr*;
655	if (!isa<Instruction>(Val: A))
656	V = B;
657	else if (!isa<Instruction>(Val: B))
658	V = A;
659	else
660	V = DT->dominates(Def: cast<Instruction>(Val: A), User: cast<Instruction>(Val: B)) ? B : A;
661
662	return &*++BasicBlock::iterator (cast<Instruction>(Val: V));
663	};
664
665	Value *Acc = R.getAccumulator();
666
667	// For any muls that were discovered but not paired, accumulate their values
668	// as before.
669	IRBuilder<NoFolder> Builder(R.getRoot()->getParent());
670	MulCandList &MulCands = R.getMuls();
671	for (auto &MulCand : MulCands) {
672	if (MulCand ->Paired)
673	continue;
674
675	Instruction *Mul = cast<Instruction>(Val: MulCand ->Root);
676	LLVM_DEBUG(dbgs() << "Accumulating unpaired mul: " << *Mul << "\n");
677
678	if (R.getType() != Mul->getType()) {
679	assert(R.is64Bit() && "expected 64-bit result");
680	Builder.SetInsertPoint(&*++BasicBlock::iterator (Mul));
681	Mul = cast<Instruction>(Val: Builder.CreateSExt(V: Mul, DestTy: R.getRoot()->getType()));
682	}
683
684	if (!Acc) {
685	Acc = Mul;
686	continue;
687	}
688
689	// If Acc is the original incoming value to the reduction, it could be a
690	// phi. But the phi will dominate Mul, meaning that Mul will be the
691	// insertion point.
692	Builder.SetInsertPoint(GetInsertPoint (Mul, Acc));
693	Acc = Builder.CreateAdd(LHS: Mul, RHS: Acc);
694	}
695
696	if (!Acc) {
697	Acc = R.is64Bit() ?
698	ConstantInt::get(Ty: IntegerType::get(C&: M->getContext(), NumBits: `64`), V: `0`) :
699	ConstantInt::get(Ty: IntegerType::get(C&: M->getContext(), NumBits: `32`), V: `0`);
700	} else if (Acc->getType() != R.getType()) {
701	Builder.SetInsertPoint(R.getRoot());
702	Acc = Builder.CreateSExt(V: Acc, DestTy: R.getType());
703	}
704
705	// Roughly sort the mul pairs in their program order.
706	llvm::sort(C&: R.getMulPairs(), Comp: [](auto &PairA, auto &PairB) {
707	const Instruction *A = PairA.first->Root;
708	const Instruction *B = PairB.first->Root;
709	return A->comesBefore(Other: B);
710	});
711
712	IntegerType *Ty = IntegerType::get(C&: M->getContext(), NumBits: `32`);
713	for (auto &Pair : R.getMulPairs()) {
714	MulCandidate *LHSMul = Pair.first;
715	MulCandidate *RHSMul = Pair.second;
716	LoadInst *BaseLHS = LHSMul->getBaseLoad();
717	LoadInst *BaseRHS = RHSMul->getBaseLoad();
718	auto LIt = WideLoads.find(x: BaseLHS);
719	LoadInst *WideLHS = LIt != WideLoads.end()
720	? LIt ->second ->getLoad()
721	: CreateWideLoad(Loads&: LHSMul->VecLd, LoadTy: Ty);
722	auto RIt = WideLoads.find(x: BaseRHS);
723	LoadInst *WideRHS = RIt != WideLoads.end()
724	? RIt ->second ->getLoad()
725	: CreateWideLoad(Loads&: RHSMul->VecLd, LoadTy: Ty);
726
727	Instruction *InsertAfter = GetInsertPoint (WideLHS, WideRHS);
728	InsertAfter = GetInsertPoint (InsertAfter, Acc);
729	Acc = CreateSMLAD (WideLHS, WideRHS, Acc, RHSMul->Exchange, InsertAfter);
730	}
731	R.UpdateRoot(SMLAD: cast<Instruction>(Val: Acc));
732	}
733
734	LoadInst* ARMParallelDSP::CreateWideLoad(MemInstList &Loads,
735	IntegerType *LoadTy) {
736	assert(Loads.size() == `2` && "currently only support widening two loads");
737
738	LoadInst *Base = Loads [`0`];
739	LoadInst *Offset = Loads [`1`];
740
741	Instruction *BaseSExt = dyn_cast<SExtInst>(Val: Base->user_back());
742	Instruction *OffsetSExt = dyn_cast<SExtInst>(Val: Offset->user_back());
743
744	assert((BaseSExt && OffsetSExt)
745	&& "Loads should have a single, extending, user");
746
747	std::function<void(Value, Value)> MoveBefore =
748	[&](Value A, Value B) -> void {
749	if (!isa<Instruction>(Val: A) \|\| !isa<Instruction>(Val: B))
750	return;
751
752	auto *Source = cast<Instruction>(Val: A);
753	auto *Sink = cast<Instruction>(Val: B);
754
755	if (DT->dominates(Def: Source, User: Sink) \|\|
756	Source->getParent() != Sink->getParent() \|\|
757	isa<PHINode>(Val: Source) \|\| isa<PHINode>(Val: Sink))
758	return;
759
760	Source->moveBefore(InsertPos: Sink->getIterator());
761	for (auto &Op : Source->operands())
762	MoveBefore (Op, Source);
763	};
764
765	// Insert the load at the point of the original dominating load.
766	LoadInst *DomLoad = DT->dominates(Def: Base, User: Offset) ? Base : Offset;
767	IRBuilder<NoFolder> IRB(DomLoad->getParent(),
768	++BasicBlock::iterator (DomLoad));
769
770	// Create the wide load, while making sure to maintain the original alignment
771	// as this prevents ldrd from being generated when it could be illegal due to
772	// memory alignment.
773	Value *VecPtr = Base->getPointerOperand();
774	LoadInst *WideLoad = IRB.CreateAlignedLoad(Ty: LoadTy, Ptr: VecPtr, Align: Base->getAlign());
775
776	// Make sure everything is in the correct order in the basic block.
777	MoveBefore (Base->getPointerOperand(), VecPtr);
778	MoveBefore (VecPtr, WideLoad);
779
780	// From the wide load, create two values that equal the original two loads.
781	// Loads[0] needs trunc while Loads[1] needs a lshr and trunc.
782	// TODO: Support big-endian as well.
783	Value *Bottom = IRB.CreateTrunc(V: WideLoad, DestTy: Base->getType());
784	Value *NewBaseSExt = IRB.CreateSExt(V: Bottom, DestTy: BaseSExt->getType());
785	BaseSExt->replaceAllUsesWith(V: NewBaseSExt);
786
787	IntegerType *OffsetTy = cast<IntegerType>(Val: Offset->getType());
788	Value *ShiftVal = ConstantInt::get(Ty: LoadTy, V: OffsetTy->getBitWidth());
789	Value *Top = IRB.CreateLShr(LHS: WideLoad, RHS: ShiftVal);
790	Value *Trunc = IRB.CreateTrunc(V: Top, DestTy: OffsetTy);
791	Value *NewOffsetSExt = IRB.CreateSExt(V: Trunc, DestTy: OffsetSExt->getType());
792	OffsetSExt->replaceAllUsesWith(V: NewOffsetSExt);
793
794	LLVM_DEBUG(dbgs() << "From Base and Offset:\n"
795	<< Base << "\n" << Offset << "\n"
796	<< "Created Wide Load:\n"
797	<< *WideLoad << "\n"
798	<< *Bottom << "\n"
799	<< *NewBaseSExt << "\n"
800	<< *Top << "\n"
801	<< *Trunc << "\n"
802	<< *NewOffsetSExt << "\n");
803	WideLoads.emplace(args: std::make_pair(x&: Base,
804	y: std::make_unique<WidenedLoad>(args&: Loads, args&: WideLoad)));
805	return WideLoad;
806	}
807
808	Pass *llvm::createARMParallelDSPPass() {
809	return new ARMParallelDSP ();
810	}
811
812	char ARMParallelDSP::ID = `0`;
813
814	INITIALIZE_PASS_BEGIN(ARMParallelDSP, "arm-parallel-dsp",
815	"Transform functions to use DSP intrinsics", false, false)
816	INITIALIZE_PASS_END(ARMParallelDSP, "arm-parallel-dsp",
817	"Transform functions to use DSP intrinsics", false, false)
818

Browse the source code of llvm_projects/llvm/lib/Target/ARM/ARMParallelDSP.cpp