//===- VPlanSLP.cpp - SLP Analysis based on VPlan -------------------------===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
/// This file implements SLP analysis based on VPlan. The analysis is based on
/// the ideas described in
///
///   Look-ahead SLP: auto-vectorization in the presence of commutative
///   operations, CGO 2018 by Vasileios Porpodas, Rodrigo C. O. Rocha,
///   Luís F. W. Góes
///
//===----------------------------------------------------------------------===//

#include "VPlanSLP.h"
#include "VPlan.h"
#include "VPlanCFG.h"
#include "VPlanValue.h"
#include "llvm/ADT/DenseMap.h"
#include "llvm/ADT/SmallVector.h"
#include "llvm/Analysis/LoopInfo.h"
#include "llvm/Analysis/VectorUtils.h"
#include "llvm/IR/Instruction.h"
#include "llvm/IR/Instructions.h"
#include "llvm/IR/Type.h"
#include "llvm/IR/Value.h"
#include "llvm/Support/Casting.h"
#include "llvm/Support/Debug.h"
#include "llvm/Support/ErrorHandling.h"
#include "llvm/Support/raw_ostream.h"
#include <algorithm>
#include <cassert>
#include <optional>
#include <utility>

using namespace llvm;

#define DEBUG_TYPE "vplan-slp"

// Number of levels to look ahead when re-ordering multi node operands.
static unsigned LookaheadMaxDepth = 5;

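/// Visit all blocks in \p Region in reverse post-order, translating the
/// interleave groups of their instructions.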
void VPInterleavedAccessInfo::visitRegion(VPRegionBlock *Region,
                                          Old2NewTy &Old2New,
                                          InterleavedAccessInfo &IAI) {
  ReversePostOrderTraversal<VPBlockShallowTraversalWrapper<VPBlockBase *>> RPOT(
      Region->getEntry());
  for (VPBlockBase *Base : RPOT) {
    visitBlock(Base, Old2New, IAI);
  }
}

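/// Translate the interleave groups of the VPInstructions in \p Block from the
/// IR-based \p IAI, recording the VPInstruction-based groups in
/// InterleaveGroupMap. Nested regions are visited recursively.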
void VPInterleavedAccessInfo::visitBlock(VPBlockBase *Block, Old2NewTy &Old2New,
                                         InterleavedAccessInfo &IAI) {
  if (VPBasicBlock *VPBB = dyn_cast<VPBasicBlock>(Block)) {
    for (VPRecipeBase &VPI : *VPBB) {
      if (isa<VPWidenPHIRecipe>(&VPI))
        continue;
      auto *VPInst = dyn_cast<VPInstruction>(&VPI);
      if (!VPInst)
        continue;
      auto *Inst = dyn_cast_or_null<Instruction>(VPInst->getUnderlyingValue());
      if (!Inst)
        continue;
      auto *IG = IAI.getInterleaveGroup(Inst);
      if (!IG)
        continue;

      auto NewIGIter = Old2New.find(IG);
      if (NewIGIter == Old2New.end())
        Old2New[IG] = new InterleaveGroup<VPInstruction>(
            IG->getFactor(), IG->isReverse(), IG->getAlign());

      if (Inst == IG->getInsertPos())
        Old2New[IG]->setInsertPos(VPInst);

      InterleaveGroupMap[VPInst] = Old2New[IG];
      InterleaveGroupMap[VPInst]->insertMember(
          VPInst, IG->getIndex(Inst),
          Align(IG->isReverse() ? (-1) * int(IG->getFactor())
                                : IG->getFactor()));
    }
  } else if (VPRegionBlock *Region = dyn_cast<VPRegionBlock>(Block)) {
    visitRegion(Region, Old2New, IAI);
  } else {
    llvm_unreachable("Unsupported kind of VPBlock.");
  }
}

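/// Build interleaved access information for the VPInstructions in \p Plan,
/// based on the IR-based \p IAI, by visiting the plan's vector loop region.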
VPInterleavedAccessInfo::VPInterleavedAccessInfo(VPlan &Plan,
                                                 InterleavedAccessInfo &IAI) {
  Old2NewTy Old2New;
  visitRegion(Plan.getVectorLoopRegion(), Old2New, IAI);
}

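/// Mark the SLP graph as not completely vectorizable and return nullptr,
/// which callers use in place of a combined VPInstruction.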
VPInstruction *VPlanSlp::markFailed() {
  // FIXME: Currently this is used to signal we hit instructions we cannot
  // trivially SLP'ize.
  CompletelySLP = false;
  return nullptr;
}

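/// Record \p New as the combined VPInstruction for the bundle \p Operands
/// and, if all operands have underlying instructions, update the widest
/// bundle width seen so far.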
void VPlanSlp::addCombined(ArrayRef<VPValue *> Operands, VPInstruction *New) {
  if (all_of(Operands, [](VPValue *V) {
        return cast<VPInstruction>(V)->getUnderlyingInstr();
      })) {
    unsigned BundleSize = 0;
    for (VPValue *V : Operands) {
      Type *T = cast<VPInstruction>(V)->getUnderlyingInstr()->getType();
      assert(!T->isVectorTy() && "Only scalar types supported for now");
      BundleSize += T->getScalarSizeInBits();
    }
    WidestBundleBits = std::max(WidestBundleBits, BundleSize);
  }

  auto Res = BundleToCombined.try_emplace(to_vector<4>(Operands), New);
  assert(Res.second &&
         "Already created a combined instruction for the operand bundle");
  (void)Res;
}

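/// Return true if the bundle \p Operands can be combined into a single wide
/// instruction: all operands must be VPInstructions with matching opcode and
/// type width, defined in the same basic block, each with a single unique
/// user; loads and stores must be simple, and loads must not be separated by
/// instructions writing to memory.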
bool VPlanSlp::areVectorizable(ArrayRef<VPValue *> Operands) const {
  // Currently we only support VPInstructions.
  if (!all_of(Operands, [](VPValue *Op) {
        return Op && isa<VPInstruction>(Op) &&
               cast<VPInstruction>(Op)->getUnderlyingInstr();
      })) {
    LLVM_DEBUG(dbgs() << "VPSLP: not all operands are VPInstructions\n");
    return false;
  }

  // Check if opcodes and type width agree for all instructions in the bundle.
  // FIXME: Differing widths/opcodes can be handled by inserting additional
  // instructions.
  // FIXME: Deal with non-primitive types.
  const Instruction *OriginalInstr =
      cast<VPInstruction>(Operands[0])->getUnderlyingInstr();
  unsigned Opcode = OriginalInstr->getOpcode();
  unsigned Width = OriginalInstr->getType()->getPrimitiveSizeInBits();
  if (!all_of(Operands, [Opcode, Width](VPValue *Op) {
        const Instruction *I = cast<VPInstruction>(Op)->getUnderlyingInstr();
        return I->getOpcode() == Opcode &&
               I->getType()->getPrimitiveSizeInBits() == Width;
      })) {
    LLVM_DEBUG(dbgs() << "VPSLP: opcodes or widths do not agree\n");
    return false;
  }

  // For now, all operands must be defined in the same BB.
  if (any_of(Operands, [this](VPValue *Op) {
        return cast<VPInstruction>(Op)->getParent() != &this->BB;
      })) {
    LLVM_DEBUG(dbgs() << "VPSLP: operands in different BBs\n");
    return false;
  }

  if (any_of(Operands,
             [](VPValue *Op) { return Op->hasMoreThanOneUniqueUser(); })) {
    LLVM_DEBUG(dbgs() << "VPSLP: some operands have multiple users\n");
    return false;
  }

  // For loads, check that there are no instructions writing to memory in
  // between them.
  // TODO: We only have to forbid instructions writing to memory that could
  //       interfere with any of the loads in the bundle.
  if (Opcode == Instruction::Load) {
    unsigned LoadsSeen = 0;
    VPBasicBlock *Parent = cast<VPInstruction>(Operands[0])->getParent();
    for (auto &I : *Parent) {
      auto *VPI = dyn_cast<VPInstruction>(&I);
      if (!VPI)
        break;
      if (VPI->getOpcode() == Instruction::Load &&
          llvm::is_contained(Operands, VPI))
        LoadsSeen++;

      if (LoadsSeen == Operands.size())
        break;
      if (LoadsSeen > 0 && VPI->mayWriteToMemory()) {
        LLVM_DEBUG(
            dbgs() << "VPSLP: instruction modifying memory between loads\n");
        return false;
      }
    }

    if (!all_of(Operands, [](VPValue *Op) {
          return cast<LoadInst>(cast<VPInstruction>(Op)->getUnderlyingInstr())
              ->isSimple();
        })) {
      LLVM_DEBUG(dbgs() << "VPSLP: only simple loads are supported\n");
      return false;
    }
  }

  if (Opcode == Instruction::Store)
    if (!all_of(Operands, [](VPValue *Op) {
          return cast<StoreInst>(cast<VPInstruction>(Op)->getUnderlyingInstr())
              ->isSimple();
        })) {
      LLVM_DEBUG(dbgs() << "VPSLP: only simple stores are supported\n");
      return false;
    }

  return true;
}

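/// Returns the bundle of the \p OperandIndex'th operands of \p Values.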
static SmallVector<VPValue *, 4> getOperands(ArrayRef<VPValue *> Values,
                                             unsigned OperandIndex) {
  SmallVector<VPValue *, 4> Operands;
  for (VPValue *V : Values) {
    // Currently we only support VPInstructions.
    auto *U = cast<VPInstruction>(V);
    Operands.push_back(U->getOperand(OperandIndex));
  }
  return Operands;
}

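/// Returns true if the (matching) opcode of \p Values is commutative. It is
/// enough to inspect the first value, as areVectorizable has already checked
/// that all opcodes in a bundle agree.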
static bool areCommutative(ArrayRef<VPValue *> Values) {
  return Instruction::isCommutative(
      cast<VPInstruction>(Values[0])->getOpcode());
}

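/// Returns the operand bundles of \p Values, one bundle per operand index.
/// Stores only contribute their stored value (operand 0); loads terminate a
/// tree and must not reach here.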
static SmallVector<SmallVector<VPValue *, 4>, 4>
getOperands(ArrayRef<VPValue *> Values) {
  SmallVector<SmallVector<VPValue *, 4>, 4> Result;
  auto *VPI = cast<VPInstruction>(Values[0]);

  switch (VPI->getOpcode()) {
  case Instruction::Load:
    llvm_unreachable("Loads terminate a tree, no need to get operands");
  case Instruction::Store:
    Result.push_back(getOperands(Values, 0));
    break;
  default:
    for (unsigned I = 0, NumOps = VPI->getNumOperands(); I < NumOps; ++I)
      Result.push_back(getOperands(Values, I));
    break;
  }

  return Result;
}

/// Returns the common opcode of \p Values, or std::nullopt if they do not all
/// agree.
static std::optional<unsigned> getOpcode(ArrayRef<VPValue *> Values) {
  unsigned Opcode = cast<VPInstruction>(Values[0])->getOpcode();
  if (any_of(Values, [Opcode](VPValue *V) {
        return cast<VPInstruction>(V)->getOpcode() != Opcode;
      }))
    return std::nullopt;
  return {Opcode};
}

/// Returns true if \p A and \p B access consecutive memory if they are loads
/// or stores, or if they have identical opcodes otherwise.
static bool areConsecutiveOrMatch(VPInstruction *A, VPInstruction *B,
                                  VPInterleavedAccessInfo &IAI) {
  if (A->getOpcode() != B->getOpcode())
    return false;

  if (A->getOpcode() != Instruction::Load &&
      A->getOpcode() != Instruction::Store)
    return true;
  auto *GA = IAI.getInterleaveGroup(A);
  auto *GB = IAI.getInterleaveGroup(B);

  return GA && GB && GA == GB && GA->getIndex(A) + 1 == GB->getIndex(B);
}

/// Implements getLAScore from Listing 7 in the paper.
/// Traverses and compares operands of \p V1 and \p V2 up to \p MaxLevel
/// levels deep, accumulating a score for how well the operand trees match.
static unsigned getLAScore(VPValue *V1, VPValue *V2, unsigned MaxLevel,
                           VPInterleavedAccessInfo &IAI) {
  auto *I1 = dyn_cast<VPInstruction>(V1);
  auto *I2 = dyn_cast<VPInstruction>(V2);
  // Currently we only support VPInstructions.
  if (!I1 || !I2)
    return 0;

  if (MaxLevel == 0)
    return (unsigned)areConsecutiveOrMatch(I1, I2, IAI);

  unsigned Score = 0;
  for (unsigned I = 0, EV1 = I1->getNumOperands(); I < EV1; ++I)
    for (unsigned J = 0, EV2 = I2->getNumOperands(); J < EV2; ++J)
      Score +=
          getLAScore(I1->getOperand(I), I2->getOperand(J), MaxLevel - 1, IAI);
  return Score;
}

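/// Pick the candidate from \p Candidates that best extends \p Last, in the
/// sense of areConsecutiveOrMatch. If several candidates qualify, ties are
/// broken using the look-ahead score at increasing depths and the winner is
/// erased from \p Candidates. Returns the mode and the chosen value, or
/// {OpMode::Failed, nullptr} if no candidate matches.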
std::pair<VPlanSlp::OpMode, VPValue *>
VPlanSlp::getBest(OpMode Mode, VPValue *Last,
                  SmallPtrSetImpl<VPValue *> &Candidates,
                  VPInterleavedAccessInfo &IAI) {
  assert((Mode == OpMode::Load || Mode == OpMode::Opcode) &&
         "Currently we only handle load and commutative opcodes");
  LLVM_DEBUG(dbgs() << "  getBest\n");

  SmallVector<VPValue *, 4> BestCandidates;
  LLVM_DEBUG(dbgs() << "  Candidates for "
                    << *cast<VPInstruction>(Last)->getUnderlyingInstr() << " ");
  for (auto *Candidate : Candidates) {
    auto *LastI = cast<VPInstruction>(Last);
    auto *CandidateI = cast<VPInstruction>(Candidate);
    if (areConsecutiveOrMatch(LastI, CandidateI, IAI)) {
      LLVM_DEBUG(dbgs() << *cast<VPInstruction>(Candidate)->getUnderlyingInstr()
                        << " ");
      BestCandidates.push_back(Candidate);
    }
  }
  LLVM_DEBUG(dbgs() << "\n");

  if (BestCandidates.empty())
    return {OpMode::Failed, nullptr};

  if (BestCandidates.size() == 1)
    return {Mode, BestCandidates[0]};

  VPValue *Best = nullptr;
  unsigned BestScore = 0;
  for (unsigned Depth = 1; Depth < LookaheadMaxDepth; Depth++) {
    unsigned PrevScore = ~0u;
    bool AllSame = true;

    // FIXME: Avoid visiting the same operands multiple times.
    for (auto *Candidate : BestCandidates) {
      unsigned Score = getLAScore(Last, Candidate, Depth, IAI);
      if (PrevScore == ~0u)
        PrevScore = Score;
      if (PrevScore != Score)
        AllSame = false;
      PrevScore = Score;

      if (Score > BestScore) {
        BestScore = Score;
        Best = Candidate;
      }
    }
    if (!AllSame)
      break;
  }
  LLVM_DEBUG(dbgs() << "Found best "
                    << *cast<VPInstruction>(Best)->getUnderlyingInstr()
                    << "\n");
  Candidates.erase(Best);

  return {Mode, Best};
}

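/// Reorder the operand bundles of a multi node as described in the paper:
/// lane 0 of each bundle is kept in place and, for every subsequent lane, the
/// best matching value across the bundles is selected via getBest. Returns
/// the re-ordered bundles, paired with their dummy placeholder instructions.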
SmallVector<VPlanSlp::MultiNodeOpTy, 4> VPlanSlp::reorderMultiNodeOps() {
  SmallVector<MultiNodeOpTy, 4> FinalOrder;
  SmallVector<OpMode, 4> Mode;
  FinalOrder.reserve(MultiNodeOps.size());
  Mode.reserve(MultiNodeOps.size());

  LLVM_DEBUG(dbgs() << "Reordering multinode\n");

  for (auto &Operands : MultiNodeOps) {
    FinalOrder.push_back({Operands.first, {Operands.second[0]}});
    if (cast<VPInstruction>(Operands.second[0])->getOpcode() ==
        Instruction::Load)
      Mode.push_back(OpMode::Load);
    else
      Mode.push_back(OpMode::Opcode);
  }

  for (unsigned Lane = 1, E = MultiNodeOps[0].second.size(); Lane < E; ++Lane) {
    LLVM_DEBUG(dbgs() << "  Finding best value for lane " << Lane << "\n");
    SmallPtrSet<VPValue *, 4> Candidates;
    LLVM_DEBUG(dbgs() << "  Candidates ");
    for (auto Ops : MultiNodeOps) {
      LLVM_DEBUG(
          dbgs() << *cast<VPInstruction>(Ops.second[Lane])->getUnderlyingInstr()
                 << " ");
      Candidates.insert(Ops.second[Lane]);
    }
    LLVM_DEBUG(dbgs() << "\n");

    for (unsigned Op = 0, E = MultiNodeOps.size(); Op < E; ++Op) {
      LLVM_DEBUG(dbgs() << "  Checking " << Op << "\n");
      if (Mode[Op] == OpMode::Failed)
        continue;

      VPValue *Last = FinalOrder[Op].second[Lane - 1];
      std::pair<OpMode, VPValue *> Res =
          getBest(Mode[Op], Last, Candidates, IAI);
      if (Res.second)
        FinalOrder[Op].second.push_back(Res.second);
      else
        // TODO: Handle this case.
        FinalOrder[Op].second.push_back(markFailed());
    }
  }

  return FinalOrder;
}

#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
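/// Print the underlying instructions of the bundle \p Values to dbgs().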
void VPlanSlp::dumpBundle(ArrayRef<VPValue *> Values) {
  dbgs() << " Ops: ";
  for (auto *Op : Values) {
    if (auto *VPInstr = cast_or_null<VPInstruction>(Op))
      if (auto *Instr = VPInstr->getUnderlyingInstr()) {
        dbgs() << *Instr << " | ";
        continue;
      }
    dbgs() << " nullptr | ";
  }
  dbgs() << "\n";
}
#endif

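/// Recursively combine the bundle \p Values into a single, wider
/// VPInstruction, building combined instructions for its operand bundles
/// bottom-up and re-ordering the operands of commutative multi nodes using
/// the look-ahead heuristic. Returns the combined instruction, or nullptr if
/// the bundle cannot be combined. A minimal usage sketch (the analysis is
/// normally driven externally, e.g. from the VPlan unit tests; the names
/// below are placeholders):
///
///   VPlanSlp Slp(VPIAI, *VPBB);
///   SmallVector<VPValue *, 4> StoreRoot = {Store1, Store2};
///   VPInstruction *Root = Slp.buildGraph(StoreRoot);
///   if (Root && Slp.isCompletelySLP())
///     ...  // Widen, e.g. guided by Slp.getWidestBundleBits().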
VPInstruction *VPlanSlp::buildGraph(ArrayRef<VPValue *> Values) {
  assert(!Values.empty() && "Need some operands!");

  // If we already visited this instruction bundle, re-use the existing node.
  auto I = BundleToCombined.find(to_vector<4>(Values));
  if (I != BundleToCombined.end()) {
#ifndef NDEBUG
    // Check that the resulting graph is a tree. If we re-use a node, this
    // means its values have multiple users. We only allow this if all users
    // of each value are the same instruction.
    for (auto *V : Values) {
      auto UI = V->user_begin();
      auto *FirstUser = *UI++;
      while (UI != V->user_end()) {
        assert(*UI == FirstUser && "Currently we only support SLP trees.");
        UI++;
      }
    }
#endif
    return I->second;
  }

  // Dump inputs.
  LLVM_DEBUG({
    dbgs() << "buildGraph: ";
    dumpBundle(Values);
  });

  if (!areVectorizable(Values))
    return markFailed();

  assert(getOpcode(Values) && "Opcodes for all values must match");
  unsigned ValuesOpcode = *getOpcode(Values);

  SmallVector<VPValue *, 4> CombinedOperands;
  if (areCommutative(Values)) {
    bool MultiNodeRoot = !MultiNodeActive;
    MultiNodeActive = true;
    for (auto &Operands : getOperands(Values)) {
      LLVM_DEBUG({
        dbgs() << "  Visiting Commutative";
        dumpBundle(Operands);
      });

      auto OperandsOpcode = getOpcode(Operands);
      if (OperandsOpcode && OperandsOpcode == getOpcode(Values)) {
        LLVM_DEBUG(dbgs() << "  Same opcode, continue building\n");
        CombinedOperands.push_back(buildGraph(Operands));
      } else {
        LLVM_DEBUG(dbgs() << "  Adding multinode Ops\n");
        // Create a dummy VPInstruction, which we will replace later by the
        // re-ordered operand.
        VPInstruction *Op = new VPInstruction(0, {});
        CombinedOperands.push_back(Op);
        MultiNodeOps.emplace_back(Op, Operands);
      }
    }

    if (MultiNodeRoot) {
      LLVM_DEBUG(dbgs() << "Reorder\n");
      MultiNodeActive = false;

      auto FinalOrder = reorderMultiNodeOps();

      MultiNodeOps.clear();
      for (auto &Ops : FinalOrder) {
        VPInstruction *NewOp = buildGraph(Ops.second);
        Ops.first->replaceAllUsesWith(NewOp);
        for (unsigned i = 0; i < CombinedOperands.size(); i++)
          if (CombinedOperands[i] == Ops.first)
            CombinedOperands[i] = NewOp;
        delete Ops.first;
        Ops.first = NewOp;
      }
      LLVM_DEBUG(dbgs() << "Found final order\n");
    }
  } else {
    LLVM_DEBUG(dbgs() << "  NonCommutative\n");
    if (ValuesOpcode == Instruction::Load)
      for (VPValue *V : Values)
        CombinedOperands.push_back(cast<VPInstruction>(V)->getOperand(0));
    else
      for (auto &Operands : getOperands(Values))
        CombinedOperands.push_back(buildGraph(Operands));
  }

  unsigned Opcode;
  switch (ValuesOpcode) {
  case Instruction::Load:
    Opcode = VPInstruction::SLPLoad;
    break;
  case Instruction::Store:
    Opcode = VPInstruction::SLPStore;
    break;
  default:
    Opcode = ValuesOpcode;
    break;
  }

  if (!CompletelySLP)
    return markFailed();

  assert(CombinedOperands.size() > 0 && "Need at least one operand");
  auto *Inst = cast<VPInstruction>(Values[0])->getUnderlyingInstr();
  auto *VPI = new VPInstruction(Opcode, CombinedOperands, Inst->getDebugLoc());

  LLVM_DEBUG(dbgs() << "Create VPInstruction " << *VPI << " " << Values[0]
                    << "\n");
  addCombined(Values, VPI);
  return VPI;
}