//===- VPlanSLP.cpp - SLP Analysis based on VPlan -------------------------===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
/// This file implements SLP analysis based on VPlan. The analysis is based on
/// the ideas described in
///
///   Look-ahead SLP: auto-vectorization in the presence of commutative
///   operations, CGO 2018 by Vasileios Porpodas, Rodrigo C. O. Rocha,
///   Luís F. W. Góes
///
//===----------------------------------------------------------------------===//

#include "VPlan.h"
#include "VPlanValue.h"
#include "llvm/ADT/DenseMap.h"
#include "llvm/ADT/SmallVector.h"
#include "llvm/Analysis/VectorUtils.h"
#include "llvm/IR/Instruction.h"
#include "llvm/IR/Instructions.h"
#include "llvm/IR/Type.h"
#include "llvm/IR/Value.h"
#include "llvm/Support/Casting.h"
#include "llvm/Support/Debug.h"
#include "llvm/Support/ErrorHandling.h"
#include "llvm/Support/raw_ostream.h"
#include <algorithm>
#include <cassert>
#include <optional>
#include <utility>

using namespace llvm;

#define DEBUG_TYPE "vplan-slp"

// Number of levels to look ahead when re-ordering multi node operands.
static unsigned LookaheadMaxDepth = 5;

VPInstruction *VPlanSlp::markFailed() {
  // FIXME: Currently this is used to signal we hit instructions we cannot
  // trivially SLP'ize.
  CompletelySLP = false;
  return nullptr;
}

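// Record that the bundle \p Operands has been combined into \p New and, if all
// operands have an underlying scalar instruction, update the widest bundle
// size seen so far (the sum of the operands' scalar sizes in bits).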
void VPlanSlp::addCombined(ArrayRef<VPValue *> Operands, VPInstruction *New) {
  if (all_of(Operands, [](VPValue *V) {
        return cast<VPInstruction>(V)->getUnderlyingInstr();
      })) {
    unsigned BundleSize = 0;
    for (VPValue *V : Operands) {
      Type *T = cast<VPInstruction>(V)->getUnderlyingInstr()->getType();
      assert(!T->isVectorTy() && "Only scalar types supported for now");
      BundleSize += T->getScalarSizeInBits();
    }
    WidestBundleBits = std::max(WidestBundleBits, BundleSize);
  }

  auto Res = BundleToCombined.try_emplace(to_vector<4>(Operands), New);
  assert(Res.second &&
         "Already created a combined instruction for the operand bundle");
  (void)Res;
}

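// Check whether the bundle \p Operands can be combined into a single vector
// instruction: all operands must be VPInstructions with an underlying
// instruction, share the same opcode and type width, be defined in the same
// basic block, and have at most one unique user. Loads and stores must
// additionally be simple, and no instruction may write to memory between the
// loads of a bundle.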
bool VPlanSlp::areVectorizable(ArrayRef<VPValue *> Operands) const {
  // Currently we only support VPInstructions.
  if (!all_of(Operands, [](VPValue *Op) {
        return Op && isa<VPInstruction>(Op) &&
               cast<VPInstruction>(Op)->getUnderlyingInstr();
      })) {
    LLVM_DEBUG(dbgs() << "VPSLP: not all operands are VPInstructions\n");
    return false;
  }

  // Check if opcodes and type width agree for all instructions in the bundle.
  // FIXME: Differing widths/opcodes can be handled by inserting additional
  // instructions.
  // FIXME: Deal with non-primitive types.
  const Instruction *OriginalInstr =
      cast<VPInstruction>(Operands[0])->getUnderlyingInstr();
  unsigned Opcode = OriginalInstr->getOpcode();
  unsigned Width = OriginalInstr->getType()->getPrimitiveSizeInBits();
  if (!all_of(Operands, [Opcode, Width](VPValue *Op) {
        const Instruction *I = cast<VPInstruction>(Op)->getUnderlyingInstr();
        return I->getOpcode() == Opcode &&
               I->getType()->getPrimitiveSizeInBits() == Width;
      })) {
    LLVM_DEBUG(dbgs() << "VPSLP: Opcodes do not agree \n");
    return false;
  }

  // For now, all operands must be defined in the same BB.
  if (any_of(Operands, [this](VPValue *Op) {
        return cast<VPInstruction>(Op)->getParent() != &this->BB;
      })) {
    LLVM_DEBUG(dbgs() << "VPSLP: operands in different BBs\n");
    return false;
  }

  if (any_of(Operands,
             [](VPValue *Op) { return Op->hasMoreThanOneUniqueUser(); })) {
    LLVM_DEBUG(dbgs() << "VPSLP: Some operands have multiple users.\n");
    return false;
  }

  // For loads, check that there are no instructions writing to memory in
  // between them.
  // TODO: we only have to forbid instructions writing to memory that could
  // interfere with any of the loads in the bundle
  if (Opcode == Instruction::Load) {
    unsigned LoadsSeen = 0;
    VPBasicBlock *Parent = cast<VPInstruction>(Operands[0])->getParent();
    for (auto &I : *Parent) {
      auto *VPI = dyn_cast<VPInstruction>(&I);
      if (!VPI)
        break;
      if (VPI->getOpcode() == Instruction::Load &&
          llvm::is_contained(Operands, VPI))
        LoadsSeen++;

      if (LoadsSeen == Operands.size())
        break;
      if (LoadsSeen > 0 && VPI->mayWriteToMemory()) {
        LLVM_DEBUG(
            dbgs() << "VPSLP: instruction modifying memory between loads\n");
        return false;
      }
    }

    if (!all_of(Operands, [](VPValue *Op) {
          return cast<LoadInst>(cast<VPInstruction>(Op)->getUnderlyingInstr())
              ->isSimple();
        })) {
      LLVM_DEBUG(dbgs() << "VPSLP: only simple loads are supported.\n");
      return false;
    }
  }

  if (Opcode == Instruction::Store)
    if (!all_of(Operands, [](VPValue *Op) {
          return cast<StoreInst>(cast<VPInstruction>(Op)->getUnderlyingInstr())
              ->isSimple();
        })) {
      LLVM_DEBUG(dbgs() << "VPSLP: only simple stores are supported.\n");
      return false;
    }

  return true;
}

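/// Collect the operand at index \p OperandIndex of each value in \p Values.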
static SmallVector<VPValue *, 4> getOperands(ArrayRef<VPValue *> Values,
                                             unsigned OperandIndex) {
  SmallVector<VPValue *, 4> Operands;
  for (VPValue *V : Values) {
    // Currently we only support VPInstructions.
    auto *U = cast<VPInstruction>(V);
    Operands.push_back(U->getOperand(OperandIndex));
  }
  return Operands;
}

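/// Return true if the values in the bundle use a commutative opcode. The
/// opcode of the first value is checked; areVectorizable already guarantees
/// that all opcodes in the bundle match.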
static bool areCommutative(ArrayRef<VPValue *> Values) {
  return Instruction::isCommutative(
      cast<VPInstruction>(Values[0])->getOpcode());
}

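/// Build one operand bundle per operand position of the values in \p Values.
/// For stores only the stored value (operand 0) is bundled; loads terminate
/// the tree and must not reach this function.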
static SmallVector<SmallVector<VPValue *, 4>, 4>
getOperands(ArrayRef<VPValue *> Values) {
  SmallVector<SmallVector<VPValue *, 4>, 4> Result;
  auto *VPI = cast<VPInstruction>(Values[0]);

  switch (VPI->getOpcode()) {
  case Instruction::Load:
    llvm_unreachable("Loads terminate a tree, no need to get operands");
  case Instruction::Store:
    Result.push_back(getOperands(Values, 0));
    break;
  default:
    for (unsigned I = 0, NumOps = VPI->getNumOperands(); I < NumOps; ++I)
      Result.push_back(getOperands(Values, I));
    break;
  }

  return Result;
}

/// Returns the opcode of Values or std::nullopt if they do not all agree.
static std::optional<unsigned> getOpcode(ArrayRef<VPValue *> Values) {
  unsigned Opcode = cast<VPInstruction>(Values[0])->getOpcode();
  if (any_of(Values, [Opcode](VPValue *V) {
        return cast<VPInstruction>(V)->getOpcode() != Opcode;
      }))
    return std::nullopt;
  return {Opcode};
}

/// Returns true if A and B access sequential memory if they are loads or
/// stores or if they have identical opcodes otherwise.
static bool areConsecutiveOrMatch(VPInstruction *A, VPInstruction *B,
                                  VPInterleavedAccessInfo &IAI) {
  if (A->getOpcode() != B->getOpcode())
    return false;

  if (A->getOpcode() != Instruction::Load &&
      A->getOpcode() != Instruction::Store)
    return true;
  auto *GA = IAI.getInterleaveGroup(A);
  auto *GB = IAI.getInterleaveGroup(B);

  return GA && GB && GA == GB && GA->getIndex(A) + 1 == GB->getIndex(B);
}

/// Implements getLAScore from Listing 7 in the paper.
/// Traverses and compares operands of V1 and V2 to MaxLevel.
static unsigned getLAScore(VPValue *V1, VPValue *V2, unsigned MaxLevel,
                           VPInterleavedAccessInfo &IAI) {
  auto *I1 = dyn_cast<VPInstruction>(V1);
  auto *I2 = dyn_cast<VPInstruction>(V2);
  // Currently we only support VPInstructions.
  if (!I1 || !I2)
    return 0;

  if (MaxLevel == 0)
    return (unsigned)areConsecutiveOrMatch(I1, I2, IAI);

  unsigned Score = 0;
  for (unsigned I = 0, EV1 = I1->getNumOperands(); I < EV1; ++I)
    for (unsigned J = 0, EV2 = I2->getNumOperands(); J < EV2; ++J)
      Score +=
          getLAScore(I1->getOperand(I), I2->getOperand(J), MaxLevel - 1, IAI);
  return Score;
}

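// Pick the candidate from \p Candidates that best continues \p Last in the
// current lane: first keep only candidates that are consecutive with or match
// \p Last, then break ties using the look-ahead score (getLAScore) at
// increasing depths. The chosen candidate is removed from \p Candidates.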
std::pair<VPlanSlp::OpMode, VPValue *>
VPlanSlp::getBest(OpMode Mode, VPValue *Last,
                  SmallPtrSetImpl<VPValue *> &Candidates,
                  VPInterleavedAccessInfo &IAI) {
  assert((Mode == OpMode::Load || Mode == OpMode::Opcode) &&
         "Currently we only handle load and commutative opcodes");
  LLVM_DEBUG(dbgs() << " getBest\n");

  SmallVector<VPValue *, 4> BestCandidates;
  LLVM_DEBUG(dbgs() << " Candidates for "
                    << *cast<VPInstruction>(Last)->getUnderlyingInstr() << " ");
  for (auto *Candidate : Candidates) {
    auto *LastI = cast<VPInstruction>(Last);
    auto *CandidateI = cast<VPInstruction>(Candidate);
    if (areConsecutiveOrMatch(LastI, CandidateI, IAI)) {
      LLVM_DEBUG(dbgs() << *cast<VPInstruction>(Candidate)->getUnderlyingInstr()
                        << " ");
      BestCandidates.push_back(Candidate);
    }
  }
  LLVM_DEBUG(dbgs() << "\n");

  if (BestCandidates.empty())
    return {OpMode::Failed, nullptr};

  if (BestCandidates.size() == 1)
    return {Mode, BestCandidates[0]};

  VPValue *Best = nullptr;
  unsigned BestScore = 0;
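  // Multiple candidates match; break the tie by comparing look-ahead scores at
  // increasing depths, stopping as soon as the scores start to differ.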
  for (unsigned Depth = 1; Depth < LookaheadMaxDepth; Depth++) {
    unsigned PrevScore = ~0u;
    bool AllSame = true;

    // FIXME: Avoid visiting the same operands multiple times.
    for (auto *Candidate : BestCandidates) {
      unsigned Score = getLAScore(Last, Candidate, Depth, IAI);
      if (PrevScore == ~0u)
        PrevScore = Score;
      if (PrevScore != Score)
        AllSame = false;
      PrevScore = Score;

      if (Score > BestScore) {
        BestScore = Score;
        Best = Candidate;
      }
    }
    if (!AllSame)
      break;
  }
  LLVM_DEBUG(dbgs() << "Found best "
                    << *cast<VPInstruction>(Best)->getUnderlyingInstr()
                    << "\n");
  Candidates.erase(Best);

  return {Mode, Best};
}

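// Reorder the operand bundles collected for the current multi node lane by
// lane: lane 0 keeps the original operand order, and each following lane
// greedily picks the best matching candidate for every bundle via getBest.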
SmallVector<VPlanSlp::MultiNodeOpTy, 4> VPlanSlp::reorderMultiNodeOps() {
  SmallVector<MultiNodeOpTy, 4> FinalOrder;
  SmallVector<OpMode, 4> Mode;
  FinalOrder.reserve(MultiNodeOps.size());
  Mode.reserve(MultiNodeOps.size());

  LLVM_DEBUG(dbgs() << "Reordering multinode\n");

  for (auto &Operands : MultiNodeOps) {
    FinalOrder.push_back({Operands.first, {Operands.second[0]}});
    if (cast<VPInstruction>(Operands.second[0])->getOpcode() ==
        Instruction::Load)
      Mode.push_back(OpMode::Load);
    else
      Mode.push_back(OpMode::Opcode);
  }

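  // For each subsequent lane, gather that lane's value from every bundle as
  // candidates and let getBest extend each bundle with the most suitable one.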
  for (unsigned Lane = 1, E = MultiNodeOps[0].second.size(); Lane < E; ++Lane) {
    LLVM_DEBUG(dbgs() << " Finding best value for lane " << Lane << "\n");
    SmallPtrSet<VPValue *, 4> Candidates;
    LLVM_DEBUG(dbgs() << " Candidates ");
    for (auto Ops : MultiNodeOps) {
      LLVM_DEBUG(
          dbgs() << *cast<VPInstruction>(Ops.second[Lane])->getUnderlyingInstr()
                 << " ");
      Candidates.insert(Ops.second[Lane]);
    }
    LLVM_DEBUG(dbgs() << "\n");

    for (unsigned Op = 0, E = MultiNodeOps.size(); Op < E; ++Op) {
      LLVM_DEBUG(dbgs() << " Checking " << Op << "\n");
      if (Mode[Op] == OpMode::Failed)
        continue;

      VPValue *Last = FinalOrder[Op].second[Lane - 1];
      std::pair<OpMode, VPValue *> Res =
          getBest(Mode[Op], Last, Candidates, IAI);
      if (Res.second)
        FinalOrder[Op].second.push_back(Res.second);
      else
        // TODO: handle this case
        FinalOrder[Op].second.push_back(markFailed());
    }
  }

  return FinalOrder;
}

#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
void VPlanSlp::dumpBundle(ArrayRef<VPValue *> Values) {
  dbgs() << " Ops: ";
  for (auto *Op : Values) {
    if (auto *VPInstr = cast_or_null<VPInstruction>(Op))
      if (auto *Instr = VPInstr->getUnderlyingInstr()) {
        dbgs() << *Instr << " | ";
        continue;
      }
    dbgs() << " nullptr | ";
  }
  dbgs() << "\n";
}
#endif

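// Recursively build the SLP graph for the bundle \p Values, combining operand
// bundles bottom-up. For commutative operations, operand bundles whose opcode
// differs from the bundle's are collected into a multi node and re-ordered via
// reorderMultiNodeOps once the root of the multi node has been processed.
// Returns the combined VPInstruction, or nullptr (via markFailed) if the
// bundle cannot be SLP'ized.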
VPInstruction *VPlanSlp::buildGraph(ArrayRef<VPValue *> Values) {
  assert(!Values.empty() && "Need some operands!");

  // If we already visited this instruction bundle, re-use the existing node
  auto I = BundleToCombined.find(to_vector<4>(Values));
  if (I != BundleToCombined.end()) {
#ifndef NDEBUG
    // Check that the resulting graph is a tree. If we re-use a node, this means
    // its values have multiple users. We only allow this, if all users of each
    // value are the same instruction.
    for (auto *V : Values) {
      auto UI = V->user_begin();
      auto *FirstUser = *UI++;
      while (UI != V->user_end()) {
        assert(*UI == FirstUser && "Currently we only support SLP trees.");
        UI++;
      }
    }
#endif
    return I->second;
  }

  // Dump inputs
  LLVM_DEBUG({
    dbgs() << "buildGraph: ";
    dumpBundle(Values);
  });

  if (!areVectorizable(Values))
    return markFailed();

  assert(getOpcode(Values) && "Opcodes for all values must match");
  unsigned ValuesOpcode = *getOpcode(Values);

  SmallVector<VPValue *, 4> CombinedOperands;
  if (areCommutative(Values)) {
    bool MultiNodeRoot = !MultiNodeActive;
    MultiNodeActive = true;
    for (auto &Operands : getOperands(Values)) {
      LLVM_DEBUG({
        dbgs() << " Visiting Commutative";
        dumpBundle(Operands);
      });

      auto OperandsOpcode = getOpcode(Operands);
      if (OperandsOpcode && OperandsOpcode == getOpcode(Values)) {
        LLVM_DEBUG(dbgs() << " Same opcode, continue building\n");
        CombinedOperands.push_back(buildGraph(Operands));
      } else {
        LLVM_DEBUG(dbgs() << " Adding multinode Ops\n");
        // Create a dummy VPInstruction, which we will replace later by the
        // re-ordered operand.
        VPInstruction *Op = new VPInstruction(0, {});
        CombinedOperands.push_back(Op);
        MultiNodeOps.emplace_back(Op, Operands);
      }
    }

    if (MultiNodeRoot) {
      LLVM_DEBUG(dbgs() << "Reorder \n");
      MultiNodeActive = false;

      auto FinalOrder = reorderMultiNodeOps();

      MultiNodeOps.clear();
      for (auto &Ops : FinalOrder) {
        VPInstruction *NewOp = buildGraph(Ops.second);
        Ops.first->replaceAllUsesWith(NewOp);
        for (unsigned i = 0; i < CombinedOperands.size(); i++)
          if (CombinedOperands[i] == Ops.first)
            CombinedOperands[i] = NewOp;
        delete Ops.first;
        Ops.first = NewOp;
      }
      LLVM_DEBUG(dbgs() << "Found final order\n");
    }
  } else {
    LLVM_DEBUG(dbgs() << " NonCommutative\n");
    if (ValuesOpcode == Instruction::Load)
      for (VPValue *V : Values)
        CombinedOperands.push_back(cast<VPInstruction>(V)->getOperand(0));
    else
      for (auto &Operands : getOperands(Values))
        CombinedOperands.push_back(buildGraph(Operands));
  }

  unsigned Opcode;
  switch (ValuesOpcode) {
  case Instruction::Load:
    Opcode = VPInstruction::SLPLoad;
    break;
  case Instruction::Store:
    Opcode = VPInstruction::SLPStore;
    break;
  default:
    Opcode = ValuesOpcode;
    break;
  }

  if (!CompletelySLP)
    return markFailed();

  assert(CombinedOperands.size() > 0 && "Need some operands");
  auto *Inst = cast<VPInstruction>(Values[0])->getUnderlyingInstr();
  auto *VPI = new VPInstruction(Opcode, CombinedOperands, Inst->getDebugLoc());

  LLVM_DEBUG(dbgs() << "Create VPInstruction " << *VPI << " "
                    << *cast<VPInstruction>(Values[0]) << "\n");
  addCombined(Values, VPI);
  return VPI;
}
470 | |