1//===-- HexagonVectorCombine.cpp ------------------------------------------===//
2//
3// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4// See https://llvm.org/LICENSE.txt for license information.
5// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6//
7//===----------------------------------------------------------------------===//
8// HexagonVectorCombine is a utility class implementing a variety of functions
9// that assist in vector-based optimizations.
10//
11// AlignVectors: replace unaligned vector loads and stores with aligned ones.
12// HvxIdioms: recognize various opportunities to generate HVX intrinsic code.
13//===----------------------------------------------------------------------===//
14
15#include "llvm/ADT/APInt.h"
16#include "llvm/ADT/ArrayRef.h"
17#include "llvm/ADT/DenseMap.h"
18#include "llvm/ADT/MapVector.h"
19#include "llvm/ADT/STLExtras.h"
20#include "llvm/ADT/SmallVector.h"
21#include "llvm/Analysis/AliasAnalysis.h"
22#include "llvm/Analysis/AssumeBundleQueries.h"
23#include "llvm/Analysis/AssumptionCache.h"
24#include "llvm/Analysis/InstSimplifyFolder.h"
25#include "llvm/Analysis/InstructionSimplify.h"
26#include "llvm/Analysis/OptimizationRemarkEmitter.h"
27#include "llvm/Analysis/ScalarEvolutionExpressions.h"
28#include "llvm/Analysis/TargetLibraryInfo.h"
29#include "llvm/Analysis/ValueTracking.h"
30#include "llvm/Analysis/VectorUtils.h"
31#include "llvm/CodeGen/TargetPassConfig.h"
32#include "llvm/CodeGen/ValueTypes.h"
33#include "llvm/IR/Dominators.h"
34#include "llvm/IR/IRBuilder.h"
35#include "llvm/IR/IntrinsicInst.h"
36#include "llvm/IR/Intrinsics.h"
37#include "llvm/IR/IntrinsicsHexagon.h"
38#include "llvm/IR/Metadata.h"
39#include "llvm/IR/PatternMatch.h"
40#include "llvm/InitializePasses.h"
41#include "llvm/Pass.h"
42#include "llvm/Support/CommandLine.h"
43#include "llvm/Support/KnownBits.h"
44#include "llvm/Support/MathExtras.h"
45#include "llvm/Support/raw_ostream.h"
46#include "llvm/Target/TargetMachine.h"
47#include "llvm/Transforms/Utils/Local.h"
48
49#include "Hexagon.h"
50#include "HexagonSubtarget.h"
51#include "HexagonTargetMachine.h"
52
53#include <algorithm>
54#include <deque>
55#include <map>
56#include <optional>
57#include <set>
58#include <utility>
59#include <vector>
60
61#define DEBUG_TYPE "hexagon-vc"
62
63// This is a const that represents default HVX VTCM page size.
64// It is boot time configurable, so we probably want an API to
65// read it, but for now assume 128KB
66#define DEFAULT_HVX_VTCM_PAGE_SIZE 131072
67
68using namespace llvm;
69
70namespace {
71cl::opt<bool> DumpModule("hvc-dump-module", cl::Hidden);
72cl::opt<bool> VAEnabled("hvc-va", cl::Hidden, cl::init(Val: true)); // Align
73cl::opt<bool> VIEnabled("hvc-vi", cl::Hidden, cl::init(Val: true)); // Idioms
74cl::opt<bool> VADoFullStores("hvc-va-full-stores", cl::Hidden);
75
76cl::opt<unsigned> VAGroupCountLimit("hvc-va-group-count-limit", cl::Hidden,
77 cl::init(Val: ~0));
78cl::opt<unsigned> VAGroupSizeLimit("hvc-va-group-size-limit", cl::Hidden,
79 cl::init(Val: ~0));
80cl::opt<unsigned>
81 MinLoadGroupSizeForAlignment("hvc-ld-min-group-size-for-alignment",
82 cl::Hidden, cl::init(Val: 4));
83
84class HexagonVectorCombine {
85public:
86 HexagonVectorCombine(Function &F_, AliasAnalysis &AA_, AssumptionCache &AC_,
87 DominatorTree &DT_, ScalarEvolution &SE_,
88 TargetLibraryInfo &TLI_, const TargetMachine &TM_,
89 OptimizationRemarkEmitter &ORE_)
90 : F(F_), DL(F.getDataLayout()), AA(AA_), AC(AC_), DT(DT_), SE(SE_),
91 TLI(TLI_),
92 HST(static_cast<const HexagonSubtarget &>(*TM_.getSubtargetImpl(F))),
93 ORE(ORE_) {}
94
95 bool run();
96
97 // Common integer type.
98 IntegerType *getIntTy(unsigned Width = 32) const;
99 // Byte type: either scalar (when Length = 0), or vector with given
100 // element count.
101 Type *getByteTy(int ElemCount = 0) const;
102 // Boolean type: either scalar (when Length = 0), or vector with given
103 // element count.
104 Type *getBoolTy(int ElemCount = 0) const;
105 // Create a ConstantInt of type returned by getIntTy with the value Val.
106 ConstantInt *getConstInt(int Val, unsigned Width = 32) const;
107 // Get the integer value of V, if it exists.
108 std::optional<APInt> getIntValue(const Value *Val) const;
109 // Is Val a constant 0, or a vector of 0s?
110 bool isZero(const Value *Val) const;
111 // Is Val an undef value?
112 bool isUndef(const Value *Val) const;
113 // Is Val a scalar (i1 true) or a vector of (i1 true)?
114 bool isTrue(const Value *Val) const;
115 // Is Val a scalar (i1 false) or a vector of (i1 false)?
116 bool isFalse(const Value *Val) const;
117
118 // Get HVX vector type with the given element type.
119 VectorType *getHvxTy(Type *ElemTy, bool Pair = false) const;
120
121 enum SizeKind {
122 Store, // Store size
123 Alloc, // Alloc size
124 };
125 int getSizeOf(const Value *Val, SizeKind Kind = Store) const;
126 int getSizeOf(const Type *Ty, SizeKind Kind = Store) const;
127 int getTypeAlignment(Type *Ty) const;
128 size_t length(Value *Val) const;
129 size_t length(Type *Ty) const;
130
131 Value *simplify(Value *Val) const;
132
133 Value *insertb(IRBuilderBase &Builder, Value *Dest, Value *Src, int Start,
134 int Length, int Where) const;
135 Value *vlalignb(IRBuilderBase &Builder, Value *Lo, Value *Hi,
136 Value *Amt) const;
137 Value *vralignb(IRBuilderBase &Builder, Value *Lo, Value *Hi,
138 Value *Amt) const;
139 Value *concat(IRBuilderBase &Builder, ArrayRef<Value *> Vecs) const;
140 Value *vresize(IRBuilderBase &Builder, Value *Val, int NewSize,
141 Value *Pad) const;
142 Value *rescale(IRBuilderBase &Builder, Value *Mask, Type *FromTy,
143 Type *ToTy) const;
144 Value *vlsb(IRBuilderBase &Builder, Value *Val) const;
145 Value *vbytes(IRBuilderBase &Builder, Value *Val) const;
146 Value *subvector(IRBuilderBase &Builder, Value *Val, unsigned Start,
147 unsigned Length) const;
148 Value *sublo(IRBuilderBase &Builder, Value *Val) const;
149 Value *subhi(IRBuilderBase &Builder, Value *Val) const;
150 Value *vdeal(IRBuilderBase &Builder, Value *Val0, Value *Val1) const;
151 Value *vshuff(IRBuilderBase &Builder, Value *Val0, Value *Val1) const;
152
153 Value *createHvxIntrinsic(IRBuilderBase &Builder, Intrinsic::ID IntID,
154 Type *RetTy, ArrayRef<Value *> Args,
155 ArrayRef<Type *> ArgTys = {},
156 ArrayRef<Value *> MDSources = {}) const;
157 SmallVector<Value *> splitVectorElements(IRBuilderBase &Builder, Value *Vec,
158 unsigned ToWidth) const;
159 Value *joinVectorElements(IRBuilderBase &Builder, ArrayRef<Value *> Values,
160 VectorType *ToType) const;
161
162 std::optional<int> calculatePointerDifference(Value *Ptr0, Value *Ptr1) const;
163
164 unsigned getNumSignificantBits(const Value *V,
165 const Instruction *CtxI = nullptr) const;
166 KnownBits getKnownBits(const Value *V,
167 const Instruction *CtxI = nullptr) const;
168
169 bool isSafeToClone(const Instruction &In) const;
170
171 template <typename T = std::vector<Instruction *>>
172 bool isSafeToMoveBeforeInBB(const Instruction &In,
173 BasicBlock::const_iterator To,
174 const T &IgnoreInsts = {}) const;
175
176 // This function is only used for assertions at the moment.
177 [[maybe_unused]] bool isByteVecTy(Type *Ty) const;
178
179 Function &F;
180 const DataLayout &DL;
181 AliasAnalysis &AA;
182 AssumptionCache &AC;
183 DominatorTree &DT;
184 ScalarEvolution &SE;
185 TargetLibraryInfo &TLI;
186 const HexagonSubtarget &HST;
187 OptimizationRemarkEmitter &ORE;
188
189private:
190 Value *getElementRange(IRBuilderBase &Builder, Value *Lo, Value *Hi,
191 int Start, int Length) const;
192};
193
194class AlignVectors {
195 // This code tries to replace unaligned vector loads/stores with aligned
196 // ones.
197 // Consider unaligned load:
198 // %v = original_load %some_addr, align <bad>
199 // %user = %v
200 // It will generate
201 // = load ..., align <good>
202 // = load ..., align <good>
203 // = valign
204 // etc.
205 // %synthesize = combine/shuffle the loaded data so that it looks
206 // exactly like what "original_load" has loaded.
207 // %user = %synthesize
208 // Similarly for stores.
209public:
210 AlignVectors(const HexagonVectorCombine &HVC_) : HVC(HVC_) {}
211
212 bool run();
213
214private:
215 using InstList = std::vector<Instruction *>;
216 using InstMap = DenseMap<Instruction *, Instruction *>;
217
218 struct AddrInfo {
219 AddrInfo(const AddrInfo &) = default;
220 AddrInfo &operator=(const AddrInfo &) = default;
221 AddrInfo(const HexagonVectorCombine &HVC, Instruction *I, Value *A, Type *T,
222 Align H)
223 : Inst(I), Addr(A), ValTy(T), HaveAlign(H),
224 NeedAlign(HVC.getTypeAlignment(Ty: ValTy)) {}
225
226 // XXX: add Size member?
227 Instruction *Inst;
228 Value *Addr;
229 Type *ValTy;
230 Align HaveAlign;
231 Align NeedAlign;
232 int Offset = 0; // Offset (in bytes) from the first member of the
233 // containing AddrList.
234 };
235 using AddrList = std::vector<AddrInfo>;
236
237 struct InstrLess {
238 bool operator()(const Instruction *A, const Instruction *B) const {
239 return A->comesBefore(Other: B);
240 }
241 };
242 using DepList = std::set<Instruction *, InstrLess>;
243
244 struct MoveGroup {
245 MoveGroup(const AddrInfo &AI, Instruction *B, bool Hvx, bool Load)
246 : Base(B), Main{AI.Inst}, Clones{}, IsHvx(Hvx), IsLoad(Load) {}
247 MoveGroup() = default;
248 Instruction *Base; // Base instruction of the parent address group.
249 InstList Main; // Main group of instructions.
250 InstList Deps; // List of dependencies.
251 InstMap Clones; // Map from original Deps to cloned ones.
252 bool IsHvx; // Is this group of HVX instructions?
253 bool IsLoad; // Is this a load group?
254 };
255 using MoveList = std::vector<MoveGroup>;
256
257 struct ByteSpan {
258 // A representation of "interesting" bytes within a given span of memory.
259 // These bytes are those that are loaded or stored, and they don't have
260 // to cover the entire span of memory.
261 //
262 // The representation works by picking a contiguous sequence of bytes
263 // from somewhere within a llvm::Value, and placing it at a given offset
264 // within the span.
265 //
266 // The sequence of bytes from llvm:Value is represented by Segment.
267 // Block is Segment, plus where it goes in the span.
268 //
269 // An important feature of ByteSpan is being able to make a "section",
270 // i.e. creating another ByteSpan corresponding to a range of offsets
271 // relative to the source span.
272
273 struct Segment {
274 // Segment of a Value: 'Len' bytes starting at byte 'Begin'.
275 Segment(Value *Val, int Begin, int Len)
276 : Val(Val), Start(Begin), Size(Len) {}
277 Segment(const Segment &Seg) = default;
278 Segment &operator=(const Segment &Seg) = default;
279 Value *Val; // Value representable as a sequence of bytes.
280 int Start; // First byte of the value that belongs to the segment.
281 int Size; // Number of bytes in the segment.
282 };
283
284 struct Block {
285 Block(Value *Val, int Len, int Pos) : Seg(Val, 0, Len), Pos(Pos) {}
286 Block(Value *Val, int Off, int Len, int Pos)
287 : Seg(Val, Off, Len), Pos(Pos) {}
288 Block(const Block &Blk) = default;
289 Block &operator=(const Block &Blk) = default;
290 Segment Seg; // Value segment.
291 int Pos; // Position (offset) of the block in the span.
292 };
293
294 int extent() const;
295 ByteSpan section(int Start, int Length) const;
296 ByteSpan &shift(int Offset);
297 SmallVector<Value *, 8> values() const;
298
299 int size() const { return Blocks.size(); }
300 Block &operator[](int i) { return Blocks[i]; }
301 const Block &operator[](int i) const { return Blocks[i]; }
302
303 std::vector<Block> Blocks;
304
305 using iterator = decltype(Blocks)::iterator;
306 iterator begin() { return Blocks.begin(); }
307 iterator end() { return Blocks.end(); }
308 using const_iterator = decltype(Blocks)::const_iterator;
309 const_iterator begin() const { return Blocks.begin(); }
310 const_iterator end() const { return Blocks.end(); }
311 };
312
313 std::optional<AddrInfo> getAddrInfo(Instruction &In) const;
314 bool isHvx(const AddrInfo &AI) const;
315 // This function is only used for assertions at the moment.
316 [[maybe_unused]] bool isSectorTy(Type *Ty) const;
317
318 Value *getPayload(Value *Val) const;
319 Value *getMask(Value *Val) const;
320 Value *getPassThrough(Value *Val) const;
321
322 Value *createAdjustedPointer(IRBuilderBase &Builder, Value *Ptr, Type *ValTy,
323 int Adjust,
324 const InstMap &CloneMap = InstMap()) const;
325 Value *createAlignedPointer(IRBuilderBase &Builder, Value *Ptr, Type *ValTy,
326 int Alignment,
327 const InstMap &CloneMap = InstMap()) const;
328
329 Value *createLoad(IRBuilderBase &Builder, Type *ValTy, Value *Ptr,
330 Value *Predicate, int Alignment, Value *Mask,
331 Value *PassThru, ArrayRef<Value *> MDSources = {}) const;
332 Value *createSimpleLoad(IRBuilderBase &Builder, Type *ValTy, Value *Ptr,
333 int Alignment,
334 ArrayRef<Value *> MDSources = {}) const;
335
336 Value *createStore(IRBuilderBase &Builder, Value *Val, Value *Ptr,
337 Value *Predicate, int Alignment, Value *Mask,
338 ArrayRef<Value *> MDSources = {}) const;
339 Value *createSimpleStore(IRBuilderBase &Builder, Value *Val, Value *Ptr,
340 int Alignment,
341 ArrayRef<Value *> MDSources = {}) const;
342
343 Value *createPredicatedLoad(IRBuilderBase &Builder, Type *ValTy, Value *Ptr,
344 Value *Predicate, int Alignment,
345 ArrayRef<Value *> MDSources = {}) const;
346 Value *createPredicatedStore(IRBuilderBase &Builder, Value *Val, Value *Ptr,
347 Value *Predicate, int Alignment,
348 ArrayRef<Value *> MDSources = {}) const;
349
350 DepList getUpwardDeps(Instruction *In, Instruction *Base) const;
351 bool createAddressGroups();
352 MoveList createLoadGroups(const AddrList &Group) const;
353 MoveList createStoreGroups(const AddrList &Group) const;
354 bool moveTogether(MoveGroup &Move) const;
355 template <typename T>
356 InstMap cloneBefore(BasicBlock::iterator To, T &&Insts) const;
357
358 void realignLoadGroup(IRBuilderBase &Builder, const ByteSpan &VSpan,
359 int ScLen, Value *AlignVal, Value *AlignAddr) const;
360 void realignStoreGroup(IRBuilderBase &Builder, const ByteSpan &VSpan,
361 int ScLen, Value *AlignVal, Value *AlignAddr) const;
362 bool realignGroup(const MoveGroup &Move);
363 Value *makeTestIfUnaligned(IRBuilderBase &Builder, Value *AlignVal,
364 int Alignment) const;
365
366 using AddrGroupMap = MapVector<Instruction *, AddrList>;
367 AddrGroupMap AddrGroups;
368
369 friend raw_ostream &operator<<(raw_ostream &OS, const AddrList &L);
370 friend raw_ostream &operator<<(raw_ostream &OS, const AddrInfo &AI);
371 friend raw_ostream &operator<<(raw_ostream &OS, const MoveGroup &MG);
372 friend raw_ostream &operator<<(raw_ostream &OS, const MoveList &L);
373 friend raw_ostream &operator<<(raw_ostream &OS, const ByteSpan::Block &B);
374 friend raw_ostream &operator<<(raw_ostream &OS, const ByteSpan &BS);
375 friend raw_ostream &operator<<(raw_ostream &OS, const AddrGroupMap &AG);
376 friend raw_ostream &operator<<(raw_ostream &OS, const AddrList &L);
377 friend raw_ostream &operator<<(raw_ostream &OS, const AddrInfo &AI);
378 friend raw_ostream &operator<<(raw_ostream &OS, const MoveGroup &MG);
379 friend raw_ostream &operator<<(raw_ostream &OS, const MoveList &L);
380 friend raw_ostream &operator<<(raw_ostream &OS, const ByteSpan::Block &B);
381 friend raw_ostream &operator<<(raw_ostream &OS, const ByteSpan &BS);
382 friend raw_ostream &operator<<(raw_ostream &OS, const AddrGroupMap &AG);
383
384 const HexagonVectorCombine &HVC;
385};
386
387[[maybe_unused]] raw_ostream &operator<<(raw_ostream &OS,
388 const AlignVectors::AddrGroupMap &AG) {
389 OS << "Printing AddrGroups:"
390 << "\n";
391 for (auto &It : AG) {
392 OS << "\n\tInstruction: ";
393 It.first->dump();
394 OS << "\n\tAddrInfo: ";
395 for (auto &AI : It.second)
396 OS << AI << "\n";
397 }
398 return OS;
399}
400
401[[maybe_unused]] raw_ostream &operator<<(raw_ostream &OS,
402 const AlignVectors::AddrList &AL) {
403 OS << "\n *** Addr List: ***\n";
404 for (auto &AG : AL) {
405 OS << "\n *** Addr Group: ***\n";
406 OS << AG;
407 OS << "\n";
408 }
409 return OS;
410}
411
412[[maybe_unused]] raw_ostream &operator<<(raw_ostream &OS,
413 const AlignVectors::AddrInfo &AI) {
414 OS << "Inst: " << AI.Inst << " " << *AI.Inst << '\n';
415 OS << "Addr: " << *AI.Addr << '\n';
416 OS << "Type: " << *AI.ValTy << '\n';
417 OS << "HaveAlign: " << AI.HaveAlign.value() << '\n';
418 OS << "NeedAlign: " << AI.NeedAlign.value() << '\n';
419 OS << "Offset: " << AI.Offset;
420 return OS;
421}
422
423[[maybe_unused]] raw_ostream &operator<<(raw_ostream &OS,
424 const AlignVectors::MoveList &ML) {
425 OS << "\n *** Move List: ***\n";
426 for (auto &MG : ML) {
427 OS << "\n *** Move Group: ***\n";
428 OS << MG;
429 OS << "\n";
430 }
431 return OS;
432}
433
434[[maybe_unused]] raw_ostream &operator<<(raw_ostream &OS,
435 const AlignVectors::MoveGroup &MG) {
436 OS << "IsLoad:" << (MG.IsLoad ? "yes" : "no");
437 OS << ", IsHvx:" << (MG.IsHvx ? "yes" : "no") << '\n';
438 OS << "Main\n";
439 for (Instruction *I : MG.Main)
440 OS << " " << *I << '\n';
441 OS << "Deps\n";
442 for (Instruction *I : MG.Deps)
443 OS << " " << *I << '\n';
444 OS << "Clones\n";
445 for (auto [K, V] : MG.Clones) {
446 OS << " ";
447 K->printAsOperand(O&: OS, PrintType: false);
448 OS << "\t-> " << *V << '\n';
449 }
450 return OS;
451}
452
453[[maybe_unused]] raw_ostream &
454operator<<(raw_ostream &OS, const AlignVectors::ByteSpan::Block &B) {
455 OS << " @" << B.Pos << " [" << B.Seg.Start << ',' << B.Seg.Size << "] ";
456 if (B.Seg.Val == reinterpret_cast<const Value *>(&B)) {
457 OS << "(self:" << B.Seg.Val << ')';
458 } else if (B.Seg.Val != nullptr) {
459 OS << *B.Seg.Val;
460 } else {
461 OS << "(null)";
462 }
463 return OS;
464}
465
466[[maybe_unused]] raw_ostream &operator<<(raw_ostream &OS,
467 const AlignVectors::ByteSpan &BS) {
468 OS << "ByteSpan[size=" << BS.size() << ", extent=" << BS.extent() << '\n';
469 for (const AlignVectors::ByteSpan::Block &B : BS)
470 OS << B << '\n';
471 OS << ']';
472 return OS;
473}
474
475class HvxIdioms {
476public:
477 enum DstQualifier {
478 Undefined = 0,
479 Arithmetic,
480 LdSt,
481 LLVM_Gather,
482 LLVM_Scatter,
483 HEX_Gather_Scatter,
484 HEX_Gather,
485 HEX_Scatter,
486 Call
487 };
488
489 HvxIdioms(const HexagonVectorCombine &HVC_) : HVC(HVC_) {
490 auto *Int32Ty = HVC.getIntTy(Width: 32);
491 HvxI32Ty = HVC.getHvxTy(ElemTy: Int32Ty, /*Pair=*/false);
492 HvxP32Ty = HVC.getHvxTy(ElemTy: Int32Ty, /*Pair=*/true);
493 }
494
495 bool run();
496
497private:
498 enum Signedness { Positive, Signed, Unsigned };
499
500 // Value + sign
501 // This is to keep track of whether the value should be treated as signed
502 // or unsigned, or is known to be positive.
503 struct SValue {
504 Value *Val;
505 Signedness Sgn;
506 };
507
508 struct FxpOp {
509 unsigned Opcode;
510 unsigned Frac; // Number of fraction bits
511 SValue X, Y;
512 // If present, add 1 << RoundAt before shift:
513 std::optional<unsigned> RoundAt;
514 VectorType *ResTy;
515 };
516
517 auto getNumSignificantBits(Value *V, Instruction *In) const
518 -> std::pair<unsigned, Signedness>;
519 auto canonSgn(SValue X, SValue Y) const -> std::pair<SValue, SValue>;
520
521 auto matchFxpMul(Instruction &In) const -> std::optional<FxpOp>;
522 auto processFxpMul(Instruction &In, const FxpOp &Op) const -> Value *;
523
524 auto processFxpMulChopped(IRBuilderBase &Builder, Instruction &In,
525 const FxpOp &Op) const -> Value *;
526 auto createMulQ15(IRBuilderBase &Builder, SValue X, SValue Y,
527 bool Rounding) const -> Value *;
528 auto createMulQ31(IRBuilderBase &Builder, SValue X, SValue Y,
529 bool Rounding) const -> Value *;
530 // Return {Result, Carry}, where Carry is a vector predicate.
531 auto createAddCarry(IRBuilderBase &Builder, Value *X, Value *Y,
532 Value *CarryIn = nullptr) const
533 -> std::pair<Value *, Value *>;
534 auto createMul16(IRBuilderBase &Builder, SValue X, SValue Y) const -> Value *;
535 auto createMulH16(IRBuilderBase &Builder, SValue X, SValue Y) const
536 -> Value *;
537 auto createMul32(IRBuilderBase &Builder, SValue X, SValue Y) const
538 -> std::pair<Value *, Value *>;
539 auto createAddLong(IRBuilderBase &Builder, ArrayRef<Value *> WordX,
540 ArrayRef<Value *> WordY) const -> SmallVector<Value *>;
541 auto createMulLong(IRBuilderBase &Builder, ArrayRef<Value *> WordX,
542 Signedness SgnX, ArrayRef<Value *> WordY,
543 Signedness SgnY) const -> SmallVector<Value *>;
544
545 bool matchMLoad(Instruction &In) const;
546 bool matchMStore(Instruction &In) const;
547 Value *processMLoad(Instruction &In) const;
548 Value *processMStore(Instruction &In) const;
549 std::optional<uint64_t> getAlignment(Instruction &In, Value *ptr) const;
550 std::optional<uint64_t>
551 getAlignmentImpl(Instruction &In, Value *ptr,
552 SmallPtrSet<Value *, 16> &Visited) const;
553 std::optional<uint64_t> getPHIBaseMinAlignment(Instruction &In,
554 PHINode *PN) const;
555
556 // Vector manipulations for Ripple
557 bool matchScatter(Instruction &In) const;
558 bool matchGather(Instruction &In) const;
559 Value *processVScatter(Instruction &In) const;
560 Value *processVGather(Instruction &In) const;
561
562 VectorType *HvxI32Ty;
563 VectorType *HvxP32Ty;
564 const HexagonVectorCombine &HVC;
565
566 friend raw_ostream &operator<<(raw_ostream &, const FxpOp &);
567};
568
569[[maybe_unused]] raw_ostream &operator<<(raw_ostream &OS,
570 const HvxIdioms::FxpOp &Op) {
571 static const char *SgnNames[] = {"Positive", "Signed", "Unsigned"};
572 OS << Instruction::getOpcodeName(Opcode: Op.Opcode) << '.' << Op.Frac;
573 if (Op.RoundAt.has_value()) {
574 if (Op.Frac != 0 && *Op.RoundAt == Op.Frac - 1) {
575 OS << ":rnd";
576 } else {
577 OS << " + 1<<" << *Op.RoundAt;
578 }
579 }
580 OS << "\n X:(" << SgnNames[Op.X.Sgn] << ") " << *Op.X.Val << "\n"
581 << " Y:(" << SgnNames[Op.Y.Sgn] << ") " << *Op.Y.Val;
582 return OS;
583}
584
585} // namespace
586
587namespace {
588
// Pass MaybeT through only if it is non-null and reports isUnordered();
// otherwise yield nullptr.
template <typename T> T *getIfUnordered(T *MaybeT) {
  if (MaybeT == nullptr)
    return nullptr;
  if (!MaybeT->isUnordered())
    return nullptr;
  return MaybeT;
}
592template <typename T> T *isCandidate(Instruction *In) {
593 return dyn_cast<T>(In);
594}
595template <> LoadInst *isCandidate<LoadInst>(Instruction *In) {
596 return getIfUnordered(MaybeT: dyn_cast<LoadInst>(Val: In));
597}
598template <> StoreInst *isCandidate<StoreInst>(Instruction *In) {
599 return getIfUnordered(MaybeT: dyn_cast<StoreInst>(Val: In));
600}
601
602// Forward other erase_ifs to the LLVM implementations.
603template <typename Pred, typename T> void erase_if(T &&container, Pred p) {
604 llvm::erase_if(std::forward<T>(container), p);
605}
606
607} // namespace
608
609// --- Begin AlignVectors
610
611// For brevity, only consider loads. We identify a group of loads where we
612// know the relative differences between their addresses, so we know how they
613// are laid out in memory (relative to one another). These loads can overlap,
614// can be shorter or longer than the desired vector length.
615// Ultimately we want to generate a sequence of aligned loads that will load
616// every byte that the original loads loaded, and have the program use these
617// loaded values instead of the original loads.
618// We consider the contiguous memory area spanned by all these loads.
619//
620// Let's say that a single aligned vector load can load 16 bytes at a time.
621// If the program wanted to use a byte at offset 13 from the beginning of the
622// original span, it will be a byte at offset 13+x in the aligned data for
623// some x>=0. This may happen to be in the first aligned load, or in the load
// following it. Since we generally don't know what that alignment value
// is at compile time, we proactively do valigns on the aligned loads, so that
// the byte that was at offset 13 is still at offset 13 after the valigns.
627//
628// This will be the starting point for making the rest of the program use the
629// data loaded by the new loads.
630// For each original load, and its users:
631// %v = load ...
632// ... = %v
633// ... = %v
634// we create
635// %new_v = extract/combine/shuffle data from loaded/valigned vectors so
636// it contains the same value as %v did before
637// then replace all users of %v with %new_v.
638// ... = %new_v
639// ... = %new_v
640
641auto AlignVectors::ByteSpan::extent() const -> int {
642 if (size() == 0)
643 return 0;
644 int Min = Blocks[0].Pos;
645 int Max = Blocks[0].Pos + Blocks[0].Seg.Size;
646 for (int i = 1, e = size(); i != e; ++i) {
647 Min = std::min(a: Min, b: Blocks[i].Pos);
648 Max = std::max(a: Max, b: Blocks[i].Pos + Blocks[i].Seg.Size);
649 }
650 return Max - Min;
651}
652
653auto AlignVectors::ByteSpan::section(int Start, int Length) const -> ByteSpan {
654 ByteSpan Section;
655 for (const ByteSpan::Block &B : Blocks) {
656 int L = std::max(a: B.Pos, b: Start); // Left end.
657 int R = std::min(a: B.Pos + B.Seg.Size, b: Start + Length); // Right end+1.
658 if (L < R) {
659 // How much to chop off the beginning of the segment:
660 int Off = L > B.Pos ? L - B.Pos : 0;
661 Section.Blocks.emplace_back(args: B.Seg.Val, args: B.Seg.Start + Off, args: R - L, args&: L);
662 }
663 }
664 return Section;
665}
666
667auto AlignVectors::ByteSpan::shift(int Offset) -> ByteSpan & {
668 for (Block &B : Blocks)
669 B.Pos += Offset;
670 return *this;
671}
672
673auto AlignVectors::ByteSpan::values() const -> SmallVector<Value *, 8> {
674 SmallVector<Value *, 8> Values(Blocks.size());
675 for (int i = 0, e = Blocks.size(); i != e; ++i)
676 Values[i] = Blocks[i].Seg.Val;
677 return Values;
678}
679
680// Turn a requested integer alignment into the effective Align to use.
681// If Requested == 0 -> use ABI alignment of the value type (old semantics).
682// 0 means "ABI alignment" in old IR.
683static Align effectiveAlignForValueTy(const DataLayout &DL, Type *ValTy,
684 int Requested) {
685 if (Requested > 0)
686 return Align(static_cast<uint64_t>(Requested));
687 return Align(DL.getABITypeAlign(Ty: ValTy).value());
688}
689
690auto AlignVectors::getAddrInfo(Instruction &In) const
691 -> std::optional<AddrInfo> {
692 if (auto *L = isCandidate<LoadInst>(In: &In))
693 return AddrInfo(HVC, L, L->getPointerOperand(), L->getType(),
694 L->getAlign());
695 if (auto *S = isCandidate<StoreInst>(In: &In))
696 return AddrInfo(HVC, S, S->getPointerOperand(),
697 S->getValueOperand()->getType(), S->getAlign());
698 if (auto *II = isCandidate<IntrinsicInst>(In: &In)) {
699 Intrinsic::ID ID = II->getIntrinsicID();
700 switch (ID) {
701 case Intrinsic::masked_load:
702 return AddrInfo(HVC, II, II->getArgOperand(i: 0), II->getType(),
703 II->getParamAlign(ArgNo: 0).valueOrOne());
704 case Intrinsic::masked_store:
705 return AddrInfo(HVC, II, II->getArgOperand(i: 1),
706 II->getArgOperand(i: 0)->getType(),
707 II->getParamAlign(ArgNo: 1).valueOrOne());
708 }
709 }
710 return std::nullopt;
711}
712
713auto AlignVectors::isHvx(const AddrInfo &AI) const -> bool {
714 return HVC.HST.isTypeForHVX(VecTy: AI.ValTy);
715}
716
717auto AlignVectors::getPayload(Value *Val) const -> Value * {
718 if (auto *In = dyn_cast<Instruction>(Val)) {
719 Intrinsic::ID ID = 0;
720 if (auto *II = dyn_cast<IntrinsicInst>(Val: In))
721 ID = II->getIntrinsicID();
722 if (isa<StoreInst>(Val: In) || ID == Intrinsic::masked_store)
723 return In->getOperand(i: 0);
724 }
725 return Val;
726}
727
728auto AlignVectors::getMask(Value *Val) const -> Value * {
729 if (auto *II = dyn_cast<IntrinsicInst>(Val)) {
730 switch (II->getIntrinsicID()) {
731 case Intrinsic::masked_load:
732 return II->getArgOperand(i: 1);
733 case Intrinsic::masked_store:
734 return II->getArgOperand(i: 2);
735 }
736 }
737
738 Type *ValTy = getPayload(Val)->getType();
739 if (auto *VecTy = dyn_cast<VectorType>(Val: ValTy))
740 return Constant::getAllOnesValue(Ty: HVC.getBoolTy(ElemCount: HVC.length(Ty: VecTy)));
741 return Constant::getAllOnesValue(Ty: HVC.getBoolTy());
742}
743
744auto AlignVectors::getPassThrough(Value *Val) const -> Value * {
745 if (auto *II = dyn_cast<IntrinsicInst>(Val)) {
746 if (II->getIntrinsicID() == Intrinsic::masked_load)
747 return II->getArgOperand(i: 2);
748 }
749 return UndefValue::get(T: getPayload(Val)->getType());
750}
751
752auto AlignVectors::createAdjustedPointer(IRBuilderBase &Builder, Value *Ptr,
753 Type *ValTy, int Adjust,
754 const InstMap &CloneMap) const
755 -> Value * {
756 if (auto *I = dyn_cast<Instruction>(Val: Ptr))
757 if (Instruction *New = CloneMap.lookup(Val: I))
758 Ptr = New;
759 return Builder.CreatePtrAdd(Ptr, Offset: HVC.getConstInt(Val: Adjust), Name: "gep");
760}
761
762auto AlignVectors::createAlignedPointer(IRBuilderBase &Builder, Value *Ptr,
763 Type *ValTy, int Alignment,
764 const InstMap &CloneMap) const
765 -> Value * {
766 auto remap = [&](Value *V) -> Value * {
767 if (auto *I = dyn_cast<Instruction>(Val: V)) {
768 for (auto [Old, New] : CloneMap)
769 I->replaceUsesOfWith(From: Old, To: New);
770 return I;
771 }
772 return V;
773 };
774 Value *AsInt = Builder.CreatePtrToInt(V: Ptr, DestTy: HVC.getIntTy(), Name: "pti");
775 Value *Mask = HVC.getConstInt(Val: -Alignment);
776 Value *And = Builder.CreateAnd(LHS: remap(AsInt), RHS: Mask, Name: "and");
777 return Builder.CreateIntToPtr(
778 V: And, DestTy: PointerType::getUnqual(C&: ValTy->getContext()), Name: "itp");
779}
780
781auto AlignVectors::createLoad(IRBuilderBase &Builder, Type *ValTy, Value *Ptr,
782 Value *Predicate, int Alignment, Value *Mask,
783 Value *PassThru,
784 ArrayRef<Value *> MDSources) const -> Value * {
785 // Predicate is nullptr if not creating predicated load
786 if (Predicate) {
787 assert(!Predicate->getType()->isVectorTy() &&
788 "Expectning scalar predicate");
789 if (HVC.isFalse(Val: Predicate))
790 return UndefValue::get(T: ValTy);
791 if (!HVC.isTrue(Val: Predicate)) {
792 Value *Load = createPredicatedLoad(Builder, ValTy, Ptr, Predicate,
793 Alignment, MDSources);
794 return Builder.CreateSelect(C: Mask, True: Load, False: PassThru);
795 }
796 // Predicate == true here.
797 }
798 assert(!HVC.isUndef(Mask)); // Should this be allowed?
799 if (HVC.isZero(Val: Mask))
800 return PassThru;
801
802 Align EffA = effectiveAlignForValueTy(DL: HVC.DL, ValTy, Requested: Alignment);
803 if (HVC.isTrue(Val: Mask))
804 return createSimpleLoad(Builder, ValTy, Ptr, Alignment: EffA.value(), MDSources);
805
806 Instruction *Load =
807 Builder.CreateMaskedLoad(Ty: ValTy, Ptr, Alignment: EffA, Mask, PassThru, Name: "mld");
808 LLVM_DEBUG(dbgs() << "\t[Creating masked Load:] "; Load->dump());
809 propagateMetadata(I: Load, VL: MDSources);
810 return Load;
811}
812
813auto AlignVectors::createSimpleLoad(IRBuilderBase &Builder, Type *ValTy,
814 Value *Ptr, int Alignment,
815 ArrayRef<Value *> MDSources) const
816 -> Value * {
817 Align EffA = effectiveAlignForValueTy(DL: HVC.DL, ValTy, Requested: Alignment);
818 Instruction *Load = Builder.CreateAlignedLoad(Ty: ValTy, Ptr, Align: EffA, Name: "ald");
819 propagateMetadata(I: Load, VL: MDSources);
820 LLVM_DEBUG(dbgs() << "\t[Creating Load:] "; Load->dump());
821 return Load;
822}
823
824auto AlignVectors::createPredicatedLoad(IRBuilderBase &Builder, Type *ValTy,
825 Value *Ptr, Value *Predicate,
826 int Alignment,
827 ArrayRef<Value *> MDSources) const
828 -> Value * {
829 assert(HVC.HST.isTypeForHVX(ValTy) &&
830 "Predicates 'scalar' vector loads not yet supported");
831 assert(Predicate);
832 assert(!Predicate->getType()->isVectorTy() && "Expectning scalar predicate");
833 Align EffA = effectiveAlignForValueTy(DL: HVC.DL, ValTy, Requested: Alignment);
834 assert(HVC.getSizeOf(ValTy, HVC.Alloc) % EffA.value() == 0);
835
836 if (HVC.isFalse(Val: Predicate))
837 return UndefValue::get(T: ValTy);
838 if (HVC.isTrue(Val: Predicate))
839 return createSimpleLoad(Builder, ValTy, Ptr, Alignment: EffA.value(), MDSources);
840
841 auto V6_vL32b_pred_ai = HVC.HST.getIntrinsicId(Opc: Hexagon::V6_vL32b_pred_ai);
842 // FIXME: This may not put the offset from Ptr into the vmem offset.
843 return HVC.createHvxIntrinsic(Builder, IntID: V6_vL32b_pred_ai, RetTy: ValTy,
844 Args: {Predicate, Ptr, HVC.getConstInt(Val: 0)}, ArgTys: {},
845 MDSources);
846}
847
848auto AlignVectors::createStore(IRBuilderBase &Builder, Value *Val, Value *Ptr,
849 Value *Predicate, int Alignment, Value *Mask,
850 ArrayRef<Value *> MDSources) const -> Value * {
851 if (HVC.isZero(Val: Mask) || HVC.isUndef(Val) || HVC.isUndef(Val: Mask))
852 return UndefValue::get(T: Val->getType());
853 assert(!Predicate || (!Predicate->getType()->isVectorTy() &&
854 "Expectning scalar predicate"));
855 if (Predicate) {
856 if (HVC.isFalse(Val: Predicate))
857 return UndefValue::get(T: Val->getType());
858 if (HVC.isTrue(Val: Predicate))
859 Predicate = nullptr;
860 }
861 // Here both Predicate and Mask are true or unknown.
862
863 if (HVC.isTrue(Val: Mask)) {
864 if (Predicate) { // Predicate unknown
865 return createPredicatedStore(Builder, Val, Ptr, Predicate, Alignment,
866 MDSources);
867 }
868 // Predicate is true:
869 return createSimpleStore(Builder, Val, Ptr, Alignment, MDSources);
870 }
871
872 // Mask is unknown
873 if (!Predicate) {
874 Instruction *Store =
875 Builder.CreateMaskedStore(Val, Ptr, Alignment: Align(Alignment), Mask);
876 propagateMetadata(I: Store, VL: MDSources);
877 return Store;
878 }
879
880 // Both Predicate and Mask are unknown.
881 // Emulate masked store with predicated-load + mux + predicated-store.
882 Value *PredLoad = createPredicatedLoad(Builder, ValTy: Val->getType(), Ptr,
883 Predicate, Alignment, MDSources);
884 Value *Mux = Builder.CreateSelect(C: Mask, True: Val, False: PredLoad);
885 return createPredicatedStore(Builder, Val: Mux, Ptr, Predicate, Alignment,
886 MDSources);
887}
888
889auto AlignVectors::createSimpleStore(IRBuilderBase &Builder, Value *Val,
890 Value *Ptr, int Alignment,
891 ArrayRef<Value *> MDSources) const
892 -> Value * {
893 Align EffA = effectiveAlignForValueTy(DL: HVC.DL, ValTy: Val->getType(), Requested: Alignment);
894 Instruction *Store = Builder.CreateAlignedStore(Val, Ptr, Align: EffA);
895 LLVM_DEBUG(dbgs() << "\t[Creating store:] "; Store->dump());
896 propagateMetadata(I: Store, VL: MDSources);
897 return Store;
898}
899
900auto AlignVectors::createPredicatedStore(IRBuilderBase &Builder, Value *Val,
901 Value *Ptr, Value *Predicate,
902 int Alignment,
903 ArrayRef<Value *> MDSources) const
904 -> Value * {
905 Align EffA = effectiveAlignForValueTy(DL: HVC.DL, ValTy: Val->getType(), Requested: Alignment);
906 assert(HVC.HST.isTypeForHVX(Val->getType()) &&
907 "Predicates 'scalar' vector stores not yet supported");
908 assert(Predicate);
909 if (HVC.isFalse(Val: Predicate))
910 return UndefValue::get(T: Val->getType());
911 if (HVC.isTrue(Val: Predicate))
912 return createSimpleStore(Builder, Val, Ptr, Alignment: EffA.value(), MDSources);
913
914 assert(HVC.getSizeOf(Val, HVC.Alloc) % EffA.value() == 0);
915 auto V6_vS32b_pred_ai = HVC.HST.getIntrinsicId(Opc: Hexagon::V6_vS32b_pred_ai);
916 // FIXME: This may not put the offset from Ptr into the vmem offset.
917 return HVC.createHvxIntrinsic(Builder, IntID: V6_vS32b_pred_ai, RetTy: nullptr,
918 Args: {Predicate, Ptr, HVC.getConstInt(Val: 0), Val}, ArgTys: {},
919 MDSources);
920}
921
922auto AlignVectors::getUpwardDeps(Instruction *In, Instruction *Base) const
923 -> DepList {
924 BasicBlock *Parent = Base->getParent();
925 assert(In->getParent() == Parent &&
926 "Base and In should be in the same block");
927 assert(Base->comesBefore(In) && "Base should come before In");
928
929 DepList Deps;
930 std::deque<Instruction *> WorkQ = {In};
931 while (!WorkQ.empty()) {
932 Instruction *D = WorkQ.front();
933 WorkQ.pop_front();
934 if (D != In)
935 Deps.insert(x: D);
936 for (Value *Op : D->operands()) {
937 if (auto *I = dyn_cast<Instruction>(Val: Op)) {
938 if (I->getParent() == Parent && Base->comesBefore(Other: I))
939 WorkQ.push_back(x: I);
940 }
941 }
942 }
943 return Deps;
944}
945
946auto AlignVectors::createAddressGroups() -> bool {
947 // An address group created here may contain instructions spanning
948 // multiple basic blocks.
949 AddrList WorkStack;
950
951 auto findBaseAndOffset = [&](AddrInfo &AI) -> std::pair<Instruction *, int> {
952 for (AddrInfo &W : WorkStack) {
953 if (auto D = HVC.calculatePointerDifference(Ptr0: AI.Addr, Ptr1: W.Addr))
954 return std::make_pair(x&: W.Inst, y&: *D);
955 }
956 return std::make_pair(x: nullptr, y: 0);
957 };
958
959 auto traverseBlock = [&](DomTreeNode *DomN, auto Visit) -> void {
960 BasicBlock &Block = *DomN->getBlock();
961 for (Instruction &I : Block) {
962 auto AI = this->getAddrInfo(In&: I); // Use this-> for gcc6.
963 if (!AI)
964 continue;
965 auto F = findBaseAndOffset(*AI);
966 Instruction *GroupInst;
967 if (Instruction *BI = F.first) {
968 AI->Offset = F.second;
969 GroupInst = BI;
970 } else {
971 WorkStack.push_back(x: *AI);
972 GroupInst = AI->Inst;
973 }
974 AddrGroups[GroupInst].push_back(x: *AI);
975 }
976
977 for (DomTreeNode *C : DomN->children())
978 Visit(C, Visit);
979
980 while (!WorkStack.empty() && WorkStack.back().Inst->getParent() == &Block)
981 WorkStack.pop_back();
982 };
983
984 traverseBlock(HVC.DT.getRootNode(), traverseBlock);
985 assert(WorkStack.empty());
986
987 // AddrGroups are formed.
988 // Remove groups of size 1.
989 AddrGroups.remove_if(Pred: [](auto &G) { return G.second.size() == 1; });
990 // Remove groups that don't use HVX types.
991 AddrGroups.remove_if(Pred: [&](auto &G) {
992 return llvm::none_of(
993 G.second, [&](auto &I) { return HVC.HST.isTypeForHVX(VecTy: I.ValTy); });
994 });
995
996 LLVM_DEBUG(dbgs() << AddrGroups);
997 return !AddrGroups.empty();
998}
999
1000auto AlignVectors::createLoadGroups(const AddrList &Group) const -> MoveList {
1001 // Form load groups.
1002 // To avoid complications with moving code across basic blocks, only form
1003 // groups that are contained within a single basic block.
1004 unsigned SizeLimit = VAGroupSizeLimit;
1005 if (SizeLimit == 0)
1006 return {};
1007
1008 auto tryAddTo = [&](const AddrInfo &Info, MoveGroup &Move) {
1009 assert(!Move.Main.empty() && "Move group should have non-empty Main");
1010 if (Move.Main.size() >= SizeLimit) {
1011 HVC.ORE.emit(RemarkBuilder: [&]() {
1012 return OptimizationRemarkMissed(DEBUG_TYPE, "GroupSizeLimitExceeded",
1013 Info.Inst->getDebugLoc(),
1014 Info.Inst->getParent())
1015 << "alignment group exceeds size limit";
1016 });
1017 return false;
1018 }
1019 // Don't mix HVX and non-HVX instructions.
1020 if (Move.IsHvx != isHvx(AI: Info))
1021 return false;
1022 // Leading instruction in the load group.
1023 Instruction *Base = Move.Main.front();
1024 if (Base->getParent() != Info.Inst->getParent())
1025 return false;
1026 // Check if it's safe to move the load.
1027 if (!HVC.isSafeToMoveBeforeInBB(In: *Info.Inst, To: Base->getIterator())) {
1028 HVC.ORE.emit(RemarkBuilder: [&]() {
1029 return OptimizationRemarkMissed(DEBUG_TYPE, "UnsafeToRelocate",
1030 Info.Inst->getDebugLoc(),
1031 Info.Inst->getParent())
1032 << "unsafe to relocate memory access for alignment";
1033 });
1034 return false;
1035 }
1036 // And if it's safe to clone the dependencies.
1037 auto isSafeToCopyAtBase = [&](const Instruction *I) {
1038 return HVC.isSafeToMoveBeforeInBB(In: *I, To: Base->getIterator()) &&
1039 HVC.isSafeToClone(In: *I);
1040 };
1041 DepList Deps = getUpwardDeps(In: Info.Inst, Base);
1042 if (!llvm::all_of(Range&: Deps, P: isSafeToCopyAtBase))
1043 return false;
1044
1045 Move.Main.push_back(x: Info.Inst);
1046 llvm::append_range(C&: Move.Deps, R&: Deps);
1047 return true;
1048 };
1049
1050 MoveList LoadGroups;
1051
1052 for (const AddrInfo &Info : Group) {
1053 if (!Info.Inst->mayReadFromMemory())
1054 continue;
1055 if (LoadGroups.empty() || !tryAddTo(Info, LoadGroups.back()))
1056 LoadGroups.emplace_back(args: Info, args: Group.front().Inst, args: isHvx(AI: Info), args: true);
1057 }
1058
1059 // Erase groups smaller than the minimum load group size.
1060 unsigned LoadGroupSizeLimit = MinLoadGroupSizeForAlignment;
1061 erase_if(container&: LoadGroups, p: [LoadGroupSizeLimit](const MoveGroup &G) {
1062 return G.Main.size() < LoadGroupSizeLimit;
1063 });
1064
1065 // Erase HVX groups on targets < HvxV62 (due to lack of predicated loads).
1066 if (!HVC.HST.useHVXV62Ops()) {
1067 bool HadHvx =
1068 llvm::any_of(Range&: LoadGroups, P: [](const MoveGroup &G) { return G.IsHvx; });
1069 erase_if(container&: LoadGroups, p: [](const MoveGroup &G) { return G.IsHvx; });
1070 if (HadHvx) {
1071 HVC.ORE.emit(RemarkBuilder: [&]() {
1072 return OptimizationRemarkMissed(DEBUG_TYPE, "HvxVersionTooLow",
1073 HVC.F.getSubprogram(), &HVC.F.front())
1074 << "HVX version too low for predicated load operations";
1075 });
1076 }
1077 }
1078
1079 LLVM_DEBUG(dbgs() << "LoadGroups list: " << LoadGroups);
1080 return LoadGroups;
1081}
1082
1083auto AlignVectors::createStoreGroups(const AddrList &Group) const -> MoveList {
1084 // Form store groups.
1085 // To avoid complications with moving code across basic blocks, only form
1086 // groups that are contained within a single basic block.
1087 unsigned SizeLimit = VAGroupSizeLimit;
1088 if (SizeLimit == 0)
1089 return {};
1090
1091 auto tryAddTo = [&](const AddrInfo &Info, MoveGroup &Move) {
1092 assert(!Move.Main.empty() && "Move group should have non-empty Main");
1093 if (Move.Main.size() >= SizeLimit) {
1094 HVC.ORE.emit(RemarkBuilder: [&]() {
1095 return OptimizationRemarkMissed(DEBUG_TYPE, "GroupSizeLimitExceeded",
1096 Info.Inst->getDebugLoc(),
1097 Info.Inst->getParent())
1098 << "alignment group exceeds size limit";
1099 });
1100 return false;
1101 }
1102 // For stores with return values we'd have to collect downward dependencies.
1103 // There are no such stores that we handle at the moment, so omit that.
1104 assert(Info.Inst->getType()->isVoidTy() &&
1105 "Not handling stores with return values");
1106 // Don't mix HVX and non-HVX instructions.
1107 if (Move.IsHvx != isHvx(AI: Info))
1108 return false;
1109 // For stores we need to be careful whether it's safe to move them.
1110 // Stores that are otherwise safe to move together may not appear safe
1111 // to move over one another (i.e. isSafeToMoveBefore may return false).
1112 Instruction *Base = Move.Main.front();
1113 if (Base->getParent() != Info.Inst->getParent())
1114 return false;
1115 if (!HVC.isSafeToMoveBeforeInBB(In: *Info.Inst, To: Base->getIterator(),
1116 IgnoreInsts: Move.Main)) {
1117 HVC.ORE.emit(RemarkBuilder: [&]() {
1118 return OptimizationRemarkMissed(DEBUG_TYPE, "UnsafeToRelocate",
1119 Info.Inst->getDebugLoc(),
1120 Info.Inst->getParent())
1121 << "unsafe to relocate memory access for alignment";
1122 });
1123 return false;
1124 }
1125 Move.Main.push_back(x: Info.Inst);
1126 return true;
1127 };
1128
1129 MoveList StoreGroups;
1130
1131 for (auto I = Group.rbegin(), E = Group.rend(); I != E; ++I) {
1132 const AddrInfo &Info = *I;
1133 if (!Info.Inst->mayWriteToMemory())
1134 continue;
1135 if (StoreGroups.empty() || !tryAddTo(Info, StoreGroups.back()))
1136 StoreGroups.emplace_back(args: Info, args: Group.front().Inst, args: isHvx(AI: Info), args: false);
1137 }
1138
1139 // Erase singleton groups.
1140 erase_if(container&: StoreGroups, p: [](const MoveGroup &G) { return G.Main.size() <= 1; });
1141
1142 // Erase HVX groups on targets < HvxV62 (due to lack of predicated loads).
1143 if (!HVC.HST.useHVXV62Ops()) {
1144 bool HadHvx =
1145 llvm::any_of(Range&: StoreGroups, P: [](const MoveGroup &G) { return G.IsHvx; });
1146 erase_if(container&: StoreGroups, p: [](const MoveGroup &G) { return G.IsHvx; });
1147 if (HadHvx) {
1148 HVC.ORE.emit(RemarkBuilder: [&]() {
1149 return OptimizationRemarkMissed(DEBUG_TYPE, "HvxVersionTooLow",
1150 HVC.F.getSubprogram(), &HVC.F.front())
1151 << "HVX version too low for predicated store operations";
1152 });
1153 }
1154 }
1155
1156 // Erase groups where every store is a full HVX vector. The reason is that
1157 // aligning predicated stores generates complex code that may be less
1158 // efficient than a sequence of unaligned vector stores.
1159 if (!VADoFullStores) {
1160 erase_if(container&: StoreGroups, p: [this](const MoveGroup &G) {
1161 return G.IsHvx && llvm::all_of(Range: G.Main, P: [this](Instruction *S) {
1162 auto MaybeInfo = this->getAddrInfo(In&: *S);
1163 assert(MaybeInfo.has_value());
1164 return HVC.HST.isHVXVectorType(
1165 VecTy: EVT::getEVT(Ty: MaybeInfo->ValTy, HandleUnknown: false));
1166 });
1167 });
1168 }
1169
1170 return StoreGroups;
1171}
1172
1173auto AlignVectors::moveTogether(MoveGroup &Move) const -> bool {
1174 // Move all instructions to be adjacent.
1175 assert(!Move.Main.empty() && "Move group should have non-empty Main");
1176 Instruction *Where = Move.Main.front();
1177
1178 if (Move.IsLoad) {
1179 // Move all the loads (and dependencies) to where the first load is.
1180 // Clone all deps to before Where, keeping order.
1181 Move.Clones = cloneBefore(To: Where->getIterator(), Insts&: Move.Deps);
1182 // Move all main instructions to after Where, keeping order.
1183 ArrayRef<Instruction *> Main(Move.Main);
1184 for (Instruction *M : Main) {
1185 if (M != Where)
1186 M->moveAfter(MovePos: Where);
1187 for (auto [Old, New] : Move.Clones)
1188 M->replaceUsesOfWith(From: Old, To: New);
1189 Where = M;
1190 }
1191 // Replace Deps with the clones.
1192 for (int i = 0, e = Move.Deps.size(); i != e; ++i)
1193 Move.Deps[i] = Move.Clones[Move.Deps[i]];
1194 } else {
1195 // Move all the stores to where the last store is.
1196 // NOTE: Deps are empty for "store" groups. If they need to be
1197 // non-empty, decide on the order.
1198 assert(Move.Deps.empty());
1199 // Move all main instructions to before Where, inverting order.
1200 ArrayRef<Instruction *> Main(Move.Main);
1201 for (Instruction *M : Main.drop_front(N: 1)) {
1202 M->moveBefore(InsertPos: Where->getIterator());
1203 Where = M;
1204 }
1205 }
1206
1207 return Move.Main.size() + Move.Deps.size() > 1;
1208}
1209
1210template <typename T>
1211auto AlignVectors::cloneBefore(BasicBlock::iterator To, T &&Insts) const
1212 -> InstMap {
1213 InstMap Map;
1214
1215 for (Instruction *I : Insts) {
1216 assert(HVC.isSafeToClone(*I));
1217 Instruction *C = I->clone();
1218 C->setName(Twine("c.") + I->getName() + ".");
1219 C->insertBefore(InsertPos: To);
1220
1221 for (auto [Old, New] : Map)
1222 C->replaceUsesOfWith(From: Old, To: New);
1223 Map.insert(KV: std::make_pair(x&: I, y&: C));
1224 }
1225 return Map;
1226}
1227
1228auto AlignVectors::realignLoadGroup(IRBuilderBase &Builder,
1229 const ByteSpan &VSpan, int ScLen,
1230 Value *AlignVal, Value *AlignAddr) const
1231 -> void {
1232 LLVM_DEBUG(dbgs() << __func__ << "\n");
1233
1234 Type *SecTy = HVC.getByteTy(ElemCount: ScLen);
1235 int NumSectors = (VSpan.extent() + ScLen - 1) / ScLen;
1236 bool DoAlign = !HVC.isZero(Val: AlignVal);
1237 BasicBlock::iterator BasePos = Builder.GetInsertPoint();
1238 BasicBlock *BaseBlock = Builder.GetInsertBlock();
1239
1240 ByteSpan ASpan;
1241 auto *True = Constant::getAllOnesValue(Ty: HVC.getBoolTy(ElemCount: ScLen));
1242 auto *Undef = UndefValue::get(T: SecTy);
1243
1244 // Created load does not have to be "Instruction" (e.g. "undef").
1245 SmallVector<Value *> Loads(NumSectors + DoAlign, nullptr);
1246
1247 // We could create all of the aligned loads, and generate the valigns
1248 // at the location of the first load, but for large load groups, this
1249 // could create highly suboptimal code (there have been groups of 140+
1250 // loads in real code).
1251 // Instead, place the loads/valigns as close to the users as possible.
1252 // In any case we need to have a mapping from the blocks of VSpan (the
1253 // span covered by the pre-existing loads) to ASpan (the span covered
1254 // by the aligned loads). There is a small problem, though: ASpan needs
1255 // to have pointers to the loads/valigns, but we don't have these loads
1256 // because we don't know where to put them yet. We find out by creating
1257 // a section of ASpan that corresponds to values (blocks) from VSpan,
1258 // and checking where the new load should be placed. We need to attach
1259 // this location information to each block in ASpan somehow, so we put
1260 // distincts values for Seg.Val in each ASpan.Blocks[i], and use a map
1261 // to store the location for each Seg.Val.
1262 // The distinct values happen to be Blocks[i].Seg.Val = &Blocks[i],
1263 // which helps with printing ByteSpans without crashing when printing
1264 // Segments with these temporary identifiers in place of Val.
1265
1266 // Populate the blocks first, to avoid reallocations of the vector
1267 // interfering with generating the placeholder addresses.
1268 for (int Index = 0; Index != NumSectors; ++Index)
1269 ASpan.Blocks.emplace_back(args: nullptr, args&: ScLen, args: Index * ScLen);
1270 for (int Index = 0; Index != NumSectors; ++Index) {
1271 ASpan.Blocks[Index].Seg.Val =
1272 reinterpret_cast<Value *>(&ASpan.Blocks[Index]);
1273 }
1274
1275 // Multiple values from VSpan can map to the same value in ASpan. Since we
1276 // try to create loads lazily, we need to find the earliest use for each
1277 // value from ASpan.
1278 DenseMap<void *, Instruction *> EarliestUser;
1279 auto isEarlier = [](Instruction *A, Instruction *B) {
1280 if (B == nullptr)
1281 return true;
1282 if (A == nullptr)
1283 return false;
1284 assert(A->getParent() == B->getParent());
1285 return A->comesBefore(Other: B);
1286 };
1287 auto earliestUser = [&](const auto &Uses) {
1288 Instruction *User = nullptr;
1289 for (const Use &U : Uses) {
1290 auto *I = dyn_cast<Instruction>(Val: U.getUser());
1291 assert(I != nullptr && "Load used in a non-instruction?");
1292 // Make sure we only consider users in this block, but we need
1293 // to remember if there were users outside the block too. This is
1294 // because if no users are found, aligned loads will not be created.
1295 if (I->getParent() == BaseBlock) {
1296 if (!isa<PHINode>(Val: I))
1297 User = std::min(a: User, b: I, comp: isEarlier);
1298 } else {
1299 User = std::min(a: User, b: BaseBlock->getTerminator(), comp: isEarlier);
1300 }
1301 }
1302 return User;
1303 };
1304
1305 for (const ByteSpan::Block &B : VSpan) {
1306 ByteSpan ASection = ASpan.section(Start: B.Pos, Length: B.Seg.Size);
1307 for (const ByteSpan::Block &S : ASection) {
1308 auto &EU = EarliestUser[S.Seg.Val];
1309 EU = std::min(a: EU, b: earliestUser(B.Seg.Val->uses()), comp: isEarlier);
1310 }
1311 }
1312
1313 LLVM_DEBUG({
1314 dbgs() << "ASpan:\n" << ASpan << '\n';
1315 dbgs() << "Earliest users of ASpan:\n";
1316 for (auto &[Val, User] : EarliestUser) {
1317 dbgs() << Val << "\n ->" << *User << '\n';
1318 }
1319 });
1320
1321 auto createLoad = [&](IRBuilderBase &Builder, const ByteSpan &VSpan,
1322 int Index, bool MakePred) {
1323 Value *Ptr =
1324 createAdjustedPointer(Builder, Ptr: AlignAddr, ValTy: SecTy, Adjust: Index * ScLen);
1325 Value *Predicate =
1326 MakePred ? makeTestIfUnaligned(Builder, AlignVal, Alignment: ScLen) : nullptr;
1327
1328 // If vector shifting is potentially needed, accumulate metadata
1329 // from source sections of twice the load width.
1330 int Start = (Index - DoAlign) * ScLen;
1331 int Width = (1 + DoAlign) * ScLen;
1332 return this->createLoad(Builder, ValTy: SecTy, Ptr, Predicate, Alignment: ScLen, Mask: True, PassThru: Undef,
1333 MDSources: VSpan.section(Start, Length: Width).values());
1334 };
1335
1336 auto moveBefore = [this](BasicBlock::iterator In, BasicBlock::iterator To) {
1337 // Move In and its upward dependencies to before To.
1338 assert(In->getParent() == To->getParent());
1339 DepList Deps = getUpwardDeps(In: &*In, Base: &*To);
1340 In->moveBefore(InsertPos: To);
1341 // DepList is sorted with respect to positions in the basic block.
1342 InstMap Map = cloneBefore(To: In, Insts&: Deps);
1343 for (auto [Old, New] : Map)
1344 In->replaceUsesOfWith(From: Old, To: New);
1345 };
1346
1347 // Generate necessary loads at appropriate locations.
1348 LLVM_DEBUG(dbgs() << "Creating loads for ASpan sectors\n");
1349 for (int Index = 0; Index != NumSectors + 1; ++Index) {
1350 // In ASpan, each block will be either a single aligned load, or a
1351 // valign of a pair of loads. In the latter case, an aligned load j
1352 // will belong to the current valign, and the one in the previous
1353 // block (for j > 0).
1354 // Place the load at a location which will dominate the valign, assuming
1355 // the valign will be placed right before the earliest user.
1356 Instruction *PrevAt =
1357 DoAlign && Index > 0 ? EarliestUser[&ASpan[Index - 1]] : nullptr;
1358 Instruction *ThisAt =
1359 Index < NumSectors ? EarliestUser[&ASpan[Index]] : nullptr;
1360 if (auto *Where = std::min(a: PrevAt, b: ThisAt, comp: isEarlier)) {
1361 Builder.SetInsertPoint(Where);
1362 Loads[Index] =
1363 createLoad(Builder, VSpan, Index, DoAlign && Index == NumSectors);
1364 // We know it's safe to put the load at BasePos, but we'd prefer to put
1365 // it at "Where". To see if the load is safe to be placed at Where, put
1366 // it there first and then check if it's safe to move it to BasePos.
1367 // If not, then the load needs to be placed at BasePos.
1368 // We can't do this check proactively because we need the load to exist
1369 // in order to check legality.
1370 if (auto *Load = dyn_cast<Instruction>(Val: Loads[Index])) {
1371 if (!HVC.isSafeToMoveBeforeInBB(In: *Load, To: BasePos))
1372 moveBefore(Load->getIterator(), BasePos);
1373 }
1374 LLVM_DEBUG(dbgs() << "Loads[" << Index << "]:" << *Loads[Index] << '\n');
1375 }
1376 }
1377
1378 // Generate valigns if needed, and fill in proper values in ASpan
1379 LLVM_DEBUG(dbgs() << "Creating values for ASpan sectors\n");
1380 for (int Index = 0; Index != NumSectors; ++Index) {
1381 ASpan[Index].Seg.Val = nullptr;
1382 if (auto *Where = EarliestUser[&ASpan[Index]]) {
1383 Builder.SetInsertPoint(Where);
1384 Value *Val = Loads[Index];
1385 assert(Val != nullptr);
1386 if (DoAlign) {
1387 Value *NextLoad = Loads[Index + 1];
1388 assert(NextLoad != nullptr);
1389 Val = HVC.vralignb(Builder, Lo: Val, Hi: NextLoad, Amt: AlignVal);
1390 }
1391 ASpan[Index].Seg.Val = Val;
1392 LLVM_DEBUG(dbgs() << "ASpan[" << Index << "]:" << *Val << '\n');
1393 }
1394 }
1395
1396 for (const ByteSpan::Block &B : VSpan) {
1397 ByteSpan ASection = ASpan.section(Start: B.Pos, Length: B.Seg.Size).shift(Offset: -B.Pos);
1398 Value *Accum = UndefValue::get(T: HVC.getByteTy(ElemCount: B.Seg.Size));
1399 Builder.SetInsertPoint(cast<Instruction>(Val: B.Seg.Val));
1400
1401 // We're generating a reduction, where each instruction depends on
1402 // the previous one, so we need to order them according to the position
1403 // of their inputs in the code.
1404 std::vector<ByteSpan::Block *> ABlocks;
1405 for (ByteSpan::Block &S : ASection) {
1406 if (S.Seg.Val != nullptr)
1407 ABlocks.push_back(x: &S);
1408 }
1409 llvm::sort(C&: ABlocks,
1410 Comp: [&](const ByteSpan::Block *A, const ByteSpan::Block *B) {
1411 return isEarlier(cast<Instruction>(Val: A->Seg.Val),
1412 cast<Instruction>(Val: B->Seg.Val));
1413 });
1414 for (ByteSpan::Block *S : ABlocks) {
1415 // The processing of the data loaded by the aligned loads
1416 // needs to be inserted after the data is available.
1417 Instruction *SegI = cast<Instruction>(Val: S->Seg.Val);
1418 Builder.SetInsertPoint(&*std::next(x: SegI->getIterator()));
1419 Value *Pay = HVC.vbytes(Builder, Val: getPayload(Val: S->Seg.Val));
1420 Accum =
1421 HVC.insertb(Builder, Dest: Accum, Src: Pay, Start: S->Seg.Start, Length: S->Seg.Size, Where: S->Pos);
1422 }
1423 // Instead of casting everything to bytes for the vselect, cast to the
1424 // original value type. This will avoid complications with casting masks.
1425 // For example, in cases when the original mask applied to i32, it could
1426 // be converted to a mask applicable to i8 via pred_typecast intrinsic,
1427 // but if the mask is not exactly of HVX length, extra handling would be
1428 // needed to make it work.
1429 Type *ValTy = getPayload(Val: B.Seg.Val)->getType();
1430 Value *Cast = Builder.CreateBitCast(V: Accum, DestTy: ValTy, Name: "cst");
1431 Value *Sel = Builder.CreateSelect(C: getMask(Val: B.Seg.Val), True: Cast,
1432 False: getPassThrough(Val: B.Seg.Val), Name: "sel");
1433 B.Seg.Val->replaceAllUsesWith(V: Sel);
1434 }
1435}
1436
1437auto AlignVectors::realignStoreGroup(IRBuilderBase &Builder,
1438 const ByteSpan &VSpan, int ScLen,
1439 Value *AlignVal, Value *AlignAddr) const
1440 -> void {
1441 LLVM_DEBUG(dbgs() << __func__ << "\n");
1442
1443 Type *SecTy = HVC.getByteTy(ElemCount: ScLen);
1444 int NumSectors = (VSpan.extent() + ScLen - 1) / ScLen;
1445 bool DoAlign = !HVC.isZero(Val: AlignVal);
1446
1447 // Stores.
1448 ByteSpan ASpanV, ASpanM;
1449
1450 // Return a vector value corresponding to the input value Val:
1451 // either <1 x Val> for scalar Val, or Val itself for vector Val.
1452 auto MakeVec = [](IRBuilderBase &Builder, Value *Val) -> Value * {
1453 Type *Ty = Val->getType();
1454 if (Ty->isVectorTy())
1455 return Val;
1456 auto *VecTy = VectorType::get(ElementType: Ty, NumElements: 1, /*Scalable=*/false);
1457 return Builder.CreateBitCast(V: Val, DestTy: VecTy, Name: "cst");
1458 };
1459
1460 // Create an extra "undef" sector at the beginning and at the end.
1461 // They will be used as the left/right filler in the vlalign step.
1462 for (int Index = (DoAlign ? -1 : 0); Index != NumSectors + DoAlign; ++Index) {
1463 // For stores, the size of each section is an aligned vector length.
1464 // Adjust the store offsets relative to the section start offset.
1465 ByteSpan VSection =
1466 VSpan.section(Start: Index * ScLen, Length: ScLen).shift(Offset: -Index * ScLen);
1467 Value *Undef = UndefValue::get(T: SecTy);
1468 Value *Zero = Constant::getNullValue(Ty: SecTy);
1469 Value *AccumV = Undef;
1470 Value *AccumM = Zero;
1471 for (ByteSpan::Block &S : VSection) {
1472 Value *Pay = getPayload(Val: S.Seg.Val);
1473 Value *Mask = HVC.rescale(Builder, Mask: MakeVec(Builder, getMask(Val: S.Seg.Val)),
1474 FromTy: Pay->getType(), ToTy: HVC.getByteTy());
1475 Value *PartM = HVC.insertb(Builder, Dest: Zero, Src: HVC.vbytes(Builder, Val: Mask),
1476 Start: S.Seg.Start, Length: S.Seg.Size, Where: S.Pos);
1477 AccumM = Builder.CreateOr(LHS: AccumM, RHS: PartM);
1478
1479 Value *PartV = HVC.insertb(Builder, Dest: Undef, Src: HVC.vbytes(Builder, Val: Pay),
1480 Start: S.Seg.Start, Length: S.Seg.Size, Where: S.Pos);
1481
1482 AccumV = Builder.CreateSelect(
1483 C: Builder.CreateICmp(P: CmpInst::ICMP_NE, LHS: PartM, RHS: Zero), True: PartV, False: AccumV);
1484 }
1485 ASpanV.Blocks.emplace_back(args&: AccumV, args&: ScLen, args: Index * ScLen);
1486 ASpanM.Blocks.emplace_back(args&: AccumM, args&: ScLen, args: Index * ScLen);
1487 }
1488
1489 LLVM_DEBUG({
1490 dbgs() << "ASpanV before vlalign:\n" << ASpanV << '\n';
1491 dbgs() << "ASpanM before vlalign:\n" << ASpanM << '\n';
1492 });
1493
1494 // vlalign
1495 if (DoAlign) {
1496 for (int Index = 1; Index != NumSectors + 2; ++Index) {
1497 Value *PrevV = ASpanV[Index - 1].Seg.Val, *ThisV = ASpanV[Index].Seg.Val;
1498 Value *PrevM = ASpanM[Index - 1].Seg.Val, *ThisM = ASpanM[Index].Seg.Val;
1499 assert(isSectorTy(PrevV->getType()) && isSectorTy(PrevM->getType()));
1500 ASpanV[Index - 1].Seg.Val = HVC.vlalignb(Builder, Lo: PrevV, Hi: ThisV, Amt: AlignVal);
1501 ASpanM[Index - 1].Seg.Val = HVC.vlalignb(Builder, Lo: PrevM, Hi: ThisM, Amt: AlignVal);
1502 }
1503 }
1504
1505 LLVM_DEBUG({
1506 dbgs() << "ASpanV after vlalign:\n" << ASpanV << '\n';
1507 dbgs() << "ASpanM after vlalign:\n" << ASpanM << '\n';
1508 });
1509
1510 auto createStore = [&](IRBuilderBase &Builder, const ByteSpan &ASpanV,
1511 const ByteSpan &ASpanM, int Index, bool MakePred) {
1512 Value *Val = ASpanV[Index].Seg.Val;
1513 Value *Mask = ASpanM[Index].Seg.Val; // bytes
1514 if (HVC.isUndef(Val) || HVC.isZero(Val: Mask))
1515 return;
1516 Value *Ptr =
1517 createAdjustedPointer(Builder, Ptr: AlignAddr, ValTy: SecTy, Adjust: Index * ScLen);
1518 Value *Predicate =
1519 MakePred ? makeTestIfUnaligned(Builder, AlignVal, Alignment: ScLen) : nullptr;
1520
1521 // If vector shifting is potentially needed, accumulate metadata
1522 // from source sections of twice the store width.
1523 int Start = (Index - DoAlign) * ScLen;
1524 int Width = (1 + DoAlign) * ScLen;
1525 this->createStore(Builder, Val, Ptr, Predicate, Alignment: ScLen,
1526 Mask: HVC.vlsb(Builder, Val: Mask),
1527 MDSources: VSpan.section(Start, Length: Width).values());
1528 };
1529
1530 for (int Index = 0; Index != NumSectors + DoAlign; ++Index) {
1531 createStore(Builder, ASpanV, ASpanM, Index, DoAlign && Index == NumSectors);
1532 }
1533}
1534
1535auto AlignVectors::realignGroup(const MoveGroup &Move) -> bool {
1536 LLVM_DEBUG(dbgs() << "Realigning group:\n" << Move << '\n');
1537
1538 // TODO: Needs support for masked loads/stores of "scalar" vectors.
1539 if (!Move.IsHvx)
1540 return false;
1541
1542 // Return the element with the maximum alignment from Range,
1543 // where GetValue obtains the value to compare from an element.
1544 auto getMaxOf = [](auto Range, auto GetValue) {
1545 return *llvm::max_element(Range, [&GetValue](auto &A, auto &B) {
1546 return GetValue(A) < GetValue(B);
1547 });
1548 };
1549
1550 AddrList &BaseInfos = AddrGroups[Move.Base];
1551
1552 // Conceptually, there is a vector of N bytes covering the addresses
1553 // starting from the minimum offset (i.e. Base.Addr+Start). This vector
1554 // represents a contiguous memory region that spans all accessed memory
1555 // locations.
1556 // The correspondence between loaded or stored values will be expressed
1557 // in terms of this vector. For example, the 0th element of the vector
1558 // from the Base address info will start at byte Start from the beginning
1559 // of this conceptual vector.
1560 //
1561 // This vector will be loaded/stored starting at the nearest down-aligned
1562 // address and the amount of the down-alignment will be AlignVal:
1563 // valign(load_vector(align_down(Base+Start)), AlignVal)
1564
1565 std::set<Instruction *> TestSet(Move.Main.begin(), Move.Main.end());
1566 AddrList MoveInfos;
1567
1568 llvm::copy_if(
1569 Range&: BaseInfos, Out: std::back_inserter(x&: MoveInfos),
1570 P: [&TestSet](const AddrInfo &AI) { return TestSet.count(x: AI.Inst); });
1571
1572 // Maximum alignment present in the whole address group.
1573 const AddrInfo &WithMaxAlign =
1574 getMaxOf(MoveInfos, [](const AddrInfo &AI) { return AI.HaveAlign; });
1575 Align MaxGiven = WithMaxAlign.HaveAlign;
1576
  // Element with the minimum offset in the move address group.
1578 const AddrInfo &WithMinOffset =
1579 getMaxOf(MoveInfos, [](const AddrInfo &AI) { return -AI.Offset; });
1580
1581 const AddrInfo &WithMaxNeeded =
1582 getMaxOf(MoveInfos, [](const AddrInfo &AI) { return AI.NeedAlign; });
1583 Align MinNeeded = WithMaxNeeded.NeedAlign;
1584
1585 // Set the builder's insertion point right before the load group, or
1586 // immediately after the store group. (Instructions in a store group are
1587 // listed in reverse order.)
1588 Instruction *InsertAt = Move.Main.front();
1589 if (!Move.IsLoad) {
1590 // There should be a terminator (which store isn't, but check anyways).
1591 assert(InsertAt->getIterator() != InsertAt->getParent()->end());
1592 InsertAt = &*std::next(x: InsertAt->getIterator());
1593 }
1594
1595 IRBuilder Builder(InsertAt->getParent(), InsertAt->getIterator(),
1596 InstSimplifyFolder(HVC.DL));
1597 Value *AlignAddr = nullptr; // Actual aligned address.
1598 Value *AlignVal = nullptr; // Right-shift amount (for valign).
1599
1600 if (MinNeeded <= MaxGiven) {
1601 int Start = WithMinOffset.Offset;
1602 int OffAtMax = WithMaxAlign.Offset;
1603 // Shift the offset of the maximally aligned instruction (OffAtMax)
1604 // back by just enough multiples of the required alignment to cover the
1605 // distance from Start to OffAtMax.
1606 // Calculate the address adjustment amount based on the address with the
1607 // maximum alignment. This is to allow a simple gep instruction instead
1608 // of potential bitcasts to i8*.
1609 int Adjust = -alignTo(Value: OffAtMax - Start, Align: MinNeeded.value());
1610 AlignAddr = createAdjustedPointer(Builder, Ptr: WithMaxAlign.Addr,
1611 ValTy: WithMaxAlign.ValTy, Adjust, CloneMap: Move.Clones);
1612 int Diff = Start - (OffAtMax + Adjust);
1613 AlignVal = HVC.getConstInt(Val: Diff);
1614 assert(Diff >= 0);
1615 assert(static_cast<decltype(MinNeeded.value())>(Diff) < MinNeeded.value());
1616 } else {
1617 // WithMinOffset is the lowest address in the group,
1618 // WithMinOffset.Addr = Base+Start.
1619 // Align instructions for both HVX (V6_valign) and scalar (S2_valignrb)
1620 // mask off unnecessary bits, so it's ok to just use the original pointer as
1621 // the alignment amount.
1622 // Do an explicit down-alignment of the address to avoid creating an
1623 // aligned instruction with an address that is not really aligned.
1624 AlignAddr =
1625 createAlignedPointer(Builder, Ptr: WithMinOffset.Addr, ValTy: WithMinOffset.ValTy,
1626 Alignment: MinNeeded.value(), CloneMap: Move.Clones);
1627 AlignVal =
1628 Builder.CreatePtrToInt(V: WithMinOffset.Addr, DestTy: HVC.getIntTy(), Name: "pti");
1629 if (auto *I = dyn_cast<Instruction>(Val: AlignVal)) {
1630 for (auto [Old, New] : Move.Clones)
1631 I->replaceUsesOfWith(From: Old, To: New);
1632 }
1633 }
1634
1635 ByteSpan VSpan;
1636 for (const AddrInfo &AI : MoveInfos) {
1637 VSpan.Blocks.emplace_back(args: AI.Inst, args: HVC.getSizeOf(Ty: AI.ValTy),
1638 args: AI.Offset - WithMinOffset.Offset);
1639 }
1640
1641 // The aligned loads/stores will use blocks that are either scalars,
1642 // or HVX vectors. Let "sector" be the unified term for such a block.
1643 // blend(scalar, vector) -> sector...
1644 int ScLen = Move.IsHvx ? HVC.HST.getVectorLength()
1645 : std::max<int>(a: MinNeeded.value(), b: 4);
1646 assert(!Move.IsHvx || ScLen == 64 || ScLen == 128);
1647 assert(Move.IsHvx || ScLen == 4 || ScLen == 8);
1648
1649 LLVM_DEBUG({
1650 dbgs() << "ScLen: " << ScLen << "\n";
1651 dbgs() << "AlignVal:" << *AlignVal << "\n";
1652 dbgs() << "AlignAddr:" << *AlignAddr << "\n";
1653 dbgs() << "VSpan:\n" << VSpan << '\n';
1654 });
1655
1656 if (Move.IsLoad)
1657 realignLoadGroup(Builder, VSpan, ScLen, AlignVal, AlignAddr);
1658 else
1659 realignStoreGroup(Builder, VSpan, ScLen, AlignVal, AlignAddr);
1660
1661 Instruction *Front = Move.Main.front();
1662 HVC.ORE.emit(RemarkBuilder: [&]() {
1663 return OptimizationRemark(DEBUG_TYPE, "VectorsAligned",
1664 Front->getDebugLoc(), Front->getParent())
1665 << "aligned vector memory operations";
1666 });
1667
1668 for (auto *Inst : Move.Main)
1669 Inst->eraseFromParent();
1670
1671 return true;
1672}
1673
1674auto AlignVectors::makeTestIfUnaligned(IRBuilderBase &Builder, Value *AlignVal,
1675 int Alignment) const -> Value * {
1676 auto *AlignTy = AlignVal->getType();
1677 Value *And = Builder.CreateAnd(
1678 LHS: AlignVal, RHS: ConstantInt::get(Ty: AlignTy, V: Alignment - 1), Name: "and");
1679 Value *Zero = ConstantInt::get(Ty: AlignTy, V: 0);
1680 return Builder.CreateICmpNE(LHS: And, RHS: Zero, Name: "isz");
1681}
1682
1683auto AlignVectors::isSectorTy(Type *Ty) const -> bool {
1684 if (!HVC.isByteVecTy(Ty))
1685 return false;
1686 int Size = HVC.getSizeOf(Ty);
1687 if (HVC.HST.isTypeForHVX(VecTy: Ty))
1688 return Size == static_cast<int>(HVC.HST.getVectorLength());
1689 return Size == 4 || Size == 8;
1690}
1691
1692auto AlignVectors::run() -> bool {
1693 LLVM_DEBUG(dbgs() << "\nRunning HVC::AlignVectors on " << HVC.F.getName()
1694 << '\n');
1695 if (!createAddressGroups())
1696 return false;
1697
1698 LLVM_DEBUG({
1699 dbgs() << "Address groups(" << AddrGroups.size() << "):\n";
1700 for (auto &[In, AL] : AddrGroups) {
1701 for (const AddrInfo &AI : AL)
1702 dbgs() << "---\n" << AI << '\n';
1703 }
1704 });
1705
1706 bool Changed = false;
1707 MoveList LoadGroups, StoreGroups;
1708
1709 for (auto &G : AddrGroups) {
1710 llvm::append_range(C&: LoadGroups, R: createLoadGroups(Group: G.second));
1711 llvm::append_range(C&: StoreGroups, R: createStoreGroups(Group: G.second));
1712 }
1713
1714 LLVM_DEBUG({
1715 dbgs() << "\nLoad groups(" << LoadGroups.size() << "):\n";
1716 for (const MoveGroup &G : LoadGroups)
1717 dbgs() << G << "\n";
1718 dbgs() << "Store groups(" << StoreGroups.size() << "):\n";
1719 for (const MoveGroup &G : StoreGroups)
1720 dbgs() << G << "\n";
1721 });
1722
1723 // Cumulative limit on the number of groups.
1724 unsigned CountLimit = VAGroupCountLimit;
1725 if (CountLimit == 0)
1726 return false;
1727
1728 if (LoadGroups.size() > CountLimit) {
1729 LoadGroups.resize(new_size: CountLimit);
1730 StoreGroups.clear();
1731 } else {
1732 unsigned StoreLimit = CountLimit - LoadGroups.size();
1733 if (StoreGroups.size() > StoreLimit)
1734 StoreGroups.resize(new_size: StoreLimit);
1735 }
1736
1737 for (auto &M : LoadGroups)
1738 Changed |= moveTogether(Move&: M);
1739 for (auto &M : StoreGroups)
1740 Changed |= moveTogether(Move&: M);
1741
1742 LLVM_DEBUG(dbgs() << "After moveTogether:\n" << HVC.F);
1743
1744 for (auto &M : LoadGroups)
1745 Changed |= realignGroup(Move: M);
1746 for (auto &M : StoreGroups)
1747 Changed |= realignGroup(Move: M);
1748
1749 return Changed;
1750}
1751
1752// --- End AlignVectors
1753
1754// --- Begin HvxIdioms
1755
1756auto HvxIdioms::getNumSignificantBits(Value *V, Instruction *In) const
1757 -> std::pair<unsigned, Signedness> {
1758 unsigned Bits = HVC.getNumSignificantBits(V, CtxI: In);
1759 // The significant bits are calculated including the sign bit. This may
1760 // add an extra bit for zero-extended values, e.g. (zext i32 to i64) may
1761 // result in 33 significant bits. To avoid extra words, skip the extra
1762 // sign bit, but keep information that the value is to be treated as
1763 // unsigned.
1764 KnownBits Known = HVC.getKnownBits(V, CtxI: In);
1765 Signedness Sign = Signed;
1766 unsigned NumToTest = 0; // Number of bits used in test for unsignedness.
1767 if (isPowerOf2_32(Value: Bits))
1768 NumToTest = Bits;
1769 else if (Bits > 1 && isPowerOf2_32(Value: Bits - 1))
1770 NumToTest = Bits - 1;
1771
1772 if (NumToTest != 0 && Known.Zero.ashr(ShiftAmt: NumToTest).isAllOnes()) {
1773 Sign = Unsigned;
1774 Bits = NumToTest;
1775 }
1776
1777 // If the top bit of the nearest power-of-2 is zero, this value is
1778 // positive. It could be treated as either signed or unsigned.
1779 if (unsigned Pow2 = PowerOf2Ceil(A: Bits); Pow2 != Bits) {
1780 if (Known.Zero.ashr(ShiftAmt: Pow2 - 1).isAllOnes())
1781 Sign = Positive;
1782 }
1783 return {Bits, Sign};
1784}
1785
1786auto HvxIdioms::canonSgn(SValue X, SValue Y) const
1787 -> std::pair<SValue, SValue> {
1788 // Canonicalize the signedness of X and Y, so that the result is one of:
1789 // S, S
1790 // U/P, S
1791 // U/P, U/P
1792 if (X.Sgn == Signed && Y.Sgn != Signed)
1793 std::swap(a&: X, b&: Y);
1794 return {X, Y};
1795}
1796
1797// Match
1798// (X * Y) [>> N], or
1799// ((X * Y) + (1 << M)) >> N
1800auto HvxIdioms::matchFxpMul(Instruction &In) const -> std::optional<FxpOp> {
1801 using namespace PatternMatch;
1802 auto *Ty = In.getType();
1803
1804 if (!Ty->isVectorTy() || !Ty->getScalarType()->isIntegerTy())
1805 return std::nullopt;
1806
1807 unsigned Width = cast<IntegerType>(Val: Ty->getScalarType())->getBitWidth();
1808
1809 FxpOp Op;
1810 Value *Exp = &In;
1811
1812 // Fixed-point multiplication is always shifted right (except when the
1813 // fraction is 0 bits).
1814 auto m_Shr = [](auto &&V, auto &&S) {
1815 return m_CombineOr(m_LShr(V, S), m_AShr(V, S));
1816 };
1817
1818 uint64_t Qn = 0;
1819 if (Value *T; match(V: Exp, P: m_Shr(m_Value(V&: T), m_ConstantInt(V&: Qn)))) {
1820 Op.Frac = Qn;
1821 Exp = T;
1822 } else {
1823 Op.Frac = 0;
1824 }
1825
1826 if (Op.Frac > Width)
1827 return std::nullopt;
1828
1829 // Check if there is rounding added.
1830 uint64_t CV;
1831 if (Value *T;
1832 Op.Frac > 0 && match(V: Exp, P: m_Add(L: m_Value(V&: T), R: m_ConstantInt(V&: CV)))) {
1833 if (CV != 0 && !isPowerOf2_64(Value: CV))
1834 return std::nullopt;
1835 if (CV != 0)
1836 Op.RoundAt = Log2_64(Value: CV);
1837 Exp = T;
1838 }
1839
1840 // Check if the rest is a multiplication.
1841 if (match(V: Exp, P: m_Mul(L: m_Value(V&: Op.X.Val), R: m_Value(V&: Op.Y.Val)))) {
1842 Op.Opcode = Instruction::Mul;
1843 // FIXME: The information below is recomputed.
1844 Op.X.Sgn = getNumSignificantBits(V: Op.X.Val, In: &In).second;
1845 Op.Y.Sgn = getNumSignificantBits(V: Op.Y.Val, In: &In).second;
1846 Op.ResTy = cast<VectorType>(Val: Ty);
1847 return Op;
1848 }
1849
1850 return std::nullopt;
1851}
1852
1853auto HvxIdioms::processFxpMul(Instruction &In, const FxpOp &Op) const
1854 -> Value * {
1855 assert(Op.X.Val->getType() == Op.Y.Val->getType());
1856
1857 auto *VecTy = dyn_cast<VectorType>(Val: Op.X.Val->getType());
1858 if (VecTy == nullptr)
1859 return nullptr;
1860 auto *ElemTy = cast<IntegerType>(Val: VecTy->getElementType());
1861 unsigned ElemWidth = ElemTy->getBitWidth();
1862
1863 // TODO: This can be relaxed after legalization is done pre-isel.
1864 if ((HVC.length(Ty: VecTy) * ElemWidth) % (8 * HVC.HST.getVectorLength()) != 0)
1865 return nullptr;
1866
1867 // There are no special intrinsics that should be used for multiplying
1868 // signed 8-bit values, so just skip them. Normal codegen should handle
1869 // this just fine.
1870 if (ElemWidth <= 8)
1871 return nullptr;
1872 // Similarly, if this is just a multiplication that can be handled without
1873 // intervention, then leave it alone.
1874 if (ElemWidth <= 32 && Op.Frac == 0)
1875 return nullptr;
1876
1877 auto [BitsX, SignX] = getNumSignificantBits(V: Op.X.Val, In: &In);
1878 auto [BitsY, SignY] = getNumSignificantBits(V: Op.Y.Val, In: &In);
1879
1880 // TODO: Add multiplication of vectors by scalar registers (up to 4 bytes).
1881
1882 Value *X = Op.X.Val, *Y = Op.Y.Val;
1883 IRBuilder Builder(In.getParent(), In.getIterator(),
1884 InstSimplifyFolder(HVC.DL));
1885
1886 auto roundUpWidth = [](unsigned Width) -> unsigned {
1887 if (Width <= 32 && !isPowerOf2_32(Value: Width)) {
1888 // If the element width is not a power of 2, round it up
1889 // to the next one. Do this for widths not exceeding 32.
1890 return PowerOf2Ceil(A: Width);
1891 }
1892 if (Width > 32 && Width % 32 != 0) {
1893 // For wider elements, round it up to the multiple of 32.
1894 return alignTo(Value: Width, Align: 32u);
1895 }
1896 return Width;
1897 };
1898
1899 BitsX = roundUpWidth(BitsX);
1900 BitsY = roundUpWidth(BitsY);
1901
1902 // For elementwise multiplication vectors must have the same lengths, so
1903 // resize the elements of both inputs to the same width, the max of the
1904 // calculated significant bits.
1905 unsigned Width = std::max(a: BitsX, b: BitsY);
1906
1907 auto *ResizeTy = VectorType::get(ElementType: HVC.getIntTy(Width), Other: VecTy);
1908 if (Width < ElemWidth) {
1909 X = Builder.CreateTrunc(V: X, DestTy: ResizeTy, Name: "trn");
1910 Y = Builder.CreateTrunc(V: Y, DestTy: ResizeTy, Name: "trn");
1911 } else if (Width > ElemWidth) {
1912 X = SignX == Signed ? Builder.CreateSExt(V: X, DestTy: ResizeTy, Name: "sxt")
1913 : Builder.CreateZExt(V: X, DestTy: ResizeTy, Name: "zxt");
1914 Y = SignY == Signed ? Builder.CreateSExt(V: Y, DestTy: ResizeTy, Name: "sxt")
1915 : Builder.CreateZExt(V: Y, DestTy: ResizeTy, Name: "zxt");
1916 };
1917
1918 assert(X->getType() == Y->getType() && X->getType() == ResizeTy);
1919
1920 unsigned VecLen = HVC.length(Ty: ResizeTy);
1921 unsigned ChopLen = (8 * HVC.HST.getVectorLength()) / std::min(a: Width, b: 32u);
1922
1923 SmallVector<Value *> Results;
1924 FxpOp ChopOp = Op;
1925 ChopOp.ResTy = VectorType::get(ElementType: Op.ResTy->getElementType(), NumElements: ChopLen, Scalable: false);
1926
1927 for (unsigned V = 0; V != VecLen / ChopLen; ++V) {
1928 ChopOp.X.Val = HVC.subvector(Builder, Val: X, Start: V * ChopLen, Length: ChopLen);
1929 ChopOp.Y.Val = HVC.subvector(Builder, Val: Y, Start: V * ChopLen, Length: ChopLen);
1930 Results.push_back(Elt: processFxpMulChopped(Builder, In, Op: ChopOp));
1931 if (Results.back() == nullptr)
1932 break;
1933 }
1934
1935 if (Results.empty() || Results.back() == nullptr)
1936 return nullptr;
1937
1938 Value *Cat = HVC.concat(Builder, Vecs: Results);
1939 Value *Ext = SignX == Signed || SignY == Signed
1940 ? Builder.CreateSExt(V: Cat, DestTy: VecTy, Name: "sxt")
1941 : Builder.CreateZExt(V: Cat, DestTy: VecTy, Name: "zxt");
1942 return Ext;
1943}
1944
1945inline bool HvxIdioms::matchScatter(Instruction &In) const {
1946 IntrinsicInst *II = dyn_cast<IntrinsicInst>(Val: &In);
1947 if (!II)
1948 return false;
1949 return (II->getIntrinsicID() == Intrinsic::masked_scatter);
1950}
1951
1952inline bool HvxIdioms::matchGather(Instruction &In) const {
1953 IntrinsicInst *II = dyn_cast<IntrinsicInst>(Val: &In);
1954 if (!II)
1955 return false;
1956 return (II->getIntrinsicID() == Intrinsic::masked_gather);
1957}
1958
1959inline bool HvxIdioms::matchMLoad(Instruction &In) const {
1960 IntrinsicInst *II = dyn_cast<IntrinsicInst>(Val: &In);
1961 if (!II)
1962 return false;
1963 return (II->getIntrinsicID() == Intrinsic::masked_load);
1964}
1965
1966inline bool HvxIdioms::matchMStore(Instruction &In) const {
1967 IntrinsicInst *II = dyn_cast<IntrinsicInst>(Val: &In);
1968 if (!II)
1969 return false;
1970 return (II->getIntrinsicID() == Intrinsic::masked_store);
1971}
1972
1973Instruction *locateDestination(Instruction *In, HvxIdioms::DstQualifier &Qual);
1974
1975// Binary instructions we want to handle as users of gather/scatter.
1976inline bool isArithmetic(unsigned Opc) {
1977 switch (Opc) {
1978 case Instruction::Add:
1979 case Instruction::Sub:
1980 case Instruction::Mul:
1981 case Instruction::And:
1982 case Instruction::Or:
1983 case Instruction::Xor:
1984 case Instruction::AShr:
1985 case Instruction::LShr:
1986 case Instruction::Shl:
1987 case Instruction::UDiv:
1988 return true;
1989 }
1990 return false;
1991}
1992
1993// TODO: Maybe use MemoryLocation for this. See getLocOrNone above.
1994inline Value *getPointer(Value *Ptr) {
1995 assert(Ptr && "Unable to extract pointer");
1996 if (isa<AllocaInst>(Val: Ptr) || isa<Argument>(Val: Ptr) || isa<GlobalValue>(Val: Ptr))
1997 return Ptr;
1998 if (isa<LoadInst>(Val: Ptr) || isa<StoreInst>(Val: Ptr))
1999 return getLoadStorePointerOperand(V: Ptr);
2000 if (IntrinsicInst *II = dyn_cast<IntrinsicInst>(Val: Ptr)) {
2001 if (II->getIntrinsicID() == Intrinsic::masked_store)
2002 return II->getOperand(i_nocapture: 1);
2003 }
2004 return nullptr;
2005}
2006
2007static Instruction *selectDestination(Instruction *In,
2008 HvxIdioms::DstQualifier &Qual) {
2009 Instruction *Destination = nullptr;
2010 if (!In)
2011 return Destination;
2012 if (isa<StoreInst>(Val: In)) {
2013 Destination = In;
2014 Qual = HvxIdioms::LdSt;
2015 } else if (IntrinsicInst *II = dyn_cast<IntrinsicInst>(Val: In)) {
2016 if (II->getIntrinsicID() == Intrinsic::masked_gather) {
2017 Destination = In;
2018 Qual = HvxIdioms::LLVM_Gather;
2019 } else if (II->getIntrinsicID() == Intrinsic::masked_scatter) {
2020 Destination = In;
2021 Qual = HvxIdioms::LLVM_Scatter;
2022 } else if (II->getIntrinsicID() == Intrinsic::masked_store) {
2023 Destination = In;
2024 Qual = HvxIdioms::LdSt;
2025 } else if (II->getIntrinsicID() ==
2026 Intrinsic::hexagon_V6_vgather_vscattermh) {
2027 Destination = In;
2028 Qual = HvxIdioms::HEX_Gather_Scatter;
2029 } else if (II->getIntrinsicID() == Intrinsic::hexagon_V6_vscattermh_128B) {
2030 Destination = In;
2031 Qual = HvxIdioms::HEX_Scatter;
2032 } else if (II->getIntrinsicID() == Intrinsic::hexagon_V6_vgathermh_128B) {
2033 Destination = In;
2034 Qual = HvxIdioms::HEX_Gather;
2035 }
2036 } else if (isa<ZExtInst>(Val: In)) {
2037 return locateDestination(In, Qual);
2038 } else if (isa<CastInst>(Val: In)) {
2039 return locateDestination(In, Qual);
2040 } else if (isa<CallInst>(Val: In)) {
2041 Destination = In;
2042 Qual = HvxIdioms::Call;
2043 } else if (isa<GetElementPtrInst>(Val: In)) {
2044 return locateDestination(In, Qual);
2045 } else if (isArithmetic(Opc: In->getOpcode())) {
2046 Destination = In;
2047 Qual = HvxIdioms::Arithmetic;
2048 } else {
2049 LLVM_DEBUG(dbgs() << "Unhandled destination : " << *In << "\n");
2050 }
2051 return Destination;
2052}
2053
2054// This method attempts to find destination (user) for a given intrinsic.
2055// Given that these are produced only by Ripple, the number of options is
2056// limited. Simplest case is explicit store which in fact is redundant (since
2057// HVX gater creates its own store during packetization). Nevertheless we need
2058// to figure address where we storing. Other cases are more complicated, but
2059// still few.
2060Instruction *locateDestination(Instruction *In, HvxIdioms::DstQualifier &Qual) {
2061 Instruction *Destination = nullptr;
2062 if (!In)
2063 return Destination;
2064 // Get all possible destinations
2065 SmallVector<Instruction *> Users;
2066 // Iterate over the uses of the instruction
2067 for (auto &U : In->uses()) {
2068 if (auto *UI = dyn_cast<Instruction>(Val: U.getUser())) {
2069 Destination = selectDestination(In: UI, Qual);
2070 if (Destination)
2071 Users.push_back(Elt: Destination);
2072 }
2073 }
2074 // Now see which of the users (if any) is a memory destination.
2075 for (auto *I : Users)
2076 if (getPointer(Ptr: I))
2077 return I;
2078 return Destination;
2079}
2080
2081// The two intrinsics we handle here have GEP in a different position.
2082inline GetElementPtrInst *locateGepFromIntrinsic(Instruction *In) {
2083 assert(In && "Bad instruction");
2084 IntrinsicInst *IIn = dyn_cast<IntrinsicInst>(Val: In);
2085 assert((IIn && (IIn->getIntrinsicID() == Intrinsic::masked_gather ||
2086 IIn->getIntrinsicID() == Intrinsic::masked_scatter)) &&
2087 "Not a gather Intrinsic");
2088 GetElementPtrInst *GEPIndex = nullptr;
2089 if (IIn->getIntrinsicID() == Intrinsic::masked_gather)
2090 GEPIndex = dyn_cast<GetElementPtrInst>(Val: IIn->getOperand(i_nocapture: 0));
2091 else
2092 GEPIndex = dyn_cast<GetElementPtrInst>(Val: IIn->getOperand(i_nocapture: 1));
2093 return GEPIndex;
2094}
2095
2096// Given the intrinsic find its GEP argument and extract base address it uses.
2097// The method relies on the way how Ripple typically forms the GEP for
2098// scatter/gather.
2099static Value *locateAddressFromIntrinsic(Instruction *In) {
2100 GetElementPtrInst *GEPIndex = locateGepFromIntrinsic(In);
2101 if (!GEPIndex) {
2102 LLVM_DEBUG(dbgs() << " No GEP in intrinsic\n");
2103 return nullptr;
2104 }
2105 Value *BaseAddress = GEPIndex->getPointerOperand();
2106 auto *IndexLoad = dyn_cast<LoadInst>(Val: BaseAddress);
2107 if (IndexLoad)
2108 return IndexLoad;
2109
2110 auto *IndexZEx = dyn_cast<ZExtInst>(Val: BaseAddress);
2111 if (IndexZEx) {
2112 IndexLoad = dyn_cast<LoadInst>(Val: IndexZEx->getOperand(i_nocapture: 0));
2113 if (IndexLoad)
2114 return IndexLoad;
2115 IntrinsicInst *II = dyn_cast<IntrinsicInst>(Val: IndexZEx->getOperand(i_nocapture: 0));
2116 if (II && II->getIntrinsicID() == Intrinsic::masked_gather)
2117 return locateAddressFromIntrinsic(In: II);
2118 }
2119 auto *BaseShuffle = dyn_cast<ShuffleVectorInst>(Val: BaseAddress);
2120 if (BaseShuffle) {
2121 IndexLoad = dyn_cast<LoadInst>(Val: BaseShuffle->getOperand(i_nocapture: 0));
2122 if (IndexLoad)
2123 return IndexLoad;
2124 auto *IE = dyn_cast<InsertElementInst>(Val: BaseShuffle->getOperand(i_nocapture: 0));
2125 if (IE) {
2126 auto *Src = IE->getOperand(i_nocapture: 1);
2127 IndexLoad = dyn_cast<LoadInst>(Val: Src);
2128 if (IndexLoad)
2129 return IndexLoad;
2130 auto *Alloca = dyn_cast<AllocaInst>(Val: Src);
2131 if (Alloca)
2132 return Alloca;
2133 if (isa<Argument>(Val: Src)) {
2134 return Src;
2135 }
2136 if (isa<GlobalValue>(Val: Src)) {
2137 return Src;
2138 }
2139 }
2140 }
2141 LLVM_DEBUG(dbgs() << " Unable to locate Address from intrinsic\n");
2142 return nullptr;
2143}
2144
2145static Type *getIndexType(Value *In) {
2146 if (!In)
2147 return nullptr;
2148
2149 if (isa<LoadInst>(Val: In) || isa<StoreInst>(Val: In))
2150 return getLoadStoreType(I: In);
2151
2152 if (IntrinsicInst *II = dyn_cast<IntrinsicInst>(Val: In)) {
2153 if (II->getIntrinsicID() == Intrinsic::masked_load)
2154 return II->getType();
2155 if (II->getIntrinsicID() == Intrinsic::masked_store)
2156 return II->getOperand(i_nocapture: 0)->getType();
2157 }
2158 return In->getType();
2159}
2160
2161static Value *locateIndexesFromGEP(Value *In) {
2162 if (!In)
2163 return nullptr;
2164 if (isa<LoadInst>(Val: In))
2165 return In;
2166 if (IntrinsicInst *II = dyn_cast<IntrinsicInst>(Val: In)) {
2167 if (II->getIntrinsicID() == Intrinsic::masked_load)
2168 return In;
2169 if (II->getIntrinsicID() == Intrinsic::masked_gather)
2170 return In;
2171 }
2172 if (auto *IndexZEx = dyn_cast<ZExtInst>(Val: In))
2173 return locateIndexesFromGEP(In: IndexZEx->getOperand(i_nocapture: 0));
2174 if (auto *IndexSEx = dyn_cast<SExtInst>(Val: In))
2175 return locateIndexesFromGEP(In: IndexSEx->getOperand(i_nocapture: 0));
2176 if (auto *BaseShuffle = dyn_cast<ShuffleVectorInst>(Val: In))
2177 return locateIndexesFromGEP(In: BaseShuffle->getOperand(i_nocapture: 0));
2178 if (auto *IE = dyn_cast<InsertElementInst>(Val: In))
2179 return locateIndexesFromGEP(In: IE->getOperand(i_nocapture: 1));
2180 if (auto *cstDataVector = dyn_cast<ConstantDataVector>(Val: In))
2181 return cstDataVector;
2182 if (auto *GEPIndex = dyn_cast<GetElementPtrInst>(Val: In))
2183 return GEPIndex->getOperand(i_nocapture: 0);
2184 return nullptr;
2185}
2186
2187// Given the intrinsic find its GEP argument and extract offsetts from the base
2188// address it uses.
2189static Value *locateIndexesFromIntrinsic(Instruction *In) {
2190 GetElementPtrInst *GEPIndex = locateGepFromIntrinsic(In);
2191 if (!GEPIndex) {
2192 LLVM_DEBUG(dbgs() << " No GEP in intrinsic\n");
2193 return nullptr;
2194 }
2195 Value *Indexes = GEPIndex->getOperand(i_nocapture: 1);
2196 if (auto *IndexLoad = locateIndexesFromGEP(In: Indexes))
2197 return IndexLoad;
2198
2199 LLVM_DEBUG(dbgs() << " Unable to locate Index from intrinsic\n");
2200 return nullptr;
2201}
2202
2203// Because of aukward definition of many Hex intrinsics we often have to
2204// reinterprete HVX native <64 x i16> as <32 x i32> which in practice is a NOP
2205// for all use cases, so this only exist to make IR builder happy.
2206inline Value *getReinterpretiveCast_i16_to_i32(const HexagonVectorCombine &HVC,
2207 IRBuilderBase &Builder,
2208 LLVMContext &Ctx, Value *I) {
2209 assert(I && "Unable to reinterprete cast");
2210 Type *NT = HVC.getHvxTy(ElemTy: HVC.getIntTy(Width: 32), Pair: false);
2211 std::vector<unsigned> shuffleMask;
2212 for (unsigned i = 0; i < 64; ++i)
2213 shuffleMask.push_back(x: i);
2214 Constant *Mask = llvm::ConstantDataVector::get(Context&: Ctx, Elts: shuffleMask);
2215 Value *CastShuffle =
2216 Builder.CreateShuffleVector(V1: I, V2: I, Mask, Name: "identity_shuffle");
2217 return Builder.CreateBitCast(V: CastShuffle, DestTy: NT, Name: "cst64_i16_to_32_i32");
2218}
2219
2220// Recast <128 x i8> as <32 x i32>
2221inline Value *getReinterpretiveCast_i8_to_i32(const HexagonVectorCombine &HVC,
2222 IRBuilderBase &Builder,
2223 LLVMContext &Ctx, Value *I) {
2224 assert(I && "Unable to reinterprete cast");
2225 Type *NT = HVC.getHvxTy(ElemTy: HVC.getIntTy(Width: 32), Pair: false);
2226 std::vector<unsigned> shuffleMask;
2227 for (unsigned i = 0; i < 128; ++i)
2228 shuffleMask.push_back(x: i);
2229 Constant *Mask = llvm::ConstantDataVector::get(Context&: Ctx, Elts: shuffleMask);
2230 Value *CastShuffle =
2231 Builder.CreateShuffleVector(V1: I, V2: I, Mask, Name: "identity_shuffle");
2232 return Builder.CreateBitCast(V: CastShuffle, DestTy: NT, Name: "cst128_i8_to_32_i32");
2233}
2234
2235// Create <32 x i32> mask reinterpreted as <128 x i1> with a given pattern
2236inline Value *get_i32_Mask(const HexagonVectorCombine &HVC,
2237 IRBuilderBase &Builder, LLVMContext &Ctx,
2238 unsigned int pattern) {
2239 std::vector<unsigned int> byteMask;
2240 for (unsigned i = 0; i < 32; ++i)
2241 byteMask.push_back(x: pattern);
2242
2243 return Builder.CreateIntrinsic(
2244 RetTy: HVC.getBoolTy(ElemCount: 128), ID: HVC.HST.getIntrinsicId(Opc: Hexagon::V6_vandvrt),
2245 Args: {llvm::ConstantDataVector::get(Context&: Ctx, Elts: byteMask), HVC.getConstInt(Val: ~0)},
2246 FMFSource: nullptr);
2247}
2248
2249Value *HvxIdioms::processVScatter(Instruction &In) const {
2250 auto *InpTy = dyn_cast<VectorType>(Val: In.getOperand(i: 0)->getType());
2251 assert(InpTy && "Cannot handle no vector type for llvm.scatter/gather");
2252 unsigned InpSize = HVC.getSizeOf(Ty: InpTy);
2253 auto *F = In.getFunction();
2254 LLVMContext &Ctx = F->getContext();
2255 auto *ElemTy = dyn_cast<IntegerType>(Val: InpTy->getElementType());
2256 assert(ElemTy && "llvm.scatter needs integer type argument");
2257 unsigned ElemWidth = HVC.DL.getTypeAllocSize(Ty: ElemTy);
2258 LLVM_DEBUG({
2259 unsigned Elements = HVC.length(InpTy);
2260 dbgs() << "\n[Process scatter](" << In << ")\n" << *In.getParent() << "\n";
2261 dbgs() << " Input type(" << *InpTy << ") elements(" << Elements
2262 << ") VecLen(" << InpSize << ") type(" << *ElemTy << ") ElemWidth("
2263 << ElemWidth << ")\n";
2264 });
2265
2266 IRBuilder Builder(In.getParent(), In.getIterator(),
2267 InstSimplifyFolder(HVC.DL));
2268
2269 auto *ValueToScatter = In.getOperand(i: 0);
2270 LLVM_DEBUG(dbgs() << " ValueToScatter : " << *ValueToScatter << "\n");
2271
2272 if (HVC.HST.getVectorLength() != InpSize) {
2273 LLVM_DEBUG(dbgs() << "Unhandled vector size(" << InpSize
2274 << ") for vscatter\n");
2275 return nullptr;
2276 }
2277
2278 // Base address of indexes.
2279 auto *IndexLoad = locateAddressFromIntrinsic(In: &In);
2280 if (!IndexLoad)
2281 return nullptr;
2282 LLVM_DEBUG(dbgs() << " IndexLoad : " << *IndexLoad << "\n");
2283
2284 // Address of destination. Must be in VTCM.
2285 auto *Ptr = getPointer(Ptr: IndexLoad);
2286 if (!Ptr)
2287 return nullptr;
2288 LLVM_DEBUG(dbgs() << " Ptr : " << *Ptr << "\n");
2289 // Indexes/offsets
2290 auto *Indexes = locateIndexesFromIntrinsic(In: &In);
2291 if (!Indexes)
2292 return nullptr;
2293 LLVM_DEBUG(dbgs() << " Indexes : " << *Indexes << "\n");
2294 Value *CastedDst = Builder.CreateBitOrPointerCast(V: Ptr, DestTy: Type::getInt32Ty(C&: Ctx),
2295 Name: "cst_ptr_to_i32");
2296 LLVM_DEBUG(dbgs() << " CastedDst : " << *CastedDst << "\n");
2297 // Adjust Indexes
2298 auto *cstDataVector = dyn_cast<ConstantDataVector>(Val: Indexes);
2299 Value *CastIndex = nullptr;
2300 if (cstDataVector) {
2301 // Our indexes are represented as a constant. We need it in a reg.
2302 Type *IndexVectorType = HVC.getHvxTy(ElemTy: HVC.getIntTy(Width: 32), Pair: false);
2303 AllocaInst *IndexesAlloca = Builder.CreateAlloca(Ty: IndexVectorType);
2304 [[maybe_unused]] auto *StoreIndexes =
2305 Builder.CreateStore(Val: cstDataVector, Ptr: IndexesAlloca);
2306 LLVM_DEBUG(dbgs() << " StoreIndexes : " << *StoreIndexes << "\n");
2307 CastIndex =
2308 Builder.CreateLoad(Ty: IndexVectorType, Ptr: IndexesAlloca, Name: "reload_index");
2309 } else {
2310 if (ElemWidth == 2)
2311 CastIndex = getReinterpretiveCast_i16_to_i32(HVC, Builder, Ctx, I: Indexes);
2312 else
2313 CastIndex = Indexes;
2314 }
2315 LLVM_DEBUG(dbgs() << " Cast index : " << *CastIndex << ")\n");
2316
2317 if (ElemWidth == 1) {
2318 // v128i8 There is no native instruction for this.
2319 // Do this as two Hi/Lo gathers with masking.
2320 Type *NT = HVC.getHvxTy(ElemTy: HVC.getIntTy(Width: 32), Pair: false);
2321 // Extend indexes. We assume that indexes are in 128i8 format - need to
2322 // expand them to Hi/Lo 64i16
2323 Value *CastIndexes = Builder.CreateBitCast(V: CastIndex, DestTy: NT, Name: "cast_to_32i32");
2324 auto V6_vunpack = HVC.HST.getIntrinsicId(Opc: Hexagon::V6_vunpackub);
2325 auto *UnpackedIndexes = Builder.CreateIntrinsic(
2326 RetTy: HVC.getHvxTy(ElemTy: HVC.getIntTy(Width: 32), Pair: true), ID: V6_vunpack, Args: CastIndexes, FMFSource: nullptr);
2327 LLVM_DEBUG(dbgs() << " UnpackedIndexes : " << *UnpackedIndexes << ")\n");
2328
2329 auto V6_hi = HVC.HST.getIntrinsicId(Opc: Hexagon::V6_hi);
2330 auto V6_lo = HVC.HST.getIntrinsicId(Opc: Hexagon::V6_lo);
2331 [[maybe_unused]] Value *IndexHi =
2332 HVC.createHvxIntrinsic(Builder, IntID: V6_hi, RetTy: NT, Args: UnpackedIndexes);
2333 [[maybe_unused]] Value *IndexLo =
2334 HVC.createHvxIntrinsic(Builder, IntID: V6_lo, RetTy: NT, Args: UnpackedIndexes);
2335 LLVM_DEBUG(dbgs() << " UnpackedIndHi : " << *IndexHi << ")\n");
2336 LLVM_DEBUG(dbgs() << " UnpackedIndLo : " << *IndexLo << ")\n");
2337 // Now unpack values to scatter
2338 Value *CastSrc =
2339 getReinterpretiveCast_i8_to_i32(HVC, Builder, Ctx, I: ValueToScatter);
2340 LLVM_DEBUG(dbgs() << " CastSrc : " << *CastSrc << ")\n");
2341 auto *UnpackedValueToScatter = Builder.CreateIntrinsic(
2342 RetTy: HVC.getHvxTy(ElemTy: HVC.getIntTy(Width: 32), Pair: true), ID: V6_vunpack, Args: CastSrc, FMFSource: nullptr);
2343 LLVM_DEBUG(dbgs() << " UnpackedValToScat: " << *UnpackedValueToScatter
2344 << ")\n");
2345
2346 [[maybe_unused]] Value *UVSHi =
2347 HVC.createHvxIntrinsic(Builder, IntID: V6_hi, RetTy: NT, Args: UnpackedValueToScatter);
2348 [[maybe_unused]] Value *UVSLo =
2349 HVC.createHvxIntrinsic(Builder, IntID: V6_lo, RetTy: NT, Args: UnpackedValueToScatter);
2350 LLVM_DEBUG(dbgs() << " UVSHi : " << *UVSHi << ")\n");
2351 LLVM_DEBUG(dbgs() << " UVSLo : " << *UVSLo << ")\n");
2352
2353 // Create the mask for individual bytes
2354 auto *QByteMask = get_i32_Mask(HVC, Builder, Ctx, pattern: 0x00ff00ff);
2355 LLVM_DEBUG(dbgs() << " QByteMask : " << *QByteMask << "\n");
2356 [[maybe_unused]] auto *ResHi = Builder.CreateIntrinsic(
2357 RetTy: Type::getVoidTy(C&: Ctx), ID: Intrinsic::hexagon_V6_vscattermhq_128B,
2358 Args: {QByteMask, CastedDst, HVC.getConstInt(DEFAULT_HVX_VTCM_PAGE_SIZE),
2359 IndexHi, UVSHi},
2360 FMFSource: nullptr);
2361 LLVM_DEBUG(dbgs() << " ResHi : " << *ResHi << ")\n");
2362 return Builder.CreateIntrinsic(
2363 RetTy: Type::getVoidTy(C&: Ctx), ID: Intrinsic::hexagon_V6_vscattermhq_128B,
2364 Args: {QByteMask, CastedDst, HVC.getConstInt(DEFAULT_HVX_VTCM_PAGE_SIZE),
2365 IndexLo, UVSLo},
2366 FMFSource: nullptr);
2367 } else if (ElemWidth == 2) {
2368 Value *CastSrc =
2369 getReinterpretiveCast_i16_to_i32(HVC, Builder, Ctx, I: ValueToScatter);
2370 LLVM_DEBUG(dbgs() << " CastSrc : " << *CastSrc << ")\n");
2371 return Builder.CreateIntrinsic(
2372 RetTy: Type::getVoidTy(C&: Ctx), ID: Intrinsic::hexagon_V6_vscattermh_128B,
2373 Args: {CastedDst, HVC.getConstInt(DEFAULT_HVX_VTCM_PAGE_SIZE), CastIndex,
2374 CastSrc},
2375 FMFSource: nullptr);
2376 } else if (ElemWidth == 4) {
2377 return Builder.CreateIntrinsic(
2378 RetTy: Type::getVoidTy(C&: Ctx), ID: Intrinsic::hexagon_V6_vscattermw_128B,
2379 Args: {CastedDst, HVC.getConstInt(DEFAULT_HVX_VTCM_PAGE_SIZE), CastIndex,
2380 ValueToScatter},
2381 FMFSource: nullptr);
2382 } else {
2383 LLVM_DEBUG(dbgs() << "Unhandled element type for vscatter\n");
2384 return nullptr;
2385 }
2386}
2387
2388Value *HvxIdioms::processVGather(Instruction &In) const {
2389 [[maybe_unused]] auto *InpTy =
2390 dyn_cast<VectorType>(Val: In.getOperand(i: 0)->getType());
2391 assert(InpTy && "Cannot handle no vector type for llvm.gather");
2392 [[maybe_unused]] auto *ElemTy =
2393 dyn_cast<PointerType>(Val: InpTy->getElementType());
2394 assert(ElemTy && "llvm.gather needs vector of ptr argument");
2395 auto *F = In.getFunction();
2396 LLVMContext &Ctx = F->getContext();
2397 LLVM_DEBUG(dbgs() << "\n[Process gather](" << In << ")\n"
2398 << *In.getParent() << "\n");
2399 LLVM_DEBUG(dbgs() << " Input type(" << *InpTy << ") elements("
2400 << HVC.length(InpTy) << ") VecLen(" << HVC.getSizeOf(InpTy)
2401 << ") type(" << *ElemTy << ") Access alignment("
2402 << *In.getOperand(1) << ") AddressSpace("
2403 << ElemTy->getAddressSpace() << ")\n");
2404
2405 // TODO: Handle masking of elements.
2406 assert(dyn_cast<VectorType>(In.getOperand(2)->getType()) &&
2407 "llvm.gather needs vector for mask");
2408 IRBuilder Builder(In.getParent(), In.getIterator(),
2409 InstSimplifyFolder(HVC.DL));
2410
2411 // See who is using the result. The difference between LLVM and HVX vgather
2412 // Intrinsic makes it impossible to handle all cases with temp storage. Alloca
2413 // in VTCM is not yet supported, so for now we just bail out for those cases.
2414 HvxIdioms::DstQualifier Qual = HvxIdioms::Undefined;
2415 Instruction *Dst = locateDestination(In: &In, Qual);
2416 if (!Dst) {
2417 LLVM_DEBUG(dbgs() << " Unable to locate vgather destination\n");
2418 return nullptr;
2419 }
2420 LLVM_DEBUG(dbgs() << " Destination : " << *Dst << " Qual(" << Qual
2421 << ")\n");
2422
2423 // Address of destination. Must be in VTCM.
2424 auto *Ptr = getPointer(Ptr: Dst);
2425 if (!Ptr) {
2426 LLVM_DEBUG(dbgs() << "Could not locate vgather destination ptr\n");
2427 return nullptr;
2428 }
2429
2430 // Result type. Assume it is a vector type.
2431 auto *DstType = cast<VectorType>(Val: getIndexType(In: Dst));
2432 assert(DstType && "Cannot handle non vector dst type for llvm.gather");
2433
2434 // Base address for sources to be loaded
2435 auto *IndexLoad = locateAddressFromIntrinsic(In: &In);
2436 if (!IndexLoad)
2437 return nullptr;
2438 LLVM_DEBUG(dbgs() << " IndexLoad : " << *IndexLoad << "\n");
2439
2440 // Gather indexes/offsets
2441 auto *Indexes = locateIndexesFromIntrinsic(In: &In);
2442 if (!Indexes)
2443 return nullptr;
2444 LLVM_DEBUG(dbgs() << " Indexes : " << *Indexes << "\n");
2445
2446 Value *Gather = nullptr;
2447 Type *NT = HVC.getHvxTy(ElemTy: HVC.getIntTy(Width: 32), Pair: false);
2448 if (Qual == HvxIdioms::LdSt || Qual == HvxIdioms::Arithmetic) {
2449 // We fully assume the address space is in VTCM. We also assume that all
2450 // pointers in Operand(0) have the same base(!).
2451 // This is the most basic case of all the above.
2452 unsigned OutputSize = HVC.getSizeOf(Ty: DstType);
2453 auto *DstElemTy = cast<IntegerType>(Val: DstType->getElementType());
2454 unsigned ElemWidth = HVC.DL.getTypeAllocSize(Ty: DstElemTy);
2455 LLVM_DEBUG(dbgs() << " Buffer type : " << *Ptr->getType()
2456 << " Address space ("
2457 << Ptr->getType()->getPointerAddressSpace() << ")\n"
2458 << " Result type : " << *DstType
2459 << "\n Size in bytes : " << OutputSize
2460 << " element type(" << *DstElemTy
2461 << ")\n ElemWidth : " << ElemWidth << " bytes\n");
2462
2463 auto *IndexType = cast<VectorType>(Val: getIndexType(In: Indexes));
2464 assert(IndexType && "Cannot handle non vector index type for llvm.gather");
2465 unsigned IndexWidth = HVC.DL.getTypeAllocSize(Ty: IndexType->getElementType());
2466 LLVM_DEBUG(dbgs() << " IndexWidth(" << IndexWidth << ")\n");
2467
2468 // Intrinsic takes i32 instead of pointer so cast.
2469 Value *CastedPtr = Builder.CreateBitOrPointerCast(
2470 V: IndexLoad, DestTy: Type::getInt32Ty(C&: Ctx), Name: "cst_ptr_to_i32");
2471 // [llvm_ptr_ty, llvm_i32_ty, llvm_i32_ty, ...]
2472 // int_hexagon_V6_vgathermh [... , llvm_v16i32_ty]
2473 // int_hexagon_V6_vgathermh_128B [... , llvm_v32i32_ty]
2474 // int_hexagon_V6_vgathermhw [... , llvm_v32i32_ty]
2475 // int_hexagon_V6_vgathermhw_128B [... , llvm_v64i32_ty]
2476 // int_hexagon_V6_vgathermw [... , llvm_v16i32_ty]
2477 // int_hexagon_V6_vgathermw_128B [... , llvm_v32i32_ty]
2478 if (HVC.HST.getVectorLength() == OutputSize) {
2479 if (ElemWidth == 1) {
2480 // v128i8 There is no native instruction for this.
2481 // Do this as two Hi/Lo gathers with masking.
2482 // Unpack indexes. We assume that indexes are in 128i8 format - need to
2483 // expand them to Hi/Lo 64i16
2484 Value *CastIndexes =
2485 Builder.CreateBitCast(V: Indexes, DestTy: NT, Name: "cast_to_32i32");
2486 auto V6_vunpack = HVC.HST.getIntrinsicId(Opc: Hexagon::V6_vunpackub);
2487 auto *UnpackedIndexes =
2488 Builder.CreateIntrinsic(RetTy: HVC.getHvxTy(ElemTy: HVC.getIntTy(Width: 32), Pair: true),
2489 ID: V6_vunpack, Args: CastIndexes, FMFSource: nullptr);
2490 LLVM_DEBUG(dbgs() << " UnpackedIndexes : " << *UnpackedIndexes
2491 << ")\n");
2492
2493 auto V6_hi = HVC.HST.getIntrinsicId(Opc: Hexagon::V6_hi);
2494 auto V6_lo = HVC.HST.getIntrinsicId(Opc: Hexagon::V6_lo);
2495 [[maybe_unused]] Value *IndexHi =
2496 HVC.createHvxIntrinsic(Builder, IntID: V6_hi, RetTy: NT, Args: UnpackedIndexes);
2497 [[maybe_unused]] Value *IndexLo =
2498 HVC.createHvxIntrinsic(Builder, IntID: V6_lo, RetTy: NT, Args: UnpackedIndexes);
2499 LLVM_DEBUG(dbgs() << " UnpackedIndHi : " << *IndexHi << ")\n");
2500 LLVM_DEBUG(dbgs() << " UnpackedIndLo : " << *IndexLo << ")\n");
2501 // Create the mask for individual bytes
2502 auto *QByteMask = get_i32_Mask(HVC, Builder, Ctx, pattern: 0x00ff00ff);
2503 LLVM_DEBUG(dbgs() << " QByteMask : " << *QByteMask << "\n");
2504 // We use our destination allocation as a temp storage
2505 // This is unlikely to work properly for masked gather.
2506 auto V6_vgather = HVC.HST.getIntrinsicId(Opc: Hexagon::V6_vgathermhq);
2507 [[maybe_unused]] auto GatherHi = Builder.CreateIntrinsic(
2508 RetTy: Type::getVoidTy(C&: Ctx), ID: V6_vgather,
2509 Args: {Ptr, QByteMask, CastedPtr,
2510 HVC.getConstInt(DEFAULT_HVX_VTCM_PAGE_SIZE), IndexHi},
2511 FMFSource: nullptr);
2512 LLVM_DEBUG(dbgs() << " GatherHi : " << *GatherHi << ")\n");
2513 // Rematerialize the result
2514 [[maybe_unused]] Value *LoadedResultHi = Builder.CreateLoad(
2515 Ty: HVC.getHvxTy(ElemTy: HVC.getIntTy(Width: 32), Pair: false), Ptr, Name: "temp_result_hi");
2516 LLVM_DEBUG(dbgs() << " LoadedResultHi : " << *LoadedResultHi << "\n");
2517 // Same for the low part. Here we use Gather to return non-NULL result
2518 // from this function and continue to iterate. We also are deleting Dst
2519 // store below.
2520 Gather = Builder.CreateIntrinsic(
2521 RetTy: Type::getVoidTy(C&: Ctx), ID: V6_vgather,
2522 Args: {Ptr, QByteMask, CastedPtr,
2523 HVC.getConstInt(DEFAULT_HVX_VTCM_PAGE_SIZE), IndexLo},
2524 FMFSource: nullptr);
2525 LLVM_DEBUG(dbgs() << " GatherLo : " << *Gather << ")\n");
2526 Value *LoadedResultLo = Builder.CreateLoad(
2527 Ty: HVC.getHvxTy(ElemTy: HVC.getIntTy(Width: 32), Pair: false), Ptr, Name: "temp_result_lo");
2528 LLVM_DEBUG(dbgs() << " LoadedResultLo : " << *LoadedResultLo << "\n");
2529 // Now we have properly sized bytes in every other position
2530 // B b A a c a A b B c f F g G h H is presented as
2531 // B . b . A . a . c . a . A . b . B . c . f . F . g . G . h . H
2532 // Use vpack to gather them
2533 auto V6_vpackeb = HVC.HST.getIntrinsicId(Opc: Hexagon::V6_vpackeb);
2534 [[maybe_unused]] auto Res = Builder.CreateIntrinsic(
2535 RetTy: NT, ID: V6_vpackeb, Args: {LoadedResultHi, LoadedResultLo}, FMFSource: nullptr);
2536 LLVM_DEBUG(dbgs() << " ScaledRes : " << *Res << "\n");
2537 [[maybe_unused]] auto *StoreRes = Builder.CreateStore(Val: Res, Ptr);
2538 LLVM_DEBUG(dbgs() << " StoreRes : " << *StoreRes << "\n");
2539 } else if (ElemWidth == 2) {
2540 // v32i16
2541 if (IndexWidth == 2) {
2542 // Reinterprete 64i16 as 32i32. Only needed for syntactic IR match.
2543 Value *CastIndex =
2544 getReinterpretiveCast_i16_to_i32(HVC, Builder, Ctx, I: Indexes);
2545 LLVM_DEBUG(dbgs() << " Cast index: " << *CastIndex << ")\n");
2546 // shift all i16 left by 1 to match short addressing mode instead of
2547 // byte.
2548 auto V6_vaslh = HVC.HST.getIntrinsicId(Opc: Hexagon::V6_vaslh);
2549 Value *AdjustedIndex = HVC.createHvxIntrinsic(
2550 Builder, IntID: V6_vaslh, RetTy: NT, Args: {CastIndex, HVC.getConstInt(Val: 1)});
2551 LLVM_DEBUG(dbgs()
2552 << " Shifted half index: " << *AdjustedIndex << ")\n");
2553
2554 auto V6_vgather = HVC.HST.getIntrinsicId(Opc: Hexagon::V6_vgathermh);
2555 // The 3rd argument is the size of the region to gather from. Probably
2556 // want to set it to max VTCM size.
2557 Gather = Builder.CreateIntrinsic(
2558 RetTy: Type::getVoidTy(C&: Ctx), ID: V6_vgather,
2559 Args: {Ptr, CastedPtr, HVC.getConstInt(DEFAULT_HVX_VTCM_PAGE_SIZE),
2560 AdjustedIndex},
2561 FMFSource: nullptr);
2562 for (auto &U : Dst->uses()) {
2563 if (auto *UI = dyn_cast<Instruction>(Val: U.getUser()))
2564 dbgs() << " dst used by: " << *UI << "\n";
2565 }
2566 for (auto &U : In.uses()) {
2567 if (auto *UI = dyn_cast<Instruction>(Val: U.getUser()))
2568 dbgs() << " In used by : " << *UI << "\n";
2569 }
2570 // Create temp load from result in case the result is used by any
2571 // other instruction.
2572 Value *LoadedResult = Builder.CreateLoad(
2573 Ty: HVC.getHvxTy(ElemTy: HVC.getIntTy(Width: 16), Pair: false), Ptr, Name: "temp_result");
2574 LLVM_DEBUG(dbgs() << " LoadedResult : " << *LoadedResult << "\n");
2575 In.replaceAllUsesWith(V: LoadedResult);
2576 } else {
2577 dbgs() << " Unhandled index type for vgather\n";
2578 return nullptr;
2579 }
2580 } else if (ElemWidth == 4) {
2581 if (IndexWidth == 4) {
2582 // v32i32
2583 auto V6_vaslh = HVC.HST.getIntrinsicId(Opc: Hexagon::V6_vaslh);
2584 Value *AdjustedIndex = HVC.createHvxIntrinsic(
2585 Builder, IntID: V6_vaslh, RetTy: NT, Args: {Indexes, HVC.getConstInt(Val: 2)});
2586 LLVM_DEBUG(dbgs()
2587 << " Shifted word index: " << *AdjustedIndex << ")\n");
2588 Gather = Builder.CreateIntrinsic(
2589 RetTy: Type::getVoidTy(C&: Ctx), ID: Intrinsic::hexagon_V6_vgathermw_128B,
2590 Args: {Ptr, CastedPtr, HVC.getConstInt(DEFAULT_HVX_VTCM_PAGE_SIZE),
2591 AdjustedIndex},
2592 FMFSource: nullptr);
2593 } else {
2594 LLVM_DEBUG(dbgs() << " Unhandled index type for vgather\n");
2595 return nullptr;
2596 }
2597 } else {
2598 LLVM_DEBUG(dbgs() << " Unhandled element type for vgather\n");
2599 return nullptr;
2600 }
2601 } else if (HVC.HST.getVectorLength() == OutputSize * 2) {
2602 // This is half of the reg width, duplicate low in high
2603 LLVM_DEBUG(dbgs() << " Unhandled half of register size\n");
2604 return nullptr;
2605 } else if (HVC.HST.getVectorLength() * 2 == OutputSize) {
2606 LLVM_DEBUG(dbgs() << " Unhandle twice the register size\n");
2607 return nullptr;
2608 }
2609 // Erase the original intrinsic and store that consumes it.
2610 // HVX will create a pseudo for gather that is expanded to gather + store
2611 // during packetization.
2612 Dst->eraseFromParent();
2613 } else if (Qual == HvxIdioms::LLVM_Scatter) {
2614 // Gather feeds directly into scatter.
2615 LLVM_DEBUG({
2616 auto *DstInpTy = cast<VectorType>(Dst->getOperand(1)->getType());
2617 assert(DstInpTy && "Cannot handle no vector type for llvm.scatter");
2618 unsigned DstInpSize = HVC.getSizeOf(DstInpTy);
2619 unsigned DstElements = HVC.length(DstInpTy);
2620 auto *DstElemTy = cast<PointerType>(DstInpTy->getElementType());
2621 assert(DstElemTy && "llvm.scatter needs vector of ptr argument");
2622 dbgs() << " Gather feeds into scatter\n Values to scatter : "
2623 << *Dst->getOperand(0) << "\n";
2624 dbgs() << " Dst type(" << *DstInpTy << ") elements(" << DstElements
2625 << ") VecLen(" << DstInpSize << ") type(" << *DstElemTy
2626 << ") Access alignment(" << *Dst->getOperand(2) << ")\n";
2627 });
2628 // Address of source
2629 auto *Src = getPointer(Ptr: IndexLoad);
2630 if (!Src)
2631 return nullptr;
2632 LLVM_DEBUG(dbgs() << " Src : " << *Src << "\n");
2633
2634 if (!isa<PointerType>(Val: Src->getType())) {
2635 LLVM_DEBUG(dbgs() << " Source is not a pointer type...\n");
2636 return nullptr;
2637 }
2638
2639 Value *CastedSrc = Builder.CreateBitOrPointerCast(
2640 V: Src, DestTy: Type::getInt32Ty(C&: Ctx), Name: "cst_ptr_to_i32");
2641 LLVM_DEBUG(dbgs() << " CastedSrc: " << *CastedSrc << "\n");
2642
2643 auto *DstLoad = locateAddressFromIntrinsic(In: Dst);
2644 if (!DstLoad) {
2645 LLVM_DEBUG(dbgs() << " Unable to locate DstLoad\n");
2646 return nullptr;
2647 }
2648 LLVM_DEBUG(dbgs() << " DstLoad : " << *DstLoad << "\n");
2649
2650 Value *Ptr = getPointer(Ptr: DstLoad);
2651 if (!Ptr)
2652 return nullptr;
2653 LLVM_DEBUG(dbgs() << " Ptr : " << *Ptr << "\n");
2654 Value *CastIndex =
2655 getReinterpretiveCast_i16_to_i32(HVC, Builder, Ctx, I: IndexLoad);
2656 LLVM_DEBUG(dbgs() << " Cast index: " << *CastIndex << ")\n");
2657 // Shift all i16 left by 1 to match short addressing mode instead of
2658 // byte.
2659 auto V6_vaslh = HVC.HST.getIntrinsicId(Opc: Hexagon::V6_vaslh);
2660 Value *AdjustedIndex = HVC.createHvxIntrinsic(
2661 Builder, IntID: V6_vaslh, RetTy: NT, Args: {CastIndex, HVC.getConstInt(Val: 1)});
2662 LLVM_DEBUG(dbgs() << " Shifted half index: " << *AdjustedIndex << ")\n");
2663
2664 return Builder.CreateIntrinsic(
2665 RetTy: Type::getVoidTy(C&: Ctx), ID: Intrinsic::hexagon_V6_vgathermh_128B,
2666 Args: {Ptr, CastedSrc, HVC.getConstInt(DEFAULT_HVX_VTCM_PAGE_SIZE),
2667 AdjustedIndex},
2668 FMFSource: nullptr);
2669 } else if (Qual == HvxIdioms::HEX_Gather_Scatter) {
2670 // Gather feeds into previously inserted pseudo intrinsic.
2671 // These could not be in the same packet, so we need to generate another
2672 // pseudo that is expanded to .tmp + store V6_vgathermh_pseudo
2673 // V6_vgathermh_pseudo (ins IntRegs:$_dst_, s4_0Imm:$Ii, IntRegs:$Rt,
2674 // ModRegs:$Mu, HvxVR:$Vv)
2675 if (isa<AllocaInst>(Val: IndexLoad)) {
2676 auto *cstDataVector = dyn_cast<ConstantDataVector>(Val: Indexes);
2677 if (cstDataVector) {
2678 // Our indexes are represented as a constant. We need THEM in a reg.
2679 // This most likely will not work properly since alloca gives us DDR
2680 // stack location. This will be fixed once we teach compiler about VTCM.
2681 AllocaInst *IndexesAlloca = Builder.CreateAlloca(Ty: NT);
2682 [[maybe_unused]] auto *StoreIndexes =
2683 Builder.CreateStore(Val: cstDataVector, Ptr: IndexesAlloca);
2684 LLVM_DEBUG(dbgs() << " StoreIndexes : " << *StoreIndexes << "\n");
2685 Value *LoadedIndex =
2686 Builder.CreateLoad(Ty: NT, Ptr: IndexesAlloca, Name: "reload_index");
2687 AllocaInst *ResultAlloca = Builder.CreateAlloca(Ty: NT);
2688 LLVM_DEBUG(dbgs() << " ResultAlloca : " << *ResultAlloca << "\n");
2689
2690 Value *CastedSrc = Builder.CreateBitOrPointerCast(
2691 V: IndexLoad, DestTy: Type::getInt32Ty(C&: Ctx), Name: "cst_ptr_to_i32");
2692 LLVM_DEBUG(dbgs() << " CastedSrc : " << *CastedSrc << "\n");
2693
2694 Gather = Builder.CreateIntrinsic(
2695 RetTy: Type::getVoidTy(C&: Ctx), ID: Intrinsic::hexagon_V6_vgathermh_128B,
2696 Args: {ResultAlloca, CastedSrc,
2697 HVC.getConstInt(DEFAULT_HVX_VTCM_PAGE_SIZE), LoadedIndex},
2698 FMFSource: nullptr);
2699 Value *LoadedResult = Builder.CreateLoad(
2700 Ty: HVC.getHvxTy(ElemTy: HVC.getIntTy(Width: 16), Pair: false), Ptr: ResultAlloca, Name: "temp_result");
2701 LLVM_DEBUG(dbgs() << " LoadedResult : " << *LoadedResult << "\n");
2702 LLVM_DEBUG(dbgs() << " Gather : " << *Gather << "\n");
2703 In.replaceAllUsesWith(V: LoadedResult);
2704 }
2705 } else {
2706 // Address of source
2707 auto *Src = getPointer(Ptr: IndexLoad);
2708 if (!Src)
2709 return nullptr;
2710 LLVM_DEBUG(dbgs() << " Src : " << *Src << "\n");
2711
2712 Value *CastedSrc = Builder.CreateBitOrPointerCast(
2713 V: Src, DestTy: Type::getInt32Ty(C&: Ctx), Name: "cst_ptr_to_i32");
2714 LLVM_DEBUG(dbgs() << " CastedSrc: " << *CastedSrc << "\n");
2715
2716 auto *DstLoad = locateAddressFromIntrinsic(In: Dst);
2717 if (!DstLoad)
2718 return nullptr;
2719 LLVM_DEBUG(dbgs() << " DstLoad : " << *DstLoad << "\n");
2720 auto *Ptr = getPointer(Ptr: DstLoad);
2721 if (!Ptr)
2722 return nullptr;
2723 LLVM_DEBUG(dbgs() << " Ptr : " << *Ptr << "\n");
2724
2725 Gather = Builder.CreateIntrinsic(
2726 RetTy: Type::getVoidTy(C&: Ctx), ID: Intrinsic::hexagon_V6_vgather_vscattermh,
2727 Args: {Ptr, CastedSrc, HVC.getConstInt(DEFAULT_HVX_VTCM_PAGE_SIZE),
2728 Indexes},
2729 FMFSource: nullptr);
2730 }
2731 return Gather;
2732 } else if (Qual == HvxIdioms::HEX_Scatter) {
2733 // This is the case when result of a gather is used as an argument to
2734 // Intrinsic::hexagon_V6_vscattermh_128B. Most likely we just inserted it
2735 // ourselves. We have to create alloca, store to it, and replace all uses
2736 // with that.
2737 AllocaInst *ResultAlloca = Builder.CreateAlloca(Ty: NT);
2738 Value *CastedSrc = Builder.CreateBitOrPointerCast(
2739 V: IndexLoad, DestTy: Type::getInt32Ty(C&: Ctx), Name: "cst_ptr_to_i32");
2740 LLVM_DEBUG(dbgs() << " CastedSrc : " << *CastedSrc << "\n");
2741 Value *CastIndex =
2742 getReinterpretiveCast_i16_to_i32(HVC, Builder, Ctx, I: Indexes);
2743 LLVM_DEBUG(dbgs() << " Cast index : " << *CastIndex << ")\n");
2744
2745 Gather = Builder.CreateIntrinsic(
2746 RetTy: Type::getVoidTy(C&: Ctx), ID: Intrinsic::hexagon_V6_vgathermh_128B,
2747 Args: {ResultAlloca, CastedSrc, HVC.getConstInt(DEFAULT_HVX_VTCM_PAGE_SIZE),
2748 CastIndex},
2749 FMFSource: nullptr);
2750 Value *LoadedResult = Builder.CreateLoad(
2751 Ty: HVC.getHvxTy(ElemTy: HVC.getIntTy(Width: 16), Pair: false), Ptr: ResultAlloca, Name: "temp_result");
2752 LLVM_DEBUG(dbgs() << " LoadedResult : " << *LoadedResult << "\n");
2753 In.replaceAllUsesWith(V: LoadedResult);
2754 } else if (Qual == HvxIdioms::HEX_Gather) {
2755 // Gather feeds to another gather but already replaced with
2756 // hexagon_V6_vgathermh_128B
2757 if (isa<AllocaInst>(Val: IndexLoad)) {
2758 auto *cstDataVector = dyn_cast<ConstantDataVector>(Val: Indexes);
2759 if (cstDataVector) {
2760 // Our indexes are represented as a constant. We need it in a reg.
2761 AllocaInst *IndexesAlloca = Builder.CreateAlloca(Ty: NT);
2762
2763 [[maybe_unused]] auto *StoreIndexes =
2764 Builder.CreateStore(Val: cstDataVector, Ptr: IndexesAlloca);
2765 LLVM_DEBUG(dbgs() << " StoreIndexes : " << *StoreIndexes << "\n");
2766 Value *LoadedIndex =
2767 Builder.CreateLoad(Ty: NT, Ptr: IndexesAlloca, Name: "reload_index");
2768 AllocaInst *ResultAlloca = Builder.CreateAlloca(Ty: NT);
2769 LLVM_DEBUG(dbgs() << " ResultAlloca : " << *ResultAlloca
2770 << "\n AddressSpace: "
2771 << ResultAlloca->getAddressSpace() << "\n";);
2772
2773 Value *CastedSrc = Builder.CreateBitOrPointerCast(
2774 V: IndexLoad, DestTy: Type::getInt32Ty(C&: Ctx), Name: "cst_ptr_to_i32");
2775 LLVM_DEBUG(dbgs() << " CastedSrc : " << *CastedSrc << "\n");
2776
2777 Gather = Builder.CreateIntrinsic(
2778 RetTy: Type::getVoidTy(C&: Ctx), ID: Intrinsic::hexagon_V6_vgathermh_128B,
2779 Args: {ResultAlloca, CastedSrc,
2780 HVC.getConstInt(DEFAULT_HVX_VTCM_PAGE_SIZE), LoadedIndex},
2781 FMFSource: nullptr);
2782 Value *LoadedResult = Builder.CreateLoad(
2783 Ty: HVC.getHvxTy(ElemTy: HVC.getIntTy(Width: 16), Pair: false), Ptr: ResultAlloca, Name: "temp_result");
2784 LLVM_DEBUG(dbgs() << " LoadedResult : " << *LoadedResult << "\n");
2785 LLVM_DEBUG(dbgs() << " Gather : " << *Gather << "\n");
2786 In.replaceAllUsesWith(V: LoadedResult);
2787 }
2788 }
2789 } else if (Qual == HvxIdioms::LLVM_Gather) {
2790 // Gather feeds into another gather
2791 errs() << " Underimplemented vgather to vgather sequence\n";
2792 return nullptr;
2793 } else
2794 llvm_unreachable("Unhandled Qual enum");
2795
2796 return Gather;
2797}
2798
2799// Go through all PHI incomming values and find minimal alignment for non GEP
2800// members.
2801std::optional<uint64_t> HvxIdioms::getPHIBaseMinAlignment(Instruction &In,
2802 PHINode *PN) const {
2803 if (!PN)
2804 return std::nullopt;
2805
2806 SmallVector<Value *, 16> Worklist;
2807 SmallPtrSet<Value *, 16> Visited;
2808 uint64_t minPHIAlignment = Value::MaximumAlignment;
2809 Worklist.push_back(Elt: PN);
2810
2811 while (!Worklist.empty()) {
2812 Value *V = Worklist.back();
2813 Worklist.pop_back();
2814 if (!Visited.insert(Ptr: V).second)
2815 continue;
2816
2817 if (PHINode *PN = dyn_cast<PHINode>(Val: V)) {
2818 for (unsigned i = 0; i < PN->getNumIncomingValues(); ++i) {
2819 Worklist.push_back(Elt: PN->getIncomingValue(i));
2820 }
2821 } else if (isa<GetElementPtrInst>(Val: V)) {
2822 // Ignore geps for now.
2823 continue;
2824 } else {
2825 Align KnownAlign = getKnownAlignment(V, DL: HVC.DL, CxtI: &In, AC: &HVC.AC, DT: &HVC.DT);
2826 if (KnownAlign.value() < minPHIAlignment)
2827 minPHIAlignment = KnownAlign.value();
2828 }
2829 }
2830 if (minPHIAlignment != Value::MaximumAlignment)
2831 return minPHIAlignment;
2832 return std::nullopt;
2833}
2834
2835// Helper function to discover alignment for a ptr.
2836std::optional<uint64_t> HvxIdioms::getAlignment(Instruction &In,
2837 Value *ptr) const {
2838 SmallPtrSet<Value *, 16> Visited;
2839 return getAlignmentImpl(In, ptr, Visited);
2840}
2841
2842std::optional<uint64_t>
2843HvxIdioms::getAlignmentImpl(Instruction &In, Value *ptr,
2844 SmallPtrSet<Value *, 16> &Visited) const {
2845 LLVM_DEBUG(dbgs() << "[getAlignment] for : " << *ptr << "\n");
2846 // Prevent infinite recursion
2847 if (!Visited.insert(Ptr: ptr).second)
2848 return std::nullopt;
2849 // Try AssumptionCache.
2850 Align KnownAlign = getKnownAlignment(V: ptr, DL: HVC.DL, CxtI: &In, AC: &HVC.AC, DT: &HVC.DT);
2851 // This is the most formal and reliable source of information.
2852 if (KnownAlign.value() > 1) {
2853 LLVM_DEBUG(dbgs() << " VC align(" << KnownAlign.value() << ")\n");
2854 return KnownAlign.value();
2855 }
2856
2857 // If it is a PHI try to iterate through inputs
2858 if (PHINode *PN = dyn_cast<PHINode>(Val: ptr)) {
2859 // See if we have a common base to which we know alignment.
2860 auto baseAlignmentOpt = getPHIBaseMinAlignment(In, PN);
2861 if (!baseAlignmentOpt)
2862 return std::nullopt;
2863
2864 uint64_t minBaseAlignment = *baseAlignmentOpt;
2865 // If it is 1, there is no point to keep on looking.
2866 if (minBaseAlignment == 1)
2867 return 1;
2868 // No see if all other incomming phi nodes are just loop carried constants.
2869 uint64_t minPHIAlignment = minBaseAlignment;
2870 LLVM_DEBUG(dbgs() << " It is a PHI with(" << PN->getNumIncomingValues()
2871 << ")nodes and min base aligned to (" << minBaseAlignment
2872 << ")\n");
2873 for (unsigned i = 0; i < PN->getNumIncomingValues(); ++i) {
2874 Value *IV = PN->getIncomingValue(i);
2875 // We have already looked at all other values.
2876 if (!isa<GetElementPtrInst>(Val: IV))
2877 continue;
2878 uint64_t MemberAlignment = Value::MaximumAlignment;
2879 if (auto res = getAlignment(In&: *PN, ptr: IV))
2880 MemberAlignment = *res;
2881 else
2882 return std::nullopt;
2883 // Adjust total PHI alignment.
2884 if (minPHIAlignment > MemberAlignment)
2885 minPHIAlignment = MemberAlignment;
2886 }
2887 LLVM_DEBUG(dbgs() << " total PHI alignment(" << minPHIAlignment << ")\n");
2888 return minPHIAlignment;
2889 }
2890
2891 if (auto *GEP = dyn_cast<GetElementPtrInst>(Val: ptr)) {
2892 auto *GEPPtr = GEP->getPointerOperand();
2893 // Only if this is the induction variable with const offset
2894 // Implicit assumption is that induction variable itself is a PHI
2895 if (&In == GEPPtr) {
2896 APInt Offset(HVC.DL.getPointerSizeInBits(
2897 AS: GEPPtr->getType()->getPointerAddressSpace()),
2898 0);
2899 if (GEP->accumulateConstantOffset(DL: HVC.DL, Offset)) {
2900 LLVM_DEBUG(dbgs() << " Induction GEP with const step of ("
2901 << Offset.getZExtValue() << ")\n");
2902 return Offset.getZExtValue();
2903 }
2904 }
2905 }
2906
2907 return std::nullopt;
2908}
2909
2910Value *HvxIdioms::processMStore(Instruction &In) const {
2911 [[maybe_unused]] auto *InpTy =
2912 dyn_cast<VectorType>(Val: In.getOperand(i: 0)->getType());
2913 assert(InpTy && "Cannot handle no vector type for llvm.masked.store");
2914
2915 LLVM_DEBUG(dbgs() << "\n[Process mstore](" << In << ")\n"
2916 << *In.getParent() << "\n");
2917 LLVM_DEBUG(dbgs() << " Input type(" << *InpTy << ") elements("
2918 << HVC.length(InpTy) << ") VecLen(" << HVC.getSizeOf(InpTy)
2919 << ") type(" << *InpTy->getElementType() << ") of size("
2920 << InpTy->getScalarSizeInBits() << ")bits\n");
2921 auto *CI = dyn_cast<CallBase>(Val: &In);
2922 assert(CI && "Expected llvm.masked.store to be a call");
2923 Align HaveAlign = CI->getParamAlign(ArgNo: 1).valueOrOne();
2924
2925 uint64_t KA = 1;
2926 if (auto res = getAlignment(In, ptr: In.getOperand(i: 1))) // ptr operand
2927 KA = *res;
2928 LLVM_DEBUG(dbgs() << " HaveAlign(" << HaveAlign.value() << ") KnownAlign("
2929 << KA << ")\n");
2930 // Normalize 0 -> ABI alignment of the stored value type (operand 0).
2931 Type *ValTy = In.getOperand(i: 0)->getType();
2932 Align EffA =
2933 (KA > 0) ? Align(KA) : Align(HVC.DL.getABITypeAlign(Ty: ValTy).value());
2934
2935 if (EffA < HaveAlign)
2936 return nullptr;
2937
2938 // Attach/replace the param attribute on pointer param #1.
2939 AttrBuilder AttrB(CI->getContext());
2940 AttrB.addAlignmentAttr(Align: EffA);
2941 CI->setAttributes(
2942 CI->getAttributes().addParamAttributes(C&: CI->getContext(), ArgNo: 1, B: AttrB));
2943 return CI;
2944}
2945
2946Value *HvxIdioms::processMLoad(Instruction &In) const {
2947 [[maybe_unused]] auto *InpTy = dyn_cast<VectorType>(Val: In.getType());
2948 assert(InpTy && "Cannot handle non vector type for llvm.masked.store");
2949 LLVM_DEBUG(dbgs() << "\n[Process mload](" << In << ")\n"
2950 << *In.getParent() << "\n");
2951 LLVM_DEBUG(dbgs() << " Input type(" << *InpTy << ") elements("
2952 << HVC.length(InpTy) << ") VecLen(" << HVC.getSizeOf(InpTy)
2953 << ") type(" << *InpTy->getElementType() << ") of size("
2954 << InpTy->getScalarSizeInBits() << ")bits\n");
2955 auto *CI = dyn_cast<CallBase>(Val: &In);
2956 assert(CI && "Expected to be a call to llvm.masked.load");
2957 // The pointer is operand #0, and its param attribute index is also 0.
2958 Align HaveAlign = CI->getParamAlign(ArgNo: 0).valueOrOne();
2959
2960 // Compute best-known alignment KA from analysis.
2961 uint64_t KA = 1;
2962 if (auto res = getAlignment(In, ptr: In.getOperand(i: 0))) // ptr operand
2963 KA = *res;
2964
2965 // Normalize 0 → ABI alignment of the loaded value type.
2966 Type *ValTy = In.getType();
2967 Align EffA =
2968 (KA > 0) ? Align(KA) : Align(HVC.DL.getABITypeAlign(Ty: ValTy).value());
2969 if (EffA < HaveAlign)
2970 return nullptr;
2971 LLVM_DEBUG(dbgs() << " HaveAlign(" << HaveAlign.value() << ") KnownAlign("
2972 << KA << ")\n");
2973
2974 // Attach/replace the param attribute on pointer param #0.
2975 AttrBuilder AttrB(CI->getContext());
2976 AttrB.addAlignmentAttr(Align: EffA);
2977 CI->setAttributes(
2978 CI->getAttributes().addParamAttributes(C&: CI->getContext(), ArgNo: 0, B: AttrB));
2979 return CI;
2980}
2981
2982auto HvxIdioms::processFxpMulChopped(IRBuilderBase &Builder, Instruction &In,
2983 const FxpOp &Op) const -> Value * {
2984 assert(Op.X.Val->getType() == Op.Y.Val->getType());
2985 auto *InpTy = cast<VectorType>(Val: Op.X.Val->getType());
2986 unsigned Width = InpTy->getScalarSizeInBits();
2987 bool Rounding = Op.RoundAt.has_value();
2988
2989 if (!Op.RoundAt || *Op.RoundAt == Op.Frac - 1) {
2990 // The fixed-point intrinsics do signed multiplication.
2991 if (Width == Op.Frac + 1 && Op.X.Sgn != Unsigned && Op.Y.Sgn != Unsigned) {
2992 Value *QMul = nullptr;
2993 if (Width == 16) {
2994 QMul = createMulQ15(Builder, X: Op.X, Y: Op.Y, Rounding);
2995 } else if (Width == 32) {
2996 QMul = createMulQ31(Builder, X: Op.X, Y: Op.Y, Rounding);
2997 }
2998 if (QMul != nullptr)
2999 return QMul;
3000 }
3001 }
3002
3003 assert(Width >= 32 || isPowerOf2_32(Width)); // Width <= 32 => Width is 2^n
3004 assert(Width < 32 || Width % 32 == 0); // Width > 32 => Width is 32*k
3005
3006 // If Width < 32, then it should really be 16.
3007 if (Width < 32) {
3008 if (Width < 16)
3009 return nullptr;
3010 // Getting here with Op.Frac == 0 isn't wrong, but suboptimal: here we
3011 // generate a full precision products, which is unnecessary if there is
3012 // no shift.
3013 assert(Width == 16);
3014 assert(Op.Frac != 0 && "Unshifted mul should have been skipped");
3015 if (Op.Frac == 16) {
3016 // Multiply high
3017 if (Value *MulH = createMulH16(Builder, X: Op.X, Y: Op.Y))
3018 return MulH;
3019 }
3020 // Do full-precision multiply and shift.
3021 Value *Prod32 = createMul16(Builder, X: Op.X, Y: Op.Y);
3022 if (Rounding) {
3023 Value *RoundVal =
3024 ConstantInt::get(Ty: Prod32->getType(), V: 1ull << *Op.RoundAt);
3025 Prod32 = Builder.CreateAdd(LHS: Prod32, RHS: RoundVal, Name: "add");
3026 }
3027
3028 Value *ShiftAmt = ConstantInt::get(Ty: Prod32->getType(), V: Op.Frac);
3029 Value *Shifted = Op.X.Sgn == Signed || Op.Y.Sgn == Signed
3030 ? Builder.CreateAShr(LHS: Prod32, RHS: ShiftAmt, Name: "asr")
3031 : Builder.CreateLShr(LHS: Prod32, RHS: ShiftAmt, Name: "lsr");
3032 return Builder.CreateTrunc(V: Shifted, DestTy: InpTy, Name: "trn");
3033 }
3034
3035 // Width >= 32
3036
3037 // Break up the arguments Op.X and Op.Y into vectors of smaller widths
3038 // in preparation of doing the multiplication by 32-bit parts.
3039 auto WordX = HVC.splitVectorElements(Builder, Vec: Op.X.Val, /*ToWidth=*/32);
3040 auto WordY = HVC.splitVectorElements(Builder, Vec: Op.Y.Val, /*ToWidth=*/32);
3041 auto WordP = createMulLong(Builder, WordX, SgnX: Op.X.Sgn, WordY, SgnY: Op.Y.Sgn);
3042
3043 auto *HvxWordTy = cast<VectorType>(Val: WordP.front()->getType());
3044
3045 // Add the optional rounding to the proper word.
3046 if (Op.RoundAt.has_value()) {
3047 Value *Zero = Constant::getNullValue(Ty: WordX[0]->getType());
3048 SmallVector<Value *> RoundV(WordP.size(), Zero);
3049 RoundV[*Op.RoundAt / 32] =
3050 ConstantInt::get(Ty: HvxWordTy, V: 1ull << (*Op.RoundAt % 32));
3051 WordP = createAddLong(Builder, WordX: WordP, WordY: RoundV);
3052 }
3053
3054 // createRightShiftLong?
3055
3056 // Shift all products right by Op.Frac.
3057 unsigned SkipWords = Op.Frac / 32;
3058 Constant *ShiftAmt = ConstantInt::get(Ty: HvxWordTy, V: Op.Frac % 32);
3059
3060 for (int Dst = 0, End = WordP.size() - SkipWords; Dst != End; ++Dst) {
3061 int Src = Dst + SkipWords;
3062 Value *Lo = WordP[Src];
3063 if (Src + 1 < End) {
3064 Value *Hi = WordP[Src + 1];
3065 WordP[Dst] = Builder.CreateIntrinsic(RetTy: HvxWordTy, ID: Intrinsic::fshr,
3066 Args: {Hi, Lo, ShiftAmt},
3067 /*FMFSource*/ nullptr, Name: "int");
3068 } else {
3069 // The shift of the most significant word.
3070 WordP[Dst] = Builder.CreateAShr(LHS: Lo, RHS: ShiftAmt, Name: "asr");
3071 }
3072 }
3073 if (SkipWords != 0)
3074 WordP.resize(N: WordP.size() - SkipWords);
3075
3076 return HVC.joinVectorElements(Builder, Values: WordP, ToType: Op.ResTy);
3077}
3078
3079auto HvxIdioms::createMulQ15(IRBuilderBase &Builder, SValue X, SValue Y,
3080 bool Rounding) const -> Value * {
3081 assert(X.Val->getType() == Y.Val->getType());
3082 assert(X.Val->getType()->getScalarType() == HVC.getIntTy(16));
3083 assert(HVC.HST.isHVXVectorType(EVT::getEVT(X.Val->getType(), false)));
3084
3085 // There is no non-rounding intrinsic for i16.
3086 if (!Rounding || X.Sgn == Unsigned || Y.Sgn == Unsigned)
3087 return nullptr;
3088
3089 auto V6_vmpyhvsrs = HVC.HST.getIntrinsicId(Opc: Hexagon::V6_vmpyhvsrs);
3090 return HVC.createHvxIntrinsic(Builder, IntID: V6_vmpyhvsrs, RetTy: X.Val->getType(),
3091 Args: {X.Val, Y.Val});
3092}
3093
3094auto HvxIdioms::createMulQ31(IRBuilderBase &Builder, SValue X, SValue Y,
3095 bool Rounding) const -> Value * {
3096 Type *InpTy = X.Val->getType();
3097 assert(InpTy == Y.Val->getType());
3098 assert(InpTy->getScalarType() == HVC.getIntTy(32));
3099 assert(HVC.HST.isHVXVectorType(EVT::getEVT(InpTy, false)));
3100
3101 if (X.Sgn == Unsigned || Y.Sgn == Unsigned)
3102 return nullptr;
3103
3104 auto V6_vmpyewuh = HVC.HST.getIntrinsicId(Opc: Hexagon::V6_vmpyewuh);
3105 auto V6_vmpyo_acc = Rounding
3106 ? HVC.HST.getIntrinsicId(Opc: Hexagon::V6_vmpyowh_rnd_sacc)
3107 : HVC.HST.getIntrinsicId(Opc: Hexagon::V6_vmpyowh_sacc);
3108 Value *V1 =
3109 HVC.createHvxIntrinsic(Builder, IntID: V6_vmpyewuh, RetTy: InpTy, Args: {X.Val, Y.Val});
3110 return HVC.createHvxIntrinsic(Builder, IntID: V6_vmpyo_acc, RetTy: InpTy,
3111 Args: {V1, X.Val, Y.Val});
3112}
3113
3114auto HvxIdioms::createAddCarry(IRBuilderBase &Builder, Value *X, Value *Y,
3115 Value *CarryIn) const
3116 -> std::pair<Value *, Value *> {
3117 assert(X->getType() == Y->getType());
3118 auto VecTy = cast<VectorType>(Val: X->getType());
3119 if (VecTy == HvxI32Ty && HVC.HST.useHVXV62Ops()) {
3120 SmallVector<Value *> Args = {X, Y};
3121 Intrinsic::ID AddCarry;
3122 if (CarryIn == nullptr && HVC.HST.useHVXV66Ops()) {
3123 AddCarry = HVC.HST.getIntrinsicId(Opc: Hexagon::V6_vaddcarryo);
3124 } else {
3125 AddCarry = HVC.HST.getIntrinsicId(Opc: Hexagon::V6_vaddcarry);
3126 if (CarryIn == nullptr)
3127 CarryIn = Constant::getNullValue(Ty: HVC.getBoolTy(ElemCount: HVC.length(Ty: VecTy)));
3128 Args.push_back(Elt: CarryIn);
3129 }
3130 Value *Ret = HVC.createHvxIntrinsic(Builder, IntID: AddCarry,
3131 /*RetTy=*/nullptr, Args);
3132 Value *Result = Builder.CreateExtractValue(Agg: Ret, Idxs: {0}, Name: "ext");
3133 Value *CarryOut = Builder.CreateExtractValue(Agg: Ret, Idxs: {1}, Name: "ext");
3134 return {Result, CarryOut};
3135 }
3136
3137 // In other cases, do a regular add, and unsigned compare-less-than.
3138 // The carry-out can originate in two places: adding the carry-in or adding
3139 // the two input values.
3140 Value *Result1 = X; // Result1 = X + CarryIn
3141 if (CarryIn != nullptr) {
3142 unsigned Width = VecTy->getScalarSizeInBits();
3143 uint32_t Mask = 1;
3144 if (Width < 32) {
3145 for (unsigned i = 0, e = 32 / Width; i != e; ++i)
3146 Mask = (Mask << Width) | 1;
3147 }
3148 auto V6_vandqrt = HVC.HST.getIntrinsicId(Opc: Hexagon::V6_vandqrt);
3149 Value *ValueIn =
3150 HVC.createHvxIntrinsic(Builder, IntID: V6_vandqrt, /*RetTy=*/nullptr,
3151 Args: {CarryIn, HVC.getConstInt(Val: Mask)});
3152 Result1 = Builder.CreateAdd(LHS: X, RHS: ValueIn, Name: "add");
3153 }
3154
3155 Value *CarryOut1 = Builder.CreateCmp(Pred: CmpInst::ICMP_ULT, LHS: Result1, RHS: X, Name: "cmp");
3156 Value *Result2 = Builder.CreateAdd(LHS: Result1, RHS: Y, Name: "add");
3157 Value *CarryOut2 = Builder.CreateCmp(Pred: CmpInst::ICMP_ULT, LHS: Result2, RHS: Y, Name: "cmp");
3158 return {Result2, Builder.CreateOr(LHS: CarryOut1, RHS: CarryOut2, Name: "orb")};
3159}
3160
3161auto HvxIdioms::createMul16(IRBuilderBase &Builder, SValue X, SValue Y) const
3162 -> Value * {
3163 Intrinsic::ID V6_vmpyh = 0;
3164 std::tie(args&: X, args&: Y) = canonSgn(X, Y);
3165
3166 if (X.Sgn == Signed) {
3167 V6_vmpyh = HVC.HST.getIntrinsicId(Opc: Hexagon::V6_vmpyhv);
3168 } else if (Y.Sgn == Signed) {
3169 // In vmpyhus the second operand is unsigned
3170 V6_vmpyh = HVC.HST.getIntrinsicId(Opc: Hexagon::V6_vmpyhus);
3171 } else {
3172 V6_vmpyh = HVC.HST.getIntrinsicId(Opc: Hexagon::V6_vmpyuhv);
3173 }
3174
3175 // i16*i16 -> i32 / interleaved
3176 Value *P =
3177 HVC.createHvxIntrinsic(Builder, IntID: V6_vmpyh, RetTy: HvxP32Ty, Args: {Y.Val, X.Val});
3178 // Deinterleave
3179 return HVC.vshuff(Builder, Val0: HVC.sublo(Builder, Val: P), Val1: HVC.subhi(Builder, Val: P));
3180}
3181
3182auto HvxIdioms::createMulH16(IRBuilderBase &Builder, SValue X, SValue Y) const
3183 -> Value * {
3184 Type *HvxI16Ty = HVC.getHvxTy(ElemTy: HVC.getIntTy(Width: 16), /*Pair=*/false);
3185
3186 if (HVC.HST.useHVXV69Ops()) {
3187 if (X.Sgn != Signed && Y.Sgn != Signed) {
3188 auto V6_vmpyuhvs = HVC.HST.getIntrinsicId(Opc: Hexagon::V6_vmpyuhvs);
3189 return HVC.createHvxIntrinsic(Builder, IntID: V6_vmpyuhvs, RetTy: HvxI16Ty,
3190 Args: {X.Val, Y.Val});
3191 }
3192 }
3193
3194 Type *HvxP16Ty = HVC.getHvxTy(ElemTy: HVC.getIntTy(Width: 16), /*Pair=*/true);
3195 Value *Pair16 =
3196 Builder.CreateBitCast(V: createMul16(Builder, X, Y), DestTy: HvxP16Ty, Name: "cst");
3197 unsigned Len = HVC.length(Ty: HvxP16Ty) / 2;
3198
3199 SmallVector<int, 128> PickOdd(Len);
3200 for (int i = 0; i != static_cast<int>(Len); ++i)
3201 PickOdd[i] = 2 * i + 1;
3202
3203 return Builder.CreateShuffleVector(
3204 V1: HVC.sublo(Builder, Val: Pair16), V2: HVC.subhi(Builder, Val: Pair16), Mask: PickOdd, Name: "shf");
3205}
3206
3207auto HvxIdioms::createMul32(IRBuilderBase &Builder, SValue X, SValue Y) const
3208 -> std::pair<Value *, Value *> {
3209 assert(X.Val->getType() == Y.Val->getType());
3210 assert(X.Val->getType() == HvxI32Ty);
3211
3212 Intrinsic::ID V6_vmpy_parts;
3213 std::tie(args&: X, args&: Y) = canonSgn(X, Y);
3214
3215 if (X.Sgn == Signed) {
3216 V6_vmpy_parts = Intrinsic::hexagon_V6_vmpyss_parts;
3217 } else if (Y.Sgn == Signed) {
3218 V6_vmpy_parts = Intrinsic::hexagon_V6_vmpyus_parts;
3219 } else {
3220 V6_vmpy_parts = Intrinsic::hexagon_V6_vmpyuu_parts;
3221 }
3222
3223 Value *Parts = HVC.createHvxIntrinsic(Builder, IntID: V6_vmpy_parts, RetTy: nullptr,
3224 Args: {X.Val, Y.Val}, ArgTys: {HvxI32Ty});
3225 Value *Hi = Builder.CreateExtractValue(Agg: Parts, Idxs: {0}, Name: "ext");
3226 Value *Lo = Builder.CreateExtractValue(Agg: Parts, Idxs: {1}, Name: "ext");
3227 return {Lo, Hi};
3228}
3229
3230auto HvxIdioms::createAddLong(IRBuilderBase &Builder, ArrayRef<Value *> WordX,
3231 ArrayRef<Value *> WordY) const
3232 -> SmallVector<Value *> {
3233 assert(WordX.size() == WordY.size());
3234 unsigned Idx = 0, Length = WordX.size();
3235 SmallVector<Value *> Sum(Length);
3236
3237 while (Idx != Length) {
3238 if (HVC.isZero(Val: WordX[Idx]))
3239 Sum[Idx] = WordY[Idx];
3240 else if (HVC.isZero(Val: WordY[Idx]))
3241 Sum[Idx] = WordX[Idx];
3242 else
3243 break;
3244 ++Idx;
3245 }
3246
3247 Value *Carry = nullptr;
3248 for (; Idx != Length; ++Idx) {
3249 std::tie(args&: Sum[Idx], args&: Carry) =
3250 createAddCarry(Builder, X: WordX[Idx], Y: WordY[Idx], CarryIn: Carry);
3251 }
3252
3253 // This drops the final carry beyond the highest word.
3254 return Sum;
3255}
3256
3257auto HvxIdioms::createMulLong(IRBuilderBase &Builder, ArrayRef<Value *> WordX,
3258 Signedness SgnX, ArrayRef<Value *> WordY,
3259 Signedness SgnY) const -> SmallVector<Value *> {
3260 SmallVector<SmallVector<Value *>> Products(WordX.size() + WordY.size());
3261
3262 // WordX[i] * WordY[j] produces words i+j and i+j+1 of the results,
3263 // that is halves 2(i+j), 2(i+j)+1, 2(i+j)+2, 2(i+j)+3.
3264 for (int i = 0, e = WordX.size(); i != e; ++i) {
3265 for (int j = 0, f = WordY.size(); j != f; ++j) {
3266 // Check the 4 halves that this multiplication can generate.
3267 Signedness SX = (i + 1 == e) ? SgnX : Unsigned;
3268 Signedness SY = (j + 1 == f) ? SgnY : Unsigned;
3269 auto [Lo, Hi] = createMul32(Builder, X: {.Val: WordX[i], .Sgn: SX}, Y: {.Val: WordY[j], .Sgn: SY});
3270 Products[i + j + 0].push_back(Elt: Lo);
3271 Products[i + j + 1].push_back(Elt: Hi);
3272 }
3273 }
3274
3275 Value *Zero = Constant::getNullValue(Ty: WordX[0]->getType());
3276
3277 auto pop_back_or_zero = [Zero](auto &Vector) -> Value * {
3278 if (Vector.empty())
3279 return Zero;
3280 auto Last = Vector.back();
3281 Vector.pop_back();
3282 return Last;
3283 };
3284
3285 for (int i = 0, e = Products.size(); i != e; ++i) {
3286 while (Products[i].size() > 1) {
3287 Value *Carry = nullptr; // no carry-in
3288 for (int j = i; j != e; ++j) {
3289 auto &ProdJ = Products[j];
3290 auto [Sum, CarryOut] = createAddCarry(Builder, X: pop_back_or_zero(ProdJ),
3291 Y: pop_back_or_zero(ProdJ), CarryIn: Carry);
3292 ProdJ.insert(I: ProdJ.begin(), Elt: Sum);
3293 Carry = CarryOut;
3294 }
3295 }
3296 }
3297
3298 SmallVector<Value *> WordP;
3299 for (auto &P : Products) {
3300 assert(P.size() == 1 && "Should have been added together");
3301 WordP.push_back(Elt: P.front());
3302 }
3303
3304 return WordP;
3305}
3306
3307auto HvxIdioms::run() -> bool {
3308 bool Changed = false;
3309
3310 for (BasicBlock &B : HVC.F) {
3311 for (auto It = B.rbegin(); It != B.rend(); ++It) {
3312 if (auto Fxm = matchFxpMul(In&: *It)) {
3313 Value *New = processFxpMul(In&: *It, Op: *Fxm);
3314 // Always report "changed" for now.
3315 Changed = true;
3316 if (!New)
3317 continue;
3318 bool StartOver = !isa<Instruction>(Val: New);
3319 It->replaceAllUsesWith(V: New);
3320 RecursivelyDeleteTriviallyDeadInstructions(V: &*It, TLI: &HVC.TLI);
3321 It = StartOver ? B.rbegin()
3322 : cast<Instruction>(Val: New)->getReverseIterator();
3323 Changed = true;
3324 } else if (matchGather(In&: *It)) {
3325 Value *New = processVGather(In&: *It);
3326 if (!New)
3327 continue;
3328 LLVM_DEBUG(dbgs() << " Gather : " << *New << "\n");
3329 // We replace original intrinsic with a new pseudo call.
3330 It->eraseFromParent();
3331 It = cast<Instruction>(Val: New)->getReverseIterator();
3332 RecursivelyDeleteTriviallyDeadInstructions(V: &*It, TLI: &HVC.TLI);
3333 Changed = true;
3334 } else if (matchScatter(In&: *It)) {
3335 Value *New = processVScatter(In&: *It);
3336 if (!New)
3337 continue;
3338 LLVM_DEBUG(dbgs() << " Scatter : " << *New << "\n");
3339 // We replace original intrinsic with a new pseudo call.
3340 It->eraseFromParent();
3341 It = cast<Instruction>(Val: New)->getReverseIterator();
3342 RecursivelyDeleteTriviallyDeadInstructions(V: &*It, TLI: &HVC.TLI);
3343 Changed = true;
3344 } else if (matchMLoad(In&: *It)) {
3345 Value *New = processMLoad(In&: *It);
3346 if (!New)
3347 continue;
3348 LLVM_DEBUG(dbgs() << " MLoad : " << *New << "\n");
3349 Changed = true;
3350 } else if (matchMStore(In&: *It)) {
3351 Value *New = processMStore(In&: *It);
3352 if (!New)
3353 continue;
3354 LLVM_DEBUG(dbgs() << " MStore : " << *New << "\n");
3355 Changed = true;
3356 }
3357 }
3358 }
3359
3360 return Changed;
3361}
3362
3363// --- End HvxIdioms
3364
3365auto HexagonVectorCombine::run() -> bool {
3366 if (DumpModule)
3367 dbgs() << "Module before HexagonVectorCombine\n" << *F.getParent();
3368
3369 bool Changed = false;
3370 if (HST.useHVXOps()) {
3371 if (VAEnabled)
3372 Changed |= AlignVectors(*this).run();
3373 if (VIEnabled)
3374 Changed |= HvxIdioms(*this).run();
3375 }
3376
3377 if (DumpModule) {
3378 dbgs() << "Module " << (Changed ? "(modified)" : "(unchanged)")
3379 << " after HexagonVectorCombine\n"
3380 << *F.getParent();
3381 }
3382 return Changed;
3383}
3384
3385auto HexagonVectorCombine::getIntTy(unsigned Width) const -> IntegerType * {
3386 return IntegerType::get(C&: F.getContext(), NumBits: Width);
3387}
3388
3389auto HexagonVectorCombine::getByteTy(int ElemCount) const -> Type * {
3390 assert(ElemCount >= 0);
3391 IntegerType *ByteTy = Type::getInt8Ty(C&: F.getContext());
3392 if (ElemCount == 0)
3393 return ByteTy;
3394 return VectorType::get(ElementType: ByteTy, NumElements: ElemCount, /*Scalable=*/false);
3395}
3396
3397auto HexagonVectorCombine::getBoolTy(int ElemCount) const -> Type * {
3398 assert(ElemCount >= 0);
3399 IntegerType *BoolTy = Type::getInt1Ty(C&: F.getContext());
3400 if (ElemCount == 0)
3401 return BoolTy;
3402 return VectorType::get(ElementType: BoolTy, NumElements: ElemCount, /*Scalable=*/false);
3403}
3404
3405auto HexagonVectorCombine::getConstInt(int Val, unsigned Width) const
3406 -> ConstantInt * {
3407 return ConstantInt::getSigned(Ty: getIntTy(Width), V: Val);
3408}
3409
3410auto HexagonVectorCombine::isZero(const Value *Val) const -> bool {
3411 if (auto *C = dyn_cast<Constant>(Val))
3412 return C->isNullValue();
3413 return false;
3414}
3415
3416auto HexagonVectorCombine::getIntValue(const Value *Val) const
3417 -> std::optional<APInt> {
3418 if (auto *CI = dyn_cast<ConstantInt>(Val))
3419 return CI->getValue();
3420 return std::nullopt;
3421}
3422
3423auto HexagonVectorCombine::isUndef(const Value *Val) const -> bool {
3424 return isa<UndefValue>(Val);
3425}
3426
3427auto HexagonVectorCombine::isTrue(const Value *Val) const -> bool {
3428 return Val == ConstantInt::getTrue(Ty: Val->getType());
3429}
3430
3431auto HexagonVectorCombine::isFalse(const Value *Val) const -> bool {
3432 return isZero(Val);
3433}
3434
3435auto HexagonVectorCombine::getHvxTy(Type *ElemTy, bool Pair) const
3436 -> VectorType * {
3437 EVT ETy = EVT::getEVT(Ty: ElemTy, HandleUnknown: false);
3438 assert(ETy.isSimple() && "Invalid HVX element type");
3439 // Do not allow boolean types here: they don't have a fixed length.
3440 assert(HST.isHVXElementType(ETy.getSimpleVT(), /*IncludeBool=*/false) &&
3441 "Invalid HVX element type");
3442 unsigned HwLen = HST.getVectorLength();
3443 unsigned NumElems = (8 * HwLen) / ETy.getSizeInBits();
3444 return VectorType::get(ElementType: ElemTy, NumElements: Pair ? 2 * NumElems : NumElems,
3445 /*Scalable=*/false);
3446}
3447
3448auto HexagonVectorCombine::getSizeOf(const Value *Val, SizeKind Kind) const
3449 -> int {
3450 return getSizeOf(Ty: Val->getType(), Kind);
3451}
3452
3453auto HexagonVectorCombine::getSizeOf(const Type *Ty, SizeKind Kind) const
3454 -> int {
3455 auto *NcTy = const_cast<Type *>(Ty);
3456 switch (Kind) {
3457 case Store:
3458 return DL.getTypeStoreSize(Ty: NcTy).getFixedValue();
3459 case Alloc:
3460 return DL.getTypeAllocSize(Ty: NcTy).getFixedValue();
3461 }
3462 llvm_unreachable("Unhandled SizeKind enum");
3463}
3464
3465auto HexagonVectorCombine::getTypeAlignment(Type *Ty) const -> int {
3466 // The actual type may be shorter than the HVX vector, so determine
3467 // the alignment based on subtarget info.
3468 if (HST.isTypeForHVX(VecTy: Ty))
3469 return HST.getVectorLength();
3470 return DL.getABITypeAlign(Ty).value();
3471}
3472
3473auto HexagonVectorCombine::length(Value *Val) const -> size_t {
3474 return length(Ty: Val->getType());
3475}
3476
3477auto HexagonVectorCombine::length(Type *Ty) const -> size_t {
3478 auto *VecTy = dyn_cast<VectorType>(Val: Ty);
3479 assert(VecTy && "Must be a vector type");
3480 return VecTy->getElementCount().getFixedValue();
3481}
3482
3483auto HexagonVectorCombine::simplify(Value *V) const -> Value * {
3484 if (auto *In = dyn_cast<Instruction>(Val: V)) {
3485 SimplifyQuery Q(DL, &TLI, &DT, &AC, In);
3486 return simplifyInstruction(I: In, Q);
3487 }
3488 return nullptr;
3489}
3490
3491// Insert bytes [Start..Start+Length) of Src into Dst at byte Where.
3492auto HexagonVectorCombine::insertb(IRBuilderBase &Builder, Value *Dst,
3493 Value *Src, int Start, int Length,
3494 int Where) const -> Value * {
3495 assert(isByteVecTy(Dst->getType()) && isByteVecTy(Src->getType()));
3496 int SrcLen = getSizeOf(Val: Src);
3497 int DstLen = getSizeOf(Val: Dst);
3498 assert(0 <= Start && Start + Length <= SrcLen);
3499 assert(0 <= Where && Where + Length <= DstLen);
3500
3501 int P2Len = PowerOf2Ceil(A: SrcLen | DstLen);
3502 auto *Poison = PoisonValue::get(T: getByteTy());
3503 Value *P2Src = vresize(Builder, Val: Src, NewSize: P2Len, Pad: Poison);
3504 Value *P2Dst = vresize(Builder, Val: Dst, NewSize: P2Len, Pad: Poison);
3505
3506 SmallVector<int, 256> SMask(P2Len);
3507 for (int i = 0; i != P2Len; ++i) {
3508 // If i is in [Where, Where+Length), pick Src[Start+(i-Where)].
3509 // Otherwise, pick Dst[i];
3510 SMask[i] =
3511 (Where <= i && i < Where + Length) ? P2Len + Start + (i - Where) : i;
3512 }
3513
3514 Value *P2Insert = Builder.CreateShuffleVector(V1: P2Dst, V2: P2Src, Mask: SMask, Name: "shf");
3515 return vresize(Builder, Val: P2Insert, NewSize: DstLen, Pad: Poison);
3516}
3517
3518auto HexagonVectorCombine::vlalignb(IRBuilderBase &Builder, Value *Lo,
3519 Value *Hi, Value *Amt) const -> Value * {
3520 assert(Lo->getType() == Hi->getType() && "Argument type mismatch");
3521 if (isZero(Val: Amt))
3522 return Hi;
3523 int VecLen = getSizeOf(Val: Hi);
3524 if (auto IntAmt = getIntValue(Val: Amt))
3525 return getElementRange(Builder, Lo, Hi, Start: VecLen - IntAmt->getSExtValue(),
3526 Length: VecLen);
3527
3528 if (HST.isTypeForHVX(VecTy: Hi->getType())) {
3529 assert(static_cast<unsigned>(VecLen) == HST.getVectorLength() &&
3530 "Expecting an exact HVX type");
3531 return createHvxIntrinsic(Builder, IntID: HST.getIntrinsicId(Opc: Hexagon::V6_vlalignb),
3532 RetTy: Hi->getType(), Args: {Hi, Lo, Amt});
3533 }
3534
3535 if (VecLen == 4) {
3536 Value *Pair = concat(Builder, Vecs: {Lo, Hi});
3537 Value *Shift =
3538 Builder.CreateLShr(LHS: Builder.CreateShl(LHS: Pair, RHS: Amt, Name: "shl"), RHS: 32, Name: "lsr");
3539 Value *Trunc =
3540 Builder.CreateTrunc(V: Shift, DestTy: Type::getInt32Ty(C&: F.getContext()), Name: "trn");
3541 return Builder.CreateBitCast(V: Trunc, DestTy: Hi->getType(), Name: "cst");
3542 }
3543 if (VecLen == 8) {
3544 Value *Sub = Builder.CreateSub(LHS: getConstInt(Val: VecLen), RHS: Amt, Name: "sub");
3545 return vralignb(Builder, Lo, Hi, Amt: Sub);
3546 }
3547 llvm_unreachable("Unexpected vector length");
3548}
3549
3550auto HexagonVectorCombine::vralignb(IRBuilderBase &Builder, Value *Lo,
3551 Value *Hi, Value *Amt) const -> Value * {
3552 assert(Lo->getType() == Hi->getType() && "Argument type mismatch");
3553 if (isZero(Val: Amt))
3554 return Lo;
3555 int VecLen = getSizeOf(Val: Lo);
3556 if (auto IntAmt = getIntValue(Val: Amt))
3557 return getElementRange(Builder, Lo, Hi, Start: IntAmt->getSExtValue(), Length: VecLen);
3558
3559 if (HST.isTypeForHVX(VecTy: Lo->getType())) {
3560 assert(static_cast<unsigned>(VecLen) == HST.getVectorLength() &&
3561 "Expecting an exact HVX type");
3562 return createHvxIntrinsic(Builder, IntID: HST.getIntrinsicId(Opc: Hexagon::V6_valignb),
3563 RetTy: Lo->getType(), Args: {Hi, Lo, Amt});
3564 }
3565
3566 if (VecLen == 4) {
3567 Value *Pair = concat(Builder, Vecs: {Lo, Hi});
3568 Value *Shift = Builder.CreateLShr(LHS: Pair, RHS: Amt, Name: "lsr");
3569 Value *Trunc =
3570 Builder.CreateTrunc(V: Shift, DestTy: Type::getInt32Ty(C&: F.getContext()), Name: "trn");
3571 return Builder.CreateBitCast(V: Trunc, DestTy: Lo->getType(), Name: "cst");
3572 }
3573 if (VecLen == 8) {
3574 Type *Int64Ty = Type::getInt64Ty(C&: F.getContext());
3575 Value *Lo64 = Builder.CreateBitCast(V: Lo, DestTy: Int64Ty, Name: "cst");
3576 Value *Hi64 = Builder.CreateBitCast(V: Hi, DestTy: Int64Ty, Name: "cst");
3577 Value *Call = Builder.CreateIntrinsic(ID: Intrinsic::hexagon_S2_valignrb,
3578 Args: {Hi64, Lo64, Amt},
3579 /*FMFSource=*/nullptr, Name: "cup");
3580 return Builder.CreateBitCast(V: Call, DestTy: Lo->getType(), Name: "cst");
3581 }
3582 llvm_unreachable("Unexpected vector length");
3583}
3584
3585// Concatenates a sequence of vectors of the same type.
3586auto HexagonVectorCombine::concat(IRBuilderBase &Builder,
3587 ArrayRef<Value *> Vecs) const -> Value * {
3588 assert(!Vecs.empty());
3589 SmallVector<int, 256> SMask;
3590 std::vector<Value *> Work[2];
3591 int ThisW = 0, OtherW = 1;
3592
3593 Work[ThisW].assign(first: Vecs.begin(), last: Vecs.end());
3594 while (Work[ThisW].size() > 1) {
3595 auto *Ty = cast<VectorType>(Val: Work[ThisW].front()->getType());
3596 SMask.resize(N: length(Ty) * 2);
3597 std::iota(first: SMask.begin(), last: SMask.end(), value: 0);
3598
3599 Work[OtherW].clear();
3600 if (Work[ThisW].size() % 2 != 0)
3601 Work[ThisW].push_back(x: UndefValue::get(T: Ty));
3602 for (int i = 0, e = Work[ThisW].size(); i < e; i += 2) {
3603 Value *Joined = Builder.CreateShuffleVector(
3604 V1: Work[ThisW][i], V2: Work[ThisW][i + 1], Mask: SMask, Name: "shf");
3605 Work[OtherW].push_back(x: Joined);
3606 }
3607 std::swap(a&: ThisW, b&: OtherW);
3608 }
3609
3610 // Since there may have been some undefs appended to make shuffle operands
3611 // have the same type, perform the last shuffle to only pick the original
3612 // elements.
3613 SMask.resize(N: Vecs.size() * length(Ty: Vecs.front()->getType()));
3614 std::iota(first: SMask.begin(), last: SMask.end(), value: 0);
3615 Value *Total = Work[ThisW].front();
3616 return Builder.CreateShuffleVector(V: Total, Mask: SMask, Name: "shf");
3617}
3618
3619auto HexagonVectorCombine::vresize(IRBuilderBase &Builder, Value *Val,
3620 int NewSize, Value *Pad) const -> Value * {
3621 assert(isa<VectorType>(Val->getType()));
3622 auto *ValTy = cast<VectorType>(Val: Val->getType());
3623 assert(ValTy->getElementType() == Pad->getType());
3624
3625 int CurSize = length(Ty: ValTy);
3626 if (CurSize == NewSize)
3627 return Val;
3628 // Truncate?
3629 if (CurSize > NewSize)
3630 return getElementRange(Builder, Lo: Val, /*Ignored*/ Hi: Val, Start: 0, Length: NewSize);
3631 // Extend.
3632 SmallVector<int, 128> SMask(NewSize);
3633 std::iota(first: SMask.begin(), last: SMask.begin() + CurSize, value: 0);
3634 std::fill(first: SMask.begin() + CurSize, last: SMask.end(), value: CurSize);
3635 Value *PadVec = Builder.CreateVectorSplat(NumElts: CurSize, V: Pad, Name: "spt");
3636 return Builder.CreateShuffleVector(V1: Val, V2: PadVec, Mask: SMask, Name: "shf");
3637}
3638
3639auto HexagonVectorCombine::rescale(IRBuilderBase &Builder, Value *Mask,
3640 Type *FromTy, Type *ToTy) const -> Value * {
3641 // Mask is a vector <N x i1>, where each element corresponds to an
3642 // element of FromTy. Remap it so that each element will correspond
3643 // to an element of ToTy.
3644 assert(isa<VectorType>(Mask->getType()));
3645
3646 Type *FromSTy = FromTy->getScalarType();
3647 Type *ToSTy = ToTy->getScalarType();
3648 if (FromSTy == ToSTy)
3649 return Mask;
3650
3651 int FromSize = getSizeOf(Ty: FromSTy);
3652 int ToSize = getSizeOf(Ty: ToSTy);
3653 assert(FromSize % ToSize == 0 || ToSize % FromSize == 0);
3654
3655 auto *MaskTy = cast<VectorType>(Val: Mask->getType());
3656 int FromCount = length(Ty: MaskTy);
3657 int ToCount = (FromCount * FromSize) / ToSize;
3658 assert((FromCount * FromSize) % ToSize == 0);
3659
3660 auto *FromITy = getIntTy(Width: FromSize * 8);
3661 auto *ToITy = getIntTy(Width: ToSize * 8);
3662
3663 // Mask <N x i1> -> sext to <N x FromTy> -> bitcast to <M x ToTy> ->
3664 // -> trunc to <M x i1>.
3665 Value *Ext = Builder.CreateSExt(
3666 V: Mask, DestTy: VectorType::get(ElementType: FromITy, NumElements: FromCount, /*Scalable=*/false), Name: "sxt");
3667 Value *Cast = Builder.CreateBitCast(
3668 V: Ext, DestTy: VectorType::get(ElementType: ToITy, NumElements: ToCount, /*Scalable=*/false), Name: "cst");
3669 return Builder.CreateTrunc(
3670 V: Cast, DestTy: VectorType::get(ElementType: getBoolTy(), NumElements: ToCount, /*Scalable=*/false), Name: "trn");
3671}
3672
3673// Bitcast to bytes, and return least significant bits.
3674auto HexagonVectorCombine::vlsb(IRBuilderBase &Builder, Value *Val) const
3675 -> Value * {
3676 Type *ScalarTy = Val->getType()->getScalarType();
3677 if (ScalarTy == getBoolTy())
3678 return Val;
3679
3680 Value *Bytes = vbytes(Builder, Val);
3681 if (auto *VecTy = dyn_cast<VectorType>(Val: Bytes->getType()))
3682 return Builder.CreateTrunc(V: Bytes, DestTy: getBoolTy(ElemCount: getSizeOf(Ty: VecTy)), Name: "trn");
3683 // If Bytes is a scalar (i.e. Val was a scalar byte), return i1, not
3684 // <1 x i1>.
3685 return Builder.CreateTrunc(V: Bytes, DestTy: getBoolTy(), Name: "trn");
3686}
3687
3688// Bitcast to bytes for non-bool. For bool, convert i1 -> i8.
3689auto HexagonVectorCombine::vbytes(IRBuilderBase &Builder, Value *Val) const
3690 -> Value * {
3691 Type *ScalarTy = Val->getType()->getScalarType();
3692 if (ScalarTy == getByteTy())
3693 return Val;
3694
3695 if (ScalarTy != getBoolTy())
3696 return Builder.CreateBitCast(V: Val, DestTy: getByteTy(ElemCount: getSizeOf(Val)), Name: "cst");
3697 // For bool, return a sext from i1 to i8.
3698 if (auto *VecTy = dyn_cast<VectorType>(Val: Val->getType()))
3699 return Builder.CreateSExt(V: Val, DestTy: VectorType::get(ElementType: getByteTy(), Other: VecTy), Name: "sxt");
3700 return Builder.CreateSExt(V: Val, DestTy: getByteTy(), Name: "sxt");
3701}
3702
3703auto HexagonVectorCombine::subvector(IRBuilderBase &Builder, Value *Val,
3704 unsigned Start, unsigned Length) const
3705 -> Value * {
3706 assert(Start + Length <= length(Val));
3707 return getElementRange(Builder, Lo: Val, /*Ignored*/ Hi: Val, Start, Length);
3708}
3709
3710auto HexagonVectorCombine::sublo(IRBuilderBase &Builder, Value *Val) const
3711 -> Value * {
3712 size_t Len = length(Val);
3713 assert(Len % 2 == 0 && "Length should be even");
3714 return subvector(Builder, Val, Start: 0, Length: Len / 2);
3715}
3716
3717auto HexagonVectorCombine::subhi(IRBuilderBase &Builder, Value *Val) const
3718 -> Value * {
3719 size_t Len = length(Val);
3720 assert(Len % 2 == 0 && "Length should be even");
3721 return subvector(Builder, Val, Start: Len / 2, Length: Len / 2);
3722}
3723
3724auto HexagonVectorCombine::vdeal(IRBuilderBase &Builder, Value *Val0,
3725 Value *Val1) const -> Value * {
3726 assert(Val0->getType() == Val1->getType());
3727 int Len = length(Val: Val0);
3728 SmallVector<int, 128> Mask(2 * Len);
3729
3730 for (int i = 0; i != Len; ++i) {
3731 Mask[i] = 2 * i; // Even
3732 Mask[i + Len] = 2 * i + 1; // Odd
3733 }
3734 return Builder.CreateShuffleVector(V1: Val0, V2: Val1, Mask, Name: "shf");
3735}
3736
3737auto HexagonVectorCombine::vshuff(IRBuilderBase &Builder, Value *Val0,
3738 Value *Val1) const -> Value * { //
3739 assert(Val0->getType() == Val1->getType());
3740 int Len = length(Val: Val0);
3741 SmallVector<int, 128> Mask(2 * Len);
3742
3743 for (int i = 0; i != Len; ++i) {
3744 Mask[2 * i + 0] = i; // Val0
3745 Mask[2 * i + 1] = i + Len; // Val1
3746 }
3747 return Builder.CreateShuffleVector(V1: Val0, V2: Val1, Mask, Name: "shf");
3748}
3749
3750auto HexagonVectorCombine::createHvxIntrinsic(IRBuilderBase &Builder,
3751 Intrinsic::ID IntID, Type *RetTy,
3752 ArrayRef<Value *> Args,
3753 ArrayRef<Type *> ArgTys,
3754 ArrayRef<Value *> MDSources) const
3755 -> Value * {
3756 auto getCast = [&](IRBuilderBase &Builder, Value *Val,
3757 Type *DestTy) -> Value * {
3758 Type *SrcTy = Val->getType();
3759 if (SrcTy == DestTy)
3760 return Val;
3761
3762 // Non-HVX type. It should be a scalar, and it should already have
3763 // a valid type.
3764 assert(HST.isTypeForHVX(SrcTy, /*IncludeBool=*/true));
3765
3766 Type *BoolTy = Type::getInt1Ty(C&: F.getContext());
3767 if (cast<VectorType>(Val: SrcTy)->getElementType() != BoolTy)
3768 return Builder.CreateBitCast(V: Val, DestTy, Name: "cst");
3769
3770 // Predicate HVX vector.
3771 unsigned HwLen = HST.getVectorLength();
3772 Intrinsic::ID TC = HwLen == 64 ? Intrinsic::hexagon_V6_pred_typecast
3773 : Intrinsic::hexagon_V6_pred_typecast_128B;
3774 return Builder.CreateIntrinsic(ID: TC, OverloadTypes: {DestTy, Val->getType()}, Args: {Val},
3775 /*FMFSource=*/nullptr, Name: "cup");
3776 };
3777
3778 Function *IntrFn =
3779 Intrinsic::getOrInsertDeclaration(M: F.getParent(), id: IntID, OverloadTys: ArgTys);
3780 FunctionType *IntrTy = IntrFn->getFunctionType();
3781
3782 SmallVector<Value *, 4> IntrArgs;
3783 for (int i = 0, e = Args.size(); i != e; ++i) {
3784 Value *A = Args[i];
3785 Type *T = IntrTy->getParamType(i);
3786 if (A->getType() != T) {
3787 IntrArgs.push_back(Elt: getCast(Builder, A, T));
3788 } else {
3789 IntrArgs.push_back(Elt: A);
3790 }
3791 }
3792 StringRef MaybeName = !IntrTy->getReturnType()->isVoidTy() ? "cup" : "";
3793 CallInst *Call = Builder.CreateCall(Callee: IntrFn, Args: IntrArgs, Name: MaybeName);
3794
3795 MemoryEffects ME = Call->getAttributes().getMemoryEffects();
3796 if (!ME.doesNotAccessMemory() && !ME.onlyAccessesInaccessibleMem())
3797 propagateMetadata(I: Call, VL: MDSources);
3798
3799 Type *CallTy = Call->getType();
3800 if (RetTy == nullptr || CallTy == RetTy)
3801 return Call;
3802 // Scalar types should have RetTy matching the call return type.
3803 assert(HST.isTypeForHVX(CallTy, /*IncludeBool=*/true));
3804 return getCast(Builder, Call, RetTy);
3805}
3806
3807auto HexagonVectorCombine::splitVectorElements(IRBuilderBase &Builder,
3808 Value *Vec,
3809 unsigned ToWidth) const
3810 -> SmallVector<Value *> {
3811 // Break a vector of wide elements into a series of vectors with narrow
3812 // elements:
3813 // (...c0:b0:a0, ...c1:b1:a1, ...c2:b2:a2, ...)
3814 // -->
3815 // (a0, a1, a2, ...) // lowest "ToWidth" bits
3816 // (b0, b1, b2, ...) // the next lowest...
3817 // (c0, c1, c2, ...) // ...
3818 // ...
3819 //
3820 // The number of elements in each resulting vector is the same as
3821 // in the original vector.
3822
3823 auto *VecTy = cast<VectorType>(Val: Vec->getType());
3824 assert(VecTy->getElementType()->isIntegerTy());
3825 unsigned FromWidth = VecTy->getScalarSizeInBits();
3826 assert(isPowerOf2_32(ToWidth) && isPowerOf2_32(FromWidth));
3827 assert(ToWidth <= FromWidth && "Breaking up into wider elements?");
3828 unsigned NumResults = FromWidth / ToWidth;
3829
3830 SmallVector<Value *> Results(NumResults);
3831 Results[0] = Vec;
3832 unsigned Length = length(Ty: VecTy);
3833
3834 // Do it by splitting in half, since those operations correspond to deal
3835 // instructions.
3836 auto splitInHalf = [&](unsigned Begin, unsigned End, auto splitFunc) -> void {
3837 // Take V = Results[Begin], split it in L, H.
3838 // Store Results[Begin] = L, Results[(Begin+End)/2] = H
3839 // Call itself recursively split(Begin, Half), split(Half+1, End)
3840 if (Begin + 1 == End)
3841 return;
3842
3843 Value *Val = Results[Begin];
3844 unsigned Width = Val->getType()->getScalarSizeInBits();
3845
3846 auto *VTy = VectorType::get(ElementType: getIntTy(Width: Width / 2), NumElements: 2 * Length, Scalable: false);
3847 Value *VVal = Builder.CreateBitCast(V: Val, DestTy: VTy, Name: "cst");
3848
3849 Value *Res = vdeal(Builder, Val0: sublo(Builder, Val: VVal), Val1: subhi(Builder, Val: VVal));
3850
3851 unsigned Half = (Begin + End) / 2;
3852 Results[Begin] = sublo(Builder, Val: Res);
3853 Results[Half] = subhi(Builder, Val: Res);
3854
3855 splitFunc(Begin, Half, splitFunc);
3856 splitFunc(Half, End, splitFunc);
3857 };
3858
3859 splitInHalf(0, NumResults, splitInHalf);
3860 return Results;
3861}
3862
3863auto HexagonVectorCombine::joinVectorElements(IRBuilderBase &Builder,
3864 ArrayRef<Value *> Values,
3865 VectorType *ToType) const
3866 -> Value * {
3867 assert(ToType->getElementType()->isIntegerTy());
3868
3869 // If the list of values does not have power-of-2 elements, append copies
3870 // of the sign bit to it, to make the size be 2^n.
3871 // The reason for this is that the values will be joined in pairs, because
3872 // otherwise the shuffles will result in convoluted code. With pairwise
3873 // joins, the shuffles will hopefully be folded into a perfect shuffle.
3874 // The output will need to be sign-extended to a type with element width
3875 // being a power-of-2 anyways.
3876 SmallVector<Value *> Inputs(Values);
3877
3878 unsigned ToWidth = ToType->getScalarSizeInBits();
3879 unsigned Width = Inputs.front()->getType()->getScalarSizeInBits();
3880 assert(Width <= ToWidth);
3881 assert(isPowerOf2_32(Width) && isPowerOf2_32(ToWidth));
3882 unsigned Length = length(Ty: Inputs.front()->getType());
3883
3884 unsigned NeedInputs = ToWidth / Width;
3885 if (Inputs.size() != NeedInputs) {
3886 // Having too many inputs is ok: drop the high bits (usual wrap-around).
3887 // If there are too few, fill them with the sign bit.
3888 Value *Last = Inputs.back();
3889 Value *Sign = Builder.CreateAShr(
3890 LHS: Last, RHS: ConstantInt::get(Ty: Last->getType(), V: Width - 1), Name: "asr");
3891 Inputs.resize(N: NeedInputs, NV: Sign);
3892 }
3893
3894 while (Inputs.size() > 1) {
3895 Width *= 2;
3896 auto *VTy = VectorType::get(ElementType: getIntTy(Width), NumElements: Length, Scalable: false);
3897 for (int i = 0, e = Inputs.size(); i < e; i += 2) {
3898 Value *Res = vshuff(Builder, Val0: Inputs[i], Val1: Inputs[i + 1]);
3899 Inputs[i / 2] = Builder.CreateBitCast(V: Res, DestTy: VTy, Name: "cst");
3900 }
3901 Inputs.resize(N: Inputs.size() / 2);
3902 }
3903
3904 assert(Inputs.front()->getType() == ToType);
3905 return Inputs.front();
3906}
3907
3908auto HexagonVectorCombine::calculatePointerDifference(Value *Ptr0,
3909 Value *Ptr1) const
3910 -> std::optional<int> {
3911 // Try SCEV first.
3912 const SCEV *Scev0 = SE.getSCEV(V: Ptr0);
3913 const SCEV *Scev1 = SE.getSCEV(V: Ptr1);
3914 const SCEV *ScevDiff = SE.getMinusSCEV(LHS: Scev0, RHS: Scev1);
3915 if (auto *Const = dyn_cast<SCEVConstant>(Val: ScevDiff)) {
3916 APInt V = Const->getAPInt();
3917 if (V.isSignedIntN(N: 8 * sizeof(int)))
3918 return static_cast<int>(V.getSExtValue());
3919 }
3920
3921 struct Builder : IRBuilder<> {
3922 Builder(BasicBlock *B) : IRBuilder<>(B->getTerminator()) {}
3923 ~Builder() {
3924 for (Instruction *I : llvm::reverse(C&: ToErase))
3925 I->eraseFromParent();
3926 }
3927 SmallVector<Instruction *, 8> ToErase;
3928 };
3929
3930#define CallBuilder(B, F) \
3931 [&](auto &B_) { \
3932 Value *V = B_.F; \
3933 if (auto *I = dyn_cast<Instruction>(V)) \
3934 B_.ToErase.push_back(I); \
3935 return V; \
3936 }(B)
3937
3938 auto Simplify = [this](Value *V) {
3939 if (Value *S = simplify(V))
3940 return S;
3941 return V;
3942 };
3943
3944 auto StripBitCast = [](Value *V) {
3945 while (auto *C = dyn_cast<BitCastInst>(Val: V))
3946 V = C->getOperand(i_nocapture: 0);
3947 return V;
3948 };
3949
3950 Ptr0 = StripBitCast(Ptr0);
3951 Ptr1 = StripBitCast(Ptr1);
3952 if (!isa<GetElementPtrInst>(Val: Ptr0) || !isa<GetElementPtrInst>(Val: Ptr1))
3953 return std::nullopt;
3954
3955 auto *Gep0 = cast<GetElementPtrInst>(Val: Ptr0);
3956 auto *Gep1 = cast<GetElementPtrInst>(Val: Ptr1);
3957 if (Gep0->getPointerOperand() != Gep1->getPointerOperand())
3958 return std::nullopt;
3959 if (Gep0->getSourceElementType() != Gep1->getSourceElementType())
3960 return std::nullopt;
3961
3962 Builder B(Gep0->getParent());
3963 int Scale = getSizeOf(Ty: Gep0->getSourceElementType(), Kind: Alloc);
3964
3965 // FIXME: for now only check GEPs with a single index.
3966 if (Gep0->getNumOperands() != 2 || Gep1->getNumOperands() != 2)
3967 return std::nullopt;
3968
3969 Value *Idx0 = Gep0->getOperand(i_nocapture: 1);
3970 Value *Idx1 = Gep1->getOperand(i_nocapture: 1);
3971
3972 // First, try to simplify the subtraction directly.
3973 if (auto *Diff = dyn_cast<ConstantInt>(
3974 Val: Simplify(CallBuilder(B, CreateSub(Idx0, Idx1)))))
3975 return Diff->getSExtValue() * Scale;
3976
3977 KnownBits Known0 = getKnownBits(V: Idx0, CtxI: Gep0);
3978 KnownBits Known1 = getKnownBits(V: Idx1, CtxI: Gep1);
3979 APInt Unknown = ~(Known0.Zero | Known0.One) | ~(Known1.Zero | Known1.One);
3980 if (Unknown.isAllOnes())
3981 return std::nullopt;
3982
3983 Value *MaskU = ConstantInt::get(Ty: Idx0->getType(), V: Unknown);
3984 Value *AndU0 = Simplify(CallBuilder(B, CreateAnd(Idx0, MaskU)));
3985 Value *AndU1 = Simplify(CallBuilder(B, CreateAnd(Idx1, MaskU)));
3986 Value *SubU = Simplify(CallBuilder(B, CreateSub(AndU0, AndU1)));
3987 int Diff0 = 0;
3988 if (auto *C = dyn_cast<ConstantInt>(Val: SubU)) {
3989 Diff0 = C->getSExtValue();
3990 } else {
3991 return std::nullopt;
3992 }
3993
3994 Value *MaskK = ConstantInt::get(Ty: MaskU->getType(), V: ~Unknown);
3995 Value *AndK0 = Simplify(CallBuilder(B, CreateAnd(Idx0, MaskK)));
3996 Value *AndK1 = Simplify(CallBuilder(B, CreateAnd(Idx1, MaskK)));
3997 Value *SubK = Simplify(CallBuilder(B, CreateSub(AndK0, AndK1)));
3998 int Diff1 = 0;
3999 if (auto *C = dyn_cast<ConstantInt>(Val: SubK)) {
4000 Diff1 = C->getSExtValue();
4001 } else {
4002 return std::nullopt;
4003 }
4004
4005 return (Diff0 + Diff1) * Scale;
4006
4007#undef CallBuilder
4008}
4009
4010auto HexagonVectorCombine::getNumSignificantBits(const Value *V,
4011 const Instruction *CtxI) const
4012 -> unsigned {
4013 return ComputeMaxSignificantBits(Op: V, DL, AC: &AC, CxtI: CtxI, DT: &DT);
4014}
4015
4016auto HexagonVectorCombine::getKnownBits(const Value *V,
4017 const Instruction *CtxI) const
4018 -> KnownBits {
4019 return computeKnownBits(V, DL, AC: &AC, CxtI: CtxI, DT: &DT);
4020}
4021
4022auto HexagonVectorCombine::isSafeToClone(const Instruction &In) const -> bool {
4023 if (In.mayHaveSideEffects() || In.isAtomic() || In.isVolatile() ||
4024 In.isFenceLike() || In.mayReadOrWriteMemory()) {
4025 return false;
4026 }
4027 if (isa<CallBase>(Val: In) || isa<AllocaInst>(Val: In))
4028 return false;
4029 return true;
4030}
4031
4032template <typename T>
4033auto HexagonVectorCombine::isSafeToMoveBeforeInBB(const Instruction &In,
4034 BasicBlock::const_iterator To,
4035 const T &IgnoreInsts) const
4036 -> bool {
4037 auto getLocOrNone =
4038 [this](const Instruction &I) -> std::optional<MemoryLocation> {
4039 if (const auto *II = dyn_cast<IntrinsicInst>(Val: &I)) {
4040 switch (II->getIntrinsicID()) {
4041 case Intrinsic::masked_load:
4042 return MemoryLocation::getForArgument(Call: II, ArgIdx: 0, TLI);
4043 case Intrinsic::masked_store:
4044 return MemoryLocation::getForArgument(Call: II, ArgIdx: 1, TLI);
4045 }
4046 }
4047 return MemoryLocation::getOrNone(Inst: &I);
4048 };
4049
4050 // The source and the destination must be in the same basic block.
4051 const BasicBlock &Block = *In.getParent();
4052 assert(Block.begin() == To || Block.end() == To || To->getParent() == &Block);
4053 // No PHIs.
4054 if (isa<PHINode>(Val: In) || (To != Block.end() && isa<PHINode>(Val: *To)))
4055 return false;
4056
4057 if (!mayHaveNonDefUseDependency(I: In))
4058 return true;
4059 bool MayWrite = In.mayWriteToMemory();
4060 auto MaybeLoc = getLocOrNone(In);
4061
4062 auto From = In.getIterator();
4063 if (From == To)
4064 return true;
4065 bool MoveUp = (To != Block.end() && To->comesBefore(Other: &In));
4066 auto Range =
4067 MoveUp ? std::make_pair(x&: To, y&: From) : std::make_pair(x: std::next(x: From), y&: To);
4068 for (auto It = Range.first; It != Range.second; ++It) {
4069 const Instruction &I = *It;
4070 if (llvm::is_contained(IgnoreInsts, &I))
4071 continue;
4072 // assume intrinsic can be ignored
4073 if (auto *II = dyn_cast<IntrinsicInst>(Val: &I)) {
4074 if (II->getIntrinsicID() == Intrinsic::assume)
4075 continue;
4076 }
4077 // Parts based on isSafeToMoveBefore from CoveMoverUtils.cpp.
4078 if (I.mayThrow())
4079 return false;
4080 if (auto *CB = dyn_cast<CallBase>(Val: &I)) {
4081 if (!CB->hasFnAttr(Kind: Attribute::WillReturn))
4082 return false;
4083 if (!CB->hasFnAttr(Kind: Attribute::NoSync))
4084 return false;
4085 }
4086 if (I.mayReadOrWriteMemory()) {
4087 auto MaybeLocI = getLocOrNone(I);
4088 if (MayWrite || I.mayWriteToMemory()) {
4089 if (!MaybeLoc || !MaybeLocI)
4090 return false;
4091 if (!AA.isNoAlias(*MaybeLoc, *MaybeLocI))
4092 return false;
4093 }
4094 }
4095 }
4096 return true;
4097}
4098
4099auto HexagonVectorCombine::isByteVecTy(Type *Ty) const -> bool {
4100 if (auto *VecTy = dyn_cast<VectorType>(Val: Ty))
4101 return VecTy->getElementType() == getByteTy();
4102 return false;
4103}
4104
4105auto HexagonVectorCombine::getElementRange(IRBuilderBase &Builder, Value *Lo,
4106 Value *Hi, int Start,
4107 int Length) const -> Value * {
4108 assert(0 <= Start && size_t(Start + Length) < length(Lo) + length(Hi));
4109 SmallVector<int, 128> SMask(Length);
4110 std::iota(first: SMask.begin(), last: SMask.end(), value: Start);
4111 return Builder.CreateShuffleVector(V1: Lo, V2: Hi, Mask: SMask, Name: "shf");
4112}
4113
4114// Pass management.
4115
4116namespace {
4117class HexagonVectorCombineLegacy : public FunctionPass {
4118public:
4119 static char ID;
4120
4121 HexagonVectorCombineLegacy() : FunctionPass(ID) {}
4122
4123 StringRef getPassName() const override { return "Hexagon Vector Combine"; }
4124
4125 void getAnalysisUsage(AnalysisUsage &AU) const override {
4126 AU.setPreservesCFG();
4127 AU.addRequired<AAResultsWrapperPass>();
4128 AU.addRequired<AssumptionCacheTracker>();
4129 AU.addRequired<DominatorTreeWrapperPass>();
4130 AU.addRequired<ScalarEvolutionWrapperPass>();
4131 AU.addRequired<TargetLibraryInfoWrapperPass>();
4132 AU.addRequired<TargetPassConfig>();
4133 AU.addRequired<OptimizationRemarkEmitterWrapperPass>();
4134 FunctionPass::getAnalysisUsage(AU);
4135 }
4136
4137 bool runOnFunction(Function &F) override {
4138 if (skipFunction(F))
4139 return false;
4140 AliasAnalysis &AA = getAnalysis<AAResultsWrapperPass>().getAAResults();
4141 AssumptionCache &AC =
4142 getAnalysis<AssumptionCacheTracker>().getAssumptionCache(F);
4143 DominatorTree &DT = getAnalysis<DominatorTreeWrapperPass>().getDomTree();
4144 ScalarEvolution &SE = getAnalysis<ScalarEvolutionWrapperPass>().getSE();
4145 TargetLibraryInfo &TLI =
4146 getAnalysis<TargetLibraryInfoWrapperPass>().getTLI(F);
4147 auto &TM = getAnalysis<TargetPassConfig>().getTM<HexagonTargetMachine>();
4148 auto &ORE = getAnalysis<OptimizationRemarkEmitterWrapperPass>().getORE();
4149 HexagonVectorCombine HVC(F, AA, AC, DT, SE, TLI, TM, ORE);
4150 return HVC.run();
4151 }
4152};
4153} // namespace
4154
// Definition of the static pass identifier declared in
// HexagonVectorCombineLegacy; the pass constructor hands it to FunctionPass.
4155char HexagonVectorCombineLegacy::ID = 0;
4156
// Pass registration. The dependency list names the same analyses that
// HexagonVectorCombineLegacy::getAnalysisUsage marks as required.
4157INITIALIZE_PASS_BEGIN(HexagonVectorCombineLegacy, DEBUG_TYPE,
4158 "Hexagon Vector Combine", false, false)
4159INITIALIZE_PASS_DEPENDENCY(AAResultsWrapperPass)
4160INITIALIZE_PASS_DEPENDENCY(AssumptionCacheTracker)
4161INITIALIZE_PASS_DEPENDENCY(DominatorTreeWrapperPass)
4162INITIALIZE_PASS_DEPENDENCY(ScalarEvolutionWrapperPass)
4163INITIALIZE_PASS_DEPENDENCY(TargetLibraryInfoWrapperPass)
4164INITIALIZE_PASS_DEPENDENCY(TargetPassConfig)
4165INITIALIZE_PASS_DEPENDENCY(OptimizationRemarkEmitterWrapperPass)
4166INITIALIZE_PASS_END(HexagonVectorCombineLegacy, DEBUG_TYPE,
4167 "Hexagon Vector Combine", false, false)
4168
4169FunctionPass *llvm::createHexagonVectorCombineLegacyPass() {
4170 return new HexagonVectorCombineLegacy();
4171}
4172