1//===-- HexagonVectorCombine.cpp ------------------------------------------===//
2//
3// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4// See https://llvm.org/LICENSE.txt for license information.
5// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6//
7//===----------------------------------------------------------------------===//
8// HexagonVectorCombine is a utility class implementing a variety of functions
9// that assist in vector-based optimizations.
10//
11// AlignVectors: replace unaligned vector loads and stores with aligned ones.
12// HvxIdioms: recognize various opportunities to generate HVX intrinsic code.
13//===----------------------------------------------------------------------===//
14
15#include "llvm/ADT/APInt.h"
16#include "llvm/ADT/ArrayRef.h"
17#include "llvm/ADT/DenseMap.h"
18#include "llvm/ADT/MapVector.h"
19#include "llvm/ADT/STLExtras.h"
20#include "llvm/ADT/SmallVector.h"
21#include "llvm/Analysis/AliasAnalysis.h"
22#include "llvm/Analysis/AssumeBundleQueries.h"
23#include "llvm/Analysis/AssumptionCache.h"
24#include "llvm/Analysis/InstSimplifyFolder.h"
25#include "llvm/Analysis/InstructionSimplify.h"
26#include "llvm/Analysis/ScalarEvolutionExpressions.h"
27#include "llvm/Analysis/TargetLibraryInfo.h"
28#include "llvm/Analysis/ValueTracking.h"
29#include "llvm/Analysis/VectorUtils.h"
30#include "llvm/CodeGen/TargetPassConfig.h"
31#include "llvm/CodeGen/ValueTypes.h"
32#include "llvm/IR/Dominators.h"
33#include "llvm/IR/IRBuilder.h"
34#include "llvm/IR/IntrinsicInst.h"
35#include "llvm/IR/Intrinsics.h"
36#include "llvm/IR/IntrinsicsHexagon.h"
37#include "llvm/IR/Metadata.h"
38#include "llvm/IR/PatternMatch.h"
39#include "llvm/InitializePasses.h"
40#include "llvm/Pass.h"
41#include "llvm/Support/CommandLine.h"
42#include "llvm/Support/KnownBits.h"
43#include "llvm/Support/MathExtras.h"
44#include "llvm/Support/raw_ostream.h"
45#include "llvm/Target/TargetMachine.h"
46#include "llvm/Transforms/Utils/Local.h"
47
48#include "Hexagon.h"
49#include "HexagonSubtarget.h"
50#include "HexagonTargetMachine.h"
51
52#include <algorithm>
53#include <deque>
54#include <map>
55#include <optional>
56#include <set>
57#include <utility>
58#include <vector>
59
60#define DEBUG_TYPE "hexagon-vc"
61
// This constant represents the default HVX VTCM page size. It is boot-time
// configurable, so we probably want an API to read it, but for now assume
// 128KB.
65#define DEFAULT_HVX_VTCM_PAGE_SIZE 131072
66
67using namespace llvm;
68
69namespace {
cl::opt<bool> DumpModule("hvc-dump-module", cl::Hidden);
cl::opt<bool> VAEnabled("hvc-va", cl::Hidden, cl::init(true)); // Align
cl::opt<bool> VIEnabled("hvc-vi", cl::Hidden, cl::init(true)); // Idioms
cl::opt<bool> VADoFullStores("hvc-va-full-stores", cl::Hidden);

cl::opt<unsigned> VAGroupCountLimit("hvc-va-group-count-limit", cl::Hidden,
                                    cl::init(~0));
cl::opt<unsigned> VAGroupSizeLimit("hvc-va-group-size-limit", cl::Hidden,
                                   cl::init(~0));
cl::opt<unsigned>
    MinLoadGroupSizeForAlignment("hvc-ld-min-group-size-for-alignment",
                                 cl::Hidden, cl::init(4));
82
83class HexagonVectorCombine {
84public:
85 HexagonVectorCombine(Function &F_, AliasAnalysis &AA_, AssumptionCache &AC_,
86 DominatorTree &DT_, ScalarEvolution &SE_,
87 TargetLibraryInfo &TLI_, const TargetMachine &TM_)
88 : F(F_), DL(F.getDataLayout()), AA(AA_), AC(AC_), DT(DT_),
89 SE(SE_), TLI(TLI_),
90 HST(static_cast<const HexagonSubtarget &>(*TM_.getSubtargetImpl(F))) {}
91
92 bool run();
93
94 // Common integer type.
95 IntegerType *getIntTy(unsigned Width = 32) const;
  // Byte type: either scalar (when ElemCount = 0), or vector with the given
  // element count.
  Type *getByteTy(int ElemCount = 0) const;
  // Boolean type: either scalar (when ElemCount = 0), or vector with the given
  // element count.
  Type *getBoolTy(int ElemCount = 0) const;
102 // Create a ConstantInt of type returned by getIntTy with the value Val.
103 ConstantInt *getConstInt(int Val, unsigned Width = 32) const;
104 // Get the integer value of V, if it exists.
105 std::optional<APInt> getIntValue(const Value *Val) const;
106 // Is Val a constant 0, or a vector of 0s?
107 bool isZero(const Value *Val) const;
108 // Is Val an undef value?
109 bool isUndef(const Value *Val) const;
110 // Is Val a scalar (i1 true) or a vector of (i1 true)?
111 bool isTrue(const Value *Val) const;
112 // Is Val a scalar (i1 false) or a vector of (i1 false)?
113 bool isFalse(const Value *Val) const;
114
115 // Get HVX vector type with the given element type.
116 VectorType *getHvxTy(Type *ElemTy, bool Pair = false) const;
117
118 enum SizeKind {
119 Store, // Store size
120 Alloc, // Alloc size
121 };
122 int getSizeOf(const Value *Val, SizeKind Kind = Store) const;
123 int getSizeOf(const Type *Ty, SizeKind Kind = Store) const;
124 int getTypeAlignment(Type *Ty) const;
125 size_t length(Value *Val) const;
126 size_t length(Type *Ty) const;
127
128 Value *simplify(Value *Val) const;
129
130 Value *insertb(IRBuilderBase &Builder, Value *Dest, Value *Src, int Start,
131 int Length, int Where) const;
132 Value *vlalignb(IRBuilderBase &Builder, Value *Lo, Value *Hi,
133 Value *Amt) const;
134 Value *vralignb(IRBuilderBase &Builder, Value *Lo, Value *Hi,
135 Value *Amt) const;
136 Value *concat(IRBuilderBase &Builder, ArrayRef<Value *> Vecs) const;
137 Value *vresize(IRBuilderBase &Builder, Value *Val, int NewSize,
138 Value *Pad) const;
139 Value *rescale(IRBuilderBase &Builder, Value *Mask, Type *FromTy,
140 Type *ToTy) const;
141 Value *vlsb(IRBuilderBase &Builder, Value *Val) const;
142 Value *vbytes(IRBuilderBase &Builder, Value *Val) const;
143 Value *subvector(IRBuilderBase &Builder, Value *Val, unsigned Start,
144 unsigned Length) const;
145 Value *sublo(IRBuilderBase &Builder, Value *Val) const;
146 Value *subhi(IRBuilderBase &Builder, Value *Val) const;
147 Value *vdeal(IRBuilderBase &Builder, Value *Val0, Value *Val1) const;
148 Value *vshuff(IRBuilderBase &Builder, Value *Val0, Value *Val1) const;
149
150 Value *createHvxIntrinsic(IRBuilderBase &Builder, Intrinsic::ID IntID,
151 Type *RetTy, ArrayRef<Value *> Args,
152 ArrayRef<Type *> ArgTys = {},
153 ArrayRef<Value *> MDSources = {}) const;
154 SmallVector<Value *> splitVectorElements(IRBuilderBase &Builder, Value *Vec,
155 unsigned ToWidth) const;
156 Value *joinVectorElements(IRBuilderBase &Builder, ArrayRef<Value *> Values,
157 VectorType *ToType) const;
158
159 std::optional<int> calculatePointerDifference(Value *Ptr0, Value *Ptr1) const;
160
161 unsigned getNumSignificantBits(const Value *V,
162 const Instruction *CtxI = nullptr) const;
163 KnownBits getKnownBits(const Value *V,
164 const Instruction *CtxI = nullptr) const;
165
166 bool isSafeToClone(const Instruction &In) const;
167
168 template <typename T = std::vector<Instruction *>>
169 bool isSafeToMoveBeforeInBB(const Instruction &In,
170 BasicBlock::const_iterator To,
171 const T &IgnoreInsts = {}) const;
172
173 // This function is only used for assertions at the moment.
174 [[maybe_unused]] bool isByteVecTy(Type *Ty) const;
175
176 Function &F;
177 const DataLayout &DL;
178 AliasAnalysis &AA;
179 AssumptionCache &AC;
180 DominatorTree &DT;
181 ScalarEvolution &SE;
182 TargetLibraryInfo &TLI;
183 const HexagonSubtarget &HST;
184
185private:
186 Value *getElementRange(IRBuilderBase &Builder, Value *Lo, Value *Hi,
187 int Start, int Length) const;
188};
189
190class AlignVectors {
191 // This code tries to replace unaligned vector loads/stores with aligned
192 // ones.
193 // Consider unaligned load:
194 // %v = original_load %some_addr, align <bad>
195 // %user = %v
196 // It will generate
197 // = load ..., align <good>
198 // = load ..., align <good>
199 // = valign
200 // etc.
201 // %synthesize = combine/shuffle the loaded data so that it looks
202 // exactly like what "original_load" has loaded.
203 // %user = %synthesize
204 // Similarly for stores.
205public:
206 AlignVectors(const HexagonVectorCombine &HVC_) : HVC(HVC_) {}
207
208 bool run();
209
210private:
211 using InstList = std::vector<Instruction *>;
212 using InstMap = DenseMap<Instruction *, Instruction *>;
213
214 struct AddrInfo {
215 AddrInfo(const AddrInfo &) = default;
216 AddrInfo &operator=(const AddrInfo &) = default;
217 AddrInfo(const HexagonVectorCombine &HVC, Instruction *I, Value *A, Type *T,
218 Align H)
219 : Inst(I), Addr(A), ValTy(T), HaveAlign(H),
220 NeedAlign(HVC.getTypeAlignment(Ty: ValTy)) {}
221
222 // XXX: add Size member?
223 Instruction *Inst;
224 Value *Addr;
225 Type *ValTy;
226 Align HaveAlign;
227 Align NeedAlign;
228 int Offset = 0; // Offset (in bytes) from the first member of the
229 // containing AddrList.
230 };
231 using AddrList = std::vector<AddrInfo>;
232
233 struct InstrLess {
234 bool operator()(const Instruction *A, const Instruction *B) const {
235 return A->comesBefore(Other: B);
236 }
237 };
238 using DepList = std::set<Instruction *, InstrLess>;
239
240 struct MoveGroup {
241 MoveGroup(const AddrInfo &AI, Instruction *B, bool Hvx, bool Load)
242 : Base(B), Main{AI.Inst}, Clones{}, IsHvx(Hvx), IsLoad(Load) {}
243 MoveGroup() = default;
244 Instruction *Base; // Base instruction of the parent address group.
245 InstList Main; // Main group of instructions.
246 InstList Deps; // List of dependencies.
247 InstMap Clones; // Map from original Deps to cloned ones.
248 bool IsHvx; // Is this group of HVX instructions?
249 bool IsLoad; // Is this a load group?
250 };
251 using MoveList = std::vector<MoveGroup>;
252
253 struct ByteSpan {
254 // A representation of "interesting" bytes within a given span of memory.
255 // These bytes are those that are loaded or stored, and they don't have
256 // to cover the entire span of memory.
257 //
258 // The representation works by picking a contiguous sequence of bytes
259 // from somewhere within a llvm::Value, and placing it at a given offset
260 // within the span.
261 //
    // The sequence of bytes from an llvm::Value is represented by Segment.
263 // Block is Segment, plus where it goes in the span.
264 //
265 // An important feature of ByteSpan is being able to make a "section",
266 // i.e. creating another ByteSpan corresponding to a range of offsets
267 // relative to the source span.
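    //
    // For example, given Blocks = {Block(V0, /*Off=*/0, /*Len=*/16, /*Pos=*/0),
    // Block(V1, 0, 16, /*Pos=*/16)}, section(8, 16) returns
    // {Block(V0, 8, 8, /*Pos=*/8), Block(V1, 0, 8, /*Pos=*/16)}. Positions are
    // kept in the coordinates of the source span; use shift() to rebase them.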
268
269 struct Segment {
270 // Segment of a Value: 'Len' bytes starting at byte 'Begin'.
271 Segment(Value *Val, int Begin, int Len)
272 : Val(Val), Start(Begin), Size(Len) {}
273 Segment(const Segment &Seg) = default;
274 Segment &operator=(const Segment &Seg) = default;
275 Value *Val; // Value representable as a sequence of bytes.
276 int Start; // First byte of the value that belongs to the segment.
277 int Size; // Number of bytes in the segment.
278 };
279
280 struct Block {
281 Block(Value *Val, int Len, int Pos) : Seg(Val, 0, Len), Pos(Pos) {}
282 Block(Value *Val, int Off, int Len, int Pos)
283 : Seg(Val, Off, Len), Pos(Pos) {}
284 Block(const Block &Blk) = default;
285 Block &operator=(const Block &Blk) = default;
286 Segment Seg; // Value segment.
287 int Pos; // Position (offset) of the block in the span.
288 };
289
290 int extent() const;
291 ByteSpan section(int Start, int Length) const;
292 ByteSpan &shift(int Offset);
293 SmallVector<Value *, 8> values() const;
294
295 int size() const { return Blocks.size(); }
296 Block &operator[](int i) { return Blocks[i]; }
297 const Block &operator[](int i) const { return Blocks[i]; }
298
299 std::vector<Block> Blocks;
300
301 using iterator = decltype(Blocks)::iterator;
302 iterator begin() { return Blocks.begin(); }
303 iterator end() { return Blocks.end(); }
304 using const_iterator = decltype(Blocks)::const_iterator;
305 const_iterator begin() const { return Blocks.begin(); }
306 const_iterator end() const { return Blocks.end(); }
307 };
308
309 std::optional<AddrInfo> getAddrInfo(Instruction &In) const;
310 bool isHvx(const AddrInfo &AI) const;
311 // This function is only used for assertions at the moment.
312 [[maybe_unused]] bool isSectorTy(Type *Ty) const;
313
314 Value *getPayload(Value *Val) const;
315 Value *getMask(Value *Val) const;
316 Value *getPassThrough(Value *Val) const;
317
318 Value *createAdjustedPointer(IRBuilderBase &Builder, Value *Ptr, Type *ValTy,
319 int Adjust,
320 const InstMap &CloneMap = InstMap()) const;
321 Value *createAlignedPointer(IRBuilderBase &Builder, Value *Ptr, Type *ValTy,
322 int Alignment,
323 const InstMap &CloneMap = InstMap()) const;
324
325 Value *createLoad(IRBuilderBase &Builder, Type *ValTy, Value *Ptr,
326 Value *Predicate, int Alignment, Value *Mask,
327 Value *PassThru, ArrayRef<Value *> MDSources = {}) const;
328 Value *createSimpleLoad(IRBuilderBase &Builder, Type *ValTy, Value *Ptr,
329 int Alignment,
330 ArrayRef<Value *> MDSources = {}) const;
331
332 Value *createStore(IRBuilderBase &Builder, Value *Val, Value *Ptr,
333 Value *Predicate, int Alignment, Value *Mask,
334 ArrayRef<Value *> MDSources = {}) const;
335 Value *createSimpleStore(IRBuilderBase &Builder, Value *Val, Value *Ptr,
336 int Alignment,
337 ArrayRef<Value *> MDSources = {}) const;
338
339 Value *createPredicatedLoad(IRBuilderBase &Builder, Type *ValTy, Value *Ptr,
340 Value *Predicate, int Alignment,
341 ArrayRef<Value *> MDSources = {}) const;
342 Value *createPredicatedStore(IRBuilderBase &Builder, Value *Val, Value *Ptr,
343 Value *Predicate, int Alignment,
344 ArrayRef<Value *> MDSources = {}) const;
345
346 DepList getUpwardDeps(Instruction *In, Instruction *Base) const;
347 bool createAddressGroups();
348 MoveList createLoadGroups(const AddrList &Group) const;
349 MoveList createStoreGroups(const AddrList &Group) const;
350 bool moveTogether(MoveGroup &Move) const;
351 template <typename T>
352 InstMap cloneBefore(BasicBlock::iterator To, T &&Insts) const;
353
354 void realignLoadGroup(IRBuilderBase &Builder, const ByteSpan &VSpan,
355 int ScLen, Value *AlignVal, Value *AlignAddr) const;
356 void realignStoreGroup(IRBuilderBase &Builder, const ByteSpan &VSpan,
357 int ScLen, Value *AlignVal, Value *AlignAddr) const;
358 bool realignGroup(const MoveGroup &Move);
359 Value *makeTestIfUnaligned(IRBuilderBase &Builder, Value *AlignVal,
360 int Alignment) const;
361
362 using AddrGroupMap = MapVector<Instruction *, AddrList>;
363 AddrGroupMap AddrGroups;
364
365 friend raw_ostream &operator<<(raw_ostream &OS, const AddrList &L);
366 friend raw_ostream &operator<<(raw_ostream &OS, const AddrInfo &AI);
367 friend raw_ostream &operator<<(raw_ostream &OS, const MoveGroup &MG);
368 friend raw_ostream &operator<<(raw_ostream &OS, const MoveList &L);
369 friend raw_ostream &operator<<(raw_ostream &OS, const ByteSpan::Block &B);
370 friend raw_ostream &operator<<(raw_ostream &OS, const ByteSpan &BS);
371 friend raw_ostream &operator<<(raw_ostream &OS, const AddrGroupMap &AG);
379
380 const HexagonVectorCombine &HVC;
381};
382
383[[maybe_unused]] raw_ostream &operator<<(raw_ostream &OS,
384 const AlignVectors::AddrGroupMap &AG) {
  OS << "Printing AddrGroups:\n";
387 for (auto &It : AG) {
388 OS << "\n\tInstruction: ";
389 It.first->dump();
390 OS << "\n\tAddrInfo: ";
391 for (auto &AI : It.second)
392 OS << AI << "\n";
393 }
394 return OS;
395}
396
397[[maybe_unused]] raw_ostream &operator<<(raw_ostream &OS,
398 const AlignVectors::AddrList &AL) {
399 OS << "\n *** Addr List: ***\n";
400 for (auto &AG : AL) {
401 OS << "\n *** Addr Group: ***\n";
402 OS << AG;
403 OS << "\n";
404 }
405 return OS;
406}
407
408[[maybe_unused]] raw_ostream &operator<<(raw_ostream &OS,
409 const AlignVectors::AddrInfo &AI) {
410 OS << "Inst: " << AI.Inst << " " << *AI.Inst << '\n';
411 OS << "Addr: " << *AI.Addr << '\n';
412 OS << "Type: " << *AI.ValTy << '\n';
413 OS << "HaveAlign: " << AI.HaveAlign.value() << '\n';
414 OS << "NeedAlign: " << AI.NeedAlign.value() << '\n';
415 OS << "Offset: " << AI.Offset;
416 return OS;
417}
418
419[[maybe_unused]] raw_ostream &operator<<(raw_ostream &OS,
420 const AlignVectors::MoveList &ML) {
421 OS << "\n *** Move List: ***\n";
422 for (auto &MG : ML) {
423 OS << "\n *** Move Group: ***\n";
424 OS << MG;
425 OS << "\n";
426 }
427 return OS;
428}
429
430[[maybe_unused]] raw_ostream &operator<<(raw_ostream &OS,
431 const AlignVectors::MoveGroup &MG) {
432 OS << "IsLoad:" << (MG.IsLoad ? "yes" : "no");
433 OS << ", IsHvx:" << (MG.IsHvx ? "yes" : "no") << '\n';
434 OS << "Main\n";
435 for (Instruction *I : MG.Main)
436 OS << " " << *I << '\n';
437 OS << "Deps\n";
438 for (Instruction *I : MG.Deps)
439 OS << " " << *I << '\n';
440 OS << "Clones\n";
441 for (auto [K, V] : MG.Clones) {
442 OS << " ";
443 K->printAsOperand(O&: OS, PrintType: false);
444 OS << "\t-> " << *V << '\n';
445 }
446 return OS;
447}
448
449[[maybe_unused]] raw_ostream &
450operator<<(raw_ostream &OS, const AlignVectors::ByteSpan::Block &B) {
451 OS << " @" << B.Pos << " [" << B.Seg.Start << ',' << B.Seg.Size << "] ";
452 if (B.Seg.Val == reinterpret_cast<const Value *>(&B)) {
453 OS << "(self:" << B.Seg.Val << ')';
454 } else if (B.Seg.Val != nullptr) {
455 OS << *B.Seg.Val;
456 } else {
457 OS << "(null)";
458 }
459 return OS;
460}
461
462[[maybe_unused]] raw_ostream &operator<<(raw_ostream &OS,
463 const AlignVectors::ByteSpan &BS) {
464 OS << "ByteSpan[size=" << BS.size() << ", extent=" << BS.extent() << '\n';
465 for (const AlignVectors::ByteSpan::Block &B : BS)
466 OS << B << '\n';
467 OS << ']';
468 return OS;
469}
470
471class HvxIdioms {
472public:
473 enum DstQualifier {
474 Undefined = 0,
475 Arithmetic,
476 LdSt,
477 LLVM_Gather,
478 LLVM_Scatter,
479 HEX_Gather_Scatter,
480 HEX_Gather,
481 HEX_Scatter,
482 Call
483 };
484
485 HvxIdioms(const HexagonVectorCombine &HVC_) : HVC(HVC_) {
486 auto *Int32Ty = HVC.getIntTy(Width: 32);
487 HvxI32Ty = HVC.getHvxTy(ElemTy: Int32Ty, /*Pair=*/false);
488 HvxP32Ty = HVC.getHvxTy(ElemTy: Int32Ty, /*Pair=*/true);
489 }
490
491 bool run();
492
493private:
494 enum Signedness { Positive, Signed, Unsigned };
495
496 // Value + sign
497 // This is to keep track of whether the value should be treated as signed
498 // or unsigned, or is known to be positive.
499 struct SValue {
500 Value *Val;
501 Signedness Sgn;
502 };
503
504 struct FxpOp {
505 unsigned Opcode;
506 unsigned Frac; // Number of fraction bits
507 SValue X, Y;
508 // If present, add 1 << RoundAt before shift:
509 std::optional<unsigned> RoundAt;
510 VectorType *ResTy;
511 };
512
513 auto getNumSignificantBits(Value *V, Instruction *In) const
514 -> std::pair<unsigned, Signedness>;
515 auto canonSgn(SValue X, SValue Y) const -> std::pair<SValue, SValue>;
516
517 auto matchFxpMul(Instruction &In) const -> std::optional<FxpOp>;
518 auto processFxpMul(Instruction &In, const FxpOp &Op) const -> Value *;
519
520 auto processFxpMulChopped(IRBuilderBase &Builder, Instruction &In,
521 const FxpOp &Op) const -> Value *;
522 auto createMulQ15(IRBuilderBase &Builder, SValue X, SValue Y,
523 bool Rounding) const -> Value *;
524 auto createMulQ31(IRBuilderBase &Builder, SValue X, SValue Y,
525 bool Rounding) const -> Value *;
526 // Return {Result, Carry}, where Carry is a vector predicate.
527 auto createAddCarry(IRBuilderBase &Builder, Value *X, Value *Y,
528 Value *CarryIn = nullptr) const
529 -> std::pair<Value *, Value *>;
530 auto createMul16(IRBuilderBase &Builder, SValue X, SValue Y) const -> Value *;
531 auto createMulH16(IRBuilderBase &Builder, SValue X, SValue Y) const
532 -> Value *;
533 auto createMul32(IRBuilderBase &Builder, SValue X, SValue Y) const
534 -> std::pair<Value *, Value *>;
535 auto createAddLong(IRBuilderBase &Builder, ArrayRef<Value *> WordX,
536 ArrayRef<Value *> WordY) const -> SmallVector<Value *>;
537 auto createMulLong(IRBuilderBase &Builder, ArrayRef<Value *> WordX,
538 Signedness SgnX, ArrayRef<Value *> WordY,
539 Signedness SgnY) const -> SmallVector<Value *>;
540
541 bool matchMLoad(Instruction &In) const;
542 bool matchMStore(Instruction &In) const;
543 Value *processMLoad(Instruction &In) const;
544 Value *processMStore(Instruction &In) const;
545 std::optional<uint64_t> getAlignment(Instruction &In, Value *ptr) const;
546 std::optional<uint64_t>
547 getAlignmentImpl(Instruction &In, Value *ptr,
548 SmallPtrSet<Value *, 16> &Visited) const;
549 std::optional<uint64_t> getPHIBaseMinAlignment(Instruction &In,
550 PHINode *PN) const;
551
552 // Vector manipulations for Ripple
553 bool matchScatter(Instruction &In) const;
554 bool matchGather(Instruction &In) const;
555 Value *processVScatter(Instruction &In) const;
556 Value *processVGather(Instruction &In) const;
557
558 VectorType *HvxI32Ty;
559 VectorType *HvxP32Ty;
560 const HexagonVectorCombine &HVC;
561
562 friend raw_ostream &operator<<(raw_ostream &, const FxpOp &);
563};
564
565[[maybe_unused]] raw_ostream &operator<<(raw_ostream &OS,
566 const HvxIdioms::FxpOp &Op) {
567 static const char *SgnNames[] = {"Positive", "Signed", "Unsigned"};
568 OS << Instruction::getOpcodeName(Opcode: Op.Opcode) << '.' << Op.Frac;
569 if (Op.RoundAt.has_value()) {
570 if (Op.Frac != 0 && *Op.RoundAt == Op.Frac - 1) {
571 OS << ":rnd";
572 } else {
573 OS << " + 1<<" << *Op.RoundAt;
574 }
575 }
576 OS << "\n X:(" << SgnNames[Op.X.Sgn] << ") " << *Op.X.Val << "\n"
577 << " Y:(" << SgnNames[Op.Y.Sgn] << ") " << *Op.Y.Val;
578 return OS;
579}
580
581} // namespace
582
583namespace {
584
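// Candidate selection helpers: loads and stores qualify only if they are
// unordered, i.e. non-volatile and no stronger than unordered atomic; any
// other instruction kind is matched with a plain dyn_cast.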
585template <typename T> T *getIfUnordered(T *MaybeT) {
586 return MaybeT && MaybeT->isUnordered() ? MaybeT : nullptr;
587}
588template <typename T> T *isCandidate(Instruction *In) {
589 return dyn_cast<T>(In);
590}
591template <> LoadInst *isCandidate<LoadInst>(Instruction *In) {
592 return getIfUnordered(MaybeT: dyn_cast<LoadInst>(Val: In));
593}
594template <> StoreInst *isCandidate<StoreInst>(Instruction *In) {
595 return getIfUnordered(MaybeT: dyn_cast<StoreInst>(Val: In));
596}
597
598// Forward other erase_ifs to the LLVM implementations.
599template <typename Pred, typename T> void erase_if(T &&container, Pred p) {
600 llvm::erase_if(std::forward<T>(container), p);
601}
602
603} // namespace
604
605// --- Begin AlignVectors
606
607// For brevity, only consider loads. We identify a group of loads where we
608// know the relative differences between their addresses, so we know how they
609// are laid out in memory (relative to one another). These loads can overlap,
// and can be shorter or longer than the desired vector length.
611// Ultimately we want to generate a sequence of aligned loads that will load
612// every byte that the original loads loaded, and have the program use these
613// loaded values instead of the original loads.
614// We consider the contiguous memory area spanned by all these loads.
615//
616// Let's say that a single aligned vector load can load 16 bytes at a time.
// If the program wants to use a byte at offset 13 from the beginning of the
// original span, that byte will be at offset 13+x in the aligned data for
// some x>=0. This may happen to be in the first aligned load, or in the load
// following it. Since we generally don't know what that alignment value is
// at compile time, we proactively do valigns on the aligned loads, so that
// the byte that was at offset 13 is still at offset 13 after the valigns.
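//
// For example, if the original loads cover bytes [0, 40) of the span and an
// aligned load is 16 bytes wide, three aligned sector loads cover the span
// (plus one extra trailing load when the run-time alignment amount is not
// known to be zero), and adjacent loads are combined with valigns as above.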
623//
624// This will be the starting point for making the rest of the program use the
625// data loaded by the new loads.
626// For each original load, and its users:
627// %v = load ...
628// ... = %v
629// ... = %v
630// we create
631// %new_v = extract/combine/shuffle data from loaded/valigned vectors so
632// it contains the same value as %v did before
633// then replace all users of %v with %new_v.
634// ... = %new_v
635// ... = %new_v
636
637auto AlignVectors::ByteSpan::extent() const -> int {
638 if (size() == 0)
639 return 0;
640 int Min = Blocks[0].Pos;
641 int Max = Blocks[0].Pos + Blocks[0].Seg.Size;
642 for (int i = 1, e = size(); i != e; ++i) {
643 Min = std::min(a: Min, b: Blocks[i].Pos);
644 Max = std::max(a: Max, b: Blocks[i].Pos + Blocks[i].Seg.Size);
645 }
646 return Max - Min;
647}
648
649auto AlignVectors::ByteSpan::section(int Start, int Length) const -> ByteSpan {
650 ByteSpan Section;
651 for (const ByteSpan::Block &B : Blocks) {
652 int L = std::max(a: B.Pos, b: Start); // Left end.
653 int R = std::min(a: B.Pos + B.Seg.Size, b: Start + Length); // Right end+1.
654 if (L < R) {
655 // How much to chop off the beginning of the segment:
656 int Off = L > B.Pos ? L - B.Pos : 0;
657 Section.Blocks.emplace_back(args: B.Seg.Val, args: B.Seg.Start + Off, args: R - L, args&: L);
658 }
659 }
660 return Section;
661}
662
663auto AlignVectors::ByteSpan::shift(int Offset) -> ByteSpan & {
664 for (Block &B : Blocks)
665 B.Pos += Offset;
666 return *this;
667}
668
669auto AlignVectors::ByteSpan::values() const -> SmallVector<Value *, 8> {
670 SmallVector<Value *, 8> Values(Blocks.size());
671 for (int i = 0, e = Blocks.size(); i != e; ++i)
672 Values[i] = Blocks[i].Seg.Val;
673 return Values;
674}
675
// Turn a requested integer alignment into the effective Align to use.
// A requested alignment of 0 means "use the ABI alignment of the value type",
// following the old IR semantics where alignment 0 denoted the ABI alignment.
679static Align effectiveAlignForValueTy(const DataLayout &DL, Type *ValTy,
680 int Requested) {
681 if (Requested > 0)
682 return Align(static_cast<uint64_t>(Requested));
683 return Align(DL.getABITypeAlign(Ty: ValTy).value());
684}
685
686auto AlignVectors::getAddrInfo(Instruction &In) const
687 -> std::optional<AddrInfo> {
688 if (auto *L = isCandidate<LoadInst>(In: &In))
689 return AddrInfo(HVC, L, L->getPointerOperand(), L->getType(),
690 L->getAlign());
691 if (auto *S = isCandidate<StoreInst>(In: &In))
692 return AddrInfo(HVC, S, S->getPointerOperand(),
693 S->getValueOperand()->getType(), S->getAlign());
694 if (auto *II = isCandidate<IntrinsicInst>(In: &In)) {
695 Intrinsic::ID ID = II->getIntrinsicID();
696 switch (ID) {
697 case Intrinsic::masked_load:
698 return AddrInfo(HVC, II, II->getArgOperand(i: 0), II->getType(),
699 II->getParamAlign(ArgNo: 0).valueOrOne());
700 case Intrinsic::masked_store:
701 return AddrInfo(HVC, II, II->getArgOperand(i: 1),
702 II->getArgOperand(i: 0)->getType(),
703 II->getParamAlign(ArgNo: 1).valueOrOne());
704 }
705 }
706 return std::nullopt;
707}
708
709auto AlignVectors::isHvx(const AddrInfo &AI) const -> bool {
710 return HVC.HST.isTypeForHVX(VecTy: AI.ValTy);
711}
712
713auto AlignVectors::getPayload(Value *Val) const -> Value * {
714 if (auto *In = dyn_cast<Instruction>(Val)) {
715 Intrinsic::ID ID = 0;
716 if (auto *II = dyn_cast<IntrinsicInst>(Val: In))
717 ID = II->getIntrinsicID();
718 if (isa<StoreInst>(Val: In) || ID == Intrinsic::masked_store)
719 return In->getOperand(i: 0);
720 }
721 return Val;
722}
723
724auto AlignVectors::getMask(Value *Val) const -> Value * {
725 if (auto *II = dyn_cast<IntrinsicInst>(Val)) {
726 switch (II->getIntrinsicID()) {
727 case Intrinsic::masked_load:
728 return II->getArgOperand(i: 1);
729 case Intrinsic::masked_store:
730 return II->getArgOperand(i: 2);
731 }
732 }
733
734 Type *ValTy = getPayload(Val)->getType();
735 if (auto *VecTy = dyn_cast<VectorType>(Val: ValTy))
736 return Constant::getAllOnesValue(Ty: HVC.getBoolTy(ElemCount: HVC.length(Ty: VecTy)));
737 return Constant::getAllOnesValue(Ty: HVC.getBoolTy());
738}
739
740auto AlignVectors::getPassThrough(Value *Val) const -> Value * {
741 if (auto *II = dyn_cast<IntrinsicInst>(Val)) {
742 if (II->getIntrinsicID() == Intrinsic::masked_load)
743 return II->getArgOperand(i: 2);
744 }
745 return UndefValue::get(T: getPayload(Val)->getType());
746}
747
748auto AlignVectors::createAdjustedPointer(IRBuilderBase &Builder, Value *Ptr,
749 Type *ValTy, int Adjust,
750 const InstMap &CloneMap) const
751 -> Value * {
752 if (auto *I = dyn_cast<Instruction>(Val: Ptr))
753 if (Instruction *New = CloneMap.lookup(Val: I))
754 Ptr = New;
755 return Builder.CreatePtrAdd(Ptr, Offset: HVC.getConstInt(Val: Adjust), Name: "gep");
756}
757
758auto AlignVectors::createAlignedPointer(IRBuilderBase &Builder, Value *Ptr,
759 Type *ValTy, int Alignment,
760 const InstMap &CloneMap) const
761 -> Value * {
762 auto remap = [&](Value *V) -> Value * {
763 if (auto *I = dyn_cast<Instruction>(Val: V)) {
764 for (auto [Old, New] : CloneMap)
765 I->replaceUsesOfWith(From: Old, To: New);
766 return I;
767 }
768 return V;
769 };
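  // Align the pointer down by clearing its low bits, i.e. compute
  // (intptr)Ptr & -Alignment via a ptrtoint/inttoptr round trip.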
770 Value *AsInt = Builder.CreatePtrToInt(V: Ptr, DestTy: HVC.getIntTy(), Name: "pti");
771 Value *Mask = HVC.getConstInt(Val: -Alignment);
772 Value *And = Builder.CreateAnd(LHS: remap(AsInt), RHS: Mask, Name: "and");
773 return Builder.CreateIntToPtr(
774 V: And, DestTy: PointerType::getUnqual(C&: ValTy->getContext()), Name: "itp");
775}
776
777auto AlignVectors::createLoad(IRBuilderBase &Builder, Type *ValTy, Value *Ptr,
778 Value *Predicate, int Alignment, Value *Mask,
779 Value *PassThru,
780 ArrayRef<Value *> MDSources) const -> Value * {
781 // Predicate is nullptr if not creating predicated load
782 if (Predicate) {
    assert(!Predicate->getType()->isVectorTy() &&
           "Expecting scalar predicate");
785 if (HVC.isFalse(Val: Predicate))
786 return UndefValue::get(T: ValTy);
787 if (!HVC.isTrue(Val: Predicate)) {
788 Value *Load = createPredicatedLoad(Builder, ValTy, Ptr, Predicate,
789 Alignment, MDSources);
790 return Builder.CreateSelect(C: Mask, True: Load, False: PassThru);
791 }
792 // Predicate == true here.
793 }
794 assert(!HVC.isUndef(Mask)); // Should this be allowed?
795 if (HVC.isZero(Val: Mask))
796 return PassThru;
797
798 Align EffA = effectiveAlignForValueTy(DL: HVC.DL, ValTy, Requested: Alignment);
799 if (HVC.isTrue(Val: Mask))
800 return createSimpleLoad(Builder, ValTy, Ptr, Alignment: EffA.value(), MDSources);
801
802 Instruction *Load =
803 Builder.CreateMaskedLoad(Ty: ValTy, Ptr, Alignment: EffA, Mask, PassThru, Name: "mld");
804 LLVM_DEBUG(dbgs() << "\t[Creating masked Load:] "; Load->dump());
805 propagateMetadata(I: Load, VL: MDSources);
806 return Load;
807}
808
809auto AlignVectors::createSimpleLoad(IRBuilderBase &Builder, Type *ValTy,
810 Value *Ptr, int Alignment,
811 ArrayRef<Value *> MDSources) const
812 -> Value * {
813 Align EffA = effectiveAlignForValueTy(DL: HVC.DL, ValTy, Requested: Alignment);
814 Instruction *Load = Builder.CreateAlignedLoad(Ty: ValTy, Ptr, Align: EffA, Name: "ald");
815 propagateMetadata(I: Load, VL: MDSources);
816 LLVM_DEBUG(dbgs() << "\t[Creating Load:] "; Load->dump());
817 return Load;
818}
819
820auto AlignVectors::createPredicatedLoad(IRBuilderBase &Builder, Type *ValTy,
821 Value *Ptr, Value *Predicate,
822 int Alignment,
823 ArrayRef<Value *> MDSources) const
824 -> Value * {
  assert(HVC.HST.isTypeForHVX(ValTy) &&
         "Predicated 'scalar' vector loads not yet supported");
827 assert(Predicate);
  assert(!Predicate->getType()->isVectorTy() && "Expecting scalar predicate");
829 Align EffA = effectiveAlignForValueTy(DL: HVC.DL, ValTy, Requested: Alignment);
830 assert(HVC.getSizeOf(ValTy, HVC.Alloc) % EffA.value() == 0);
831
832 if (HVC.isFalse(Val: Predicate))
833 return UndefValue::get(T: ValTy);
834 if (HVC.isTrue(Val: Predicate))
835 return createSimpleLoad(Builder, ValTy, Ptr, Alignment: EffA.value(), MDSources);
836
837 auto V6_vL32b_pred_ai = HVC.HST.getIntrinsicId(Opc: Hexagon::V6_vL32b_pred_ai);
838 // FIXME: This may not put the offset from Ptr into the vmem offset.
839 return HVC.createHvxIntrinsic(Builder, IntID: V6_vL32b_pred_ai, RetTy: ValTy,
840 Args: {Predicate, Ptr, HVC.getConstInt(Val: 0)}, ArgTys: {},
841 MDSources);
842}
843
844auto AlignVectors::createStore(IRBuilderBase &Builder, Value *Val, Value *Ptr,
845 Value *Predicate, int Alignment, Value *Mask,
846 ArrayRef<Value *> MDSources) const -> Value * {
847 if (HVC.isZero(Val: Mask) || HVC.isUndef(Val) || HVC.isUndef(Val: Mask))
848 return UndefValue::get(T: Val->getType());
  assert(!Predicate || (!Predicate->getType()->isVectorTy() &&
                        "Expecting scalar predicate"));
851 if (Predicate) {
852 if (HVC.isFalse(Val: Predicate))
853 return UndefValue::get(T: Val->getType());
854 if (HVC.isTrue(Val: Predicate))
855 Predicate = nullptr;
856 }
857 // Here both Predicate and Mask are true or unknown.
858
859 if (HVC.isTrue(Val: Mask)) {
860 if (Predicate) { // Predicate unknown
861 return createPredicatedStore(Builder, Val, Ptr, Predicate, Alignment,
862 MDSources);
863 }
864 // Predicate is true:
865 return createSimpleStore(Builder, Val, Ptr, Alignment, MDSources);
866 }
867
868 // Mask is unknown
869 if (!Predicate) {
870 Instruction *Store =
871 Builder.CreateMaskedStore(Val, Ptr, Alignment: Align(Alignment), Mask);
872 propagateMetadata(I: Store, VL: MDSources);
873 return Store;
874 }
875
876 // Both Predicate and Mask are unknown.
877 // Emulate masked store with predicated-load + mux + predicated-store.
878 Value *PredLoad = createPredicatedLoad(Builder, ValTy: Val->getType(), Ptr,
879 Predicate, Alignment, MDSources);
880 Value *Mux = Builder.CreateSelect(C: Mask, True: Val, False: PredLoad);
881 return createPredicatedStore(Builder, Val: Mux, Ptr, Predicate, Alignment,
882 MDSources);
883}
884
885auto AlignVectors::createSimpleStore(IRBuilderBase &Builder, Value *Val,
886 Value *Ptr, int Alignment,
887 ArrayRef<Value *> MDSources) const
888 -> Value * {
889 Align EffA = effectiveAlignForValueTy(DL: HVC.DL, ValTy: Val->getType(), Requested: Alignment);
890 Instruction *Store = Builder.CreateAlignedStore(Val, Ptr, Align: EffA);
891 LLVM_DEBUG(dbgs() << "\t[Creating store:] "; Store->dump());
892 propagateMetadata(I: Store, VL: MDSources);
893 return Store;
894}
895
896auto AlignVectors::createPredicatedStore(IRBuilderBase &Builder, Value *Val,
897 Value *Ptr, Value *Predicate,
898 int Alignment,
899 ArrayRef<Value *> MDSources) const
900 -> Value * {
901 Align EffA = effectiveAlignForValueTy(DL: HVC.DL, ValTy: Val->getType(), Requested: Alignment);
  assert(HVC.HST.isTypeForHVX(Val->getType()) &&
         "Predicated 'scalar' vector stores not yet supported");
904 assert(Predicate);
905 if (HVC.isFalse(Val: Predicate))
906 return UndefValue::get(T: Val->getType());
907 if (HVC.isTrue(Val: Predicate))
908 return createSimpleStore(Builder, Val, Ptr, Alignment: EffA.value(), MDSources);
909
910 assert(HVC.getSizeOf(Val, HVC.Alloc) % EffA.value() == 0);
911 auto V6_vS32b_pred_ai = HVC.HST.getIntrinsicId(Opc: Hexagon::V6_vS32b_pred_ai);
912 // FIXME: This may not put the offset from Ptr into the vmem offset.
913 return HVC.createHvxIntrinsic(Builder, IntID: V6_vS32b_pred_ai, RetTy: nullptr,
914 Args: {Predicate, Ptr, HVC.getConstInt(Val: 0), Val}, ArgTys: {},
915 MDSources);
916}
917
918auto AlignVectors::getUpwardDeps(Instruction *In, Instruction *Base) const
919 -> DepList {
920 BasicBlock *Parent = Base->getParent();
921 assert(In->getParent() == Parent &&
922 "Base and In should be in the same block");
923 assert(Base->comesBefore(In) && "Base should come before In");
924
925 DepList Deps;
926 std::deque<Instruction *> WorkQ = {In};
927 while (!WorkQ.empty()) {
928 Instruction *D = WorkQ.front();
929 WorkQ.pop_front();
930 if (D != In)
931 Deps.insert(x: D);
932 for (Value *Op : D->operands()) {
933 if (auto *I = dyn_cast<Instruction>(Val: Op)) {
934 if (I->getParent() == Parent && Base->comesBefore(Other: I))
935 WorkQ.push_back(x: I);
936 }
937 }
938 }
939 return Deps;
940}
941
942auto AlignVectors::createAddressGroups() -> bool {
943 // An address group created here may contain instructions spanning
944 // multiple basic blocks.
945 AddrList WorkStack;
946
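  // Try to find an existing group whose base address is at a known constant
  // byte distance from AI.Addr. If found, return that group's base instruction
  // and the distance, which becomes AI's offset within the group.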
947 auto findBaseAndOffset = [&](AddrInfo &AI) -> std::pair<Instruction *, int> {
948 for (AddrInfo &W : WorkStack) {
949 if (auto D = HVC.calculatePointerDifference(Ptr0: AI.Addr, Ptr1: W.Addr))
950 return std::make_pair(x&: W.Inst, y&: *D);
951 }
952 return std::make_pair(x: nullptr, y: 0);
953 };
954
955 auto traverseBlock = [&](DomTreeNode *DomN, auto Visit) -> void {
956 BasicBlock &Block = *DomN->getBlock();
957 for (Instruction &I : Block) {
958 auto AI = this->getAddrInfo(In&: I); // Use this-> for gcc6.
959 if (!AI)
960 continue;
961 auto F = findBaseAndOffset(*AI);
962 Instruction *GroupInst;
963 if (Instruction *BI = F.first) {
964 AI->Offset = F.second;
965 GroupInst = BI;
966 } else {
967 WorkStack.push_back(x: *AI);
968 GroupInst = AI->Inst;
969 }
970 AddrGroups[GroupInst].push_back(x: *AI);
971 }
972
973 for (DomTreeNode *C : DomN->children())
974 Visit(C, Visit);
975
976 while (!WorkStack.empty() && WorkStack.back().Inst->getParent() == &Block)
977 WorkStack.pop_back();
978 };
979
980 traverseBlock(HVC.DT.getRootNode(), traverseBlock);
981 assert(WorkStack.empty());
982
983 // AddrGroups are formed.
984 // Remove groups of size 1.
985 AddrGroups.remove_if(Pred: [](auto &G) { return G.second.size() == 1; });
986 // Remove groups that don't use HVX types.
987 AddrGroups.remove_if(Pred: [&](auto &G) {
988 return llvm::none_of(
989 G.second, [&](auto &I) { return HVC.HST.isTypeForHVX(VecTy: I.ValTy); });
990 });
991
992 LLVM_DEBUG(dbgs() << AddrGroups);
993 return !AddrGroups.empty();
994}
995
996auto AlignVectors::createLoadGroups(const AddrList &Group) const -> MoveList {
997 // Form load groups.
998 // To avoid complications with moving code across basic blocks, only form
999 // groups that are contained within a single basic block.
1000 unsigned SizeLimit = VAGroupSizeLimit;
1001 if (SizeLimit == 0)
1002 return {};
1003
1004 auto tryAddTo = [&](const AddrInfo &Info, MoveGroup &Move) {
1005 assert(!Move.Main.empty() && "Move group should have non-empty Main");
1006 if (Move.Main.size() >= SizeLimit)
1007 return false;
1008 // Don't mix HVX and non-HVX instructions.
1009 if (Move.IsHvx != isHvx(AI: Info))
1010 return false;
1011 // Leading instruction in the load group.
1012 Instruction *Base = Move.Main.front();
1013 if (Base->getParent() != Info.Inst->getParent())
1014 return false;
1015 // Check if it's safe to move the load.
1016 if (!HVC.isSafeToMoveBeforeInBB(In: *Info.Inst, To: Base->getIterator()))
1017 return false;
1018 // And if it's safe to clone the dependencies.
1019 auto isSafeToCopyAtBase = [&](const Instruction *I) {
1020 return HVC.isSafeToMoveBeforeInBB(In: *I, To: Base->getIterator()) &&
1021 HVC.isSafeToClone(In: *I);
1022 };
1023 DepList Deps = getUpwardDeps(In: Info.Inst, Base);
1024 if (!llvm::all_of(Range&: Deps, P: isSafeToCopyAtBase))
1025 return false;
1026
1027 Move.Main.push_back(x: Info.Inst);
1028 llvm::append_range(C&: Move.Deps, R&: Deps);
1029 return true;
1030 };
1031
1032 MoveList LoadGroups;
1033
1034 for (const AddrInfo &Info : Group) {
1035 if (!Info.Inst->mayReadFromMemory())
1036 continue;
1037 if (LoadGroups.empty() || !tryAddTo(Info, LoadGroups.back()))
1038 LoadGroups.emplace_back(args: Info, args: Group.front().Inst, args: isHvx(AI: Info), args: true);
1039 }
1040
1041 // Erase groups smaller than the minimum load group size.
1042 unsigned LoadGroupSizeLimit = MinLoadGroupSizeForAlignment;
1043 erase_if(container&: LoadGroups, p: [LoadGroupSizeLimit](const MoveGroup &G) {
1044 return G.Main.size() < LoadGroupSizeLimit;
1045 });
1046
1047 // Erase HVX groups on targets < HvxV62 (due to lack of predicated loads).
1048 if (!HVC.HST.useHVXV62Ops())
1049 erase_if(container&: LoadGroups, p: [](const MoveGroup &G) { return G.IsHvx; });
1050
1051 LLVM_DEBUG(dbgs() << "LoadGroups list: " << LoadGroups);
1052 return LoadGroups;
1053}
1054
1055auto AlignVectors::createStoreGroups(const AddrList &Group) const -> MoveList {
1056 // Form store groups.
1057 // To avoid complications with moving code across basic blocks, only form
1058 // groups that are contained within a single basic block.
1059 unsigned SizeLimit = VAGroupSizeLimit;
1060 if (SizeLimit == 0)
1061 return {};
1062
1063 auto tryAddTo = [&](const AddrInfo &Info, MoveGroup &Move) {
1064 assert(!Move.Main.empty() && "Move group should have non-empty Main");
1065 if (Move.Main.size() >= SizeLimit)
1066 return false;
1067 // For stores with return values we'd have to collect downward dependencies.
1068 // There are no such stores that we handle at the moment, so omit that.
1069 assert(Info.Inst->getType()->isVoidTy() &&
1070 "Not handling stores with return values");
1071 // Don't mix HVX and non-HVX instructions.
1072 if (Move.IsHvx != isHvx(AI: Info))
1073 return false;
1074 // For stores we need to be careful whether it's safe to move them.
1075 // Stores that are otherwise safe to move together may not appear safe
1076 // to move over one another (i.e. isSafeToMoveBefore may return false).
1077 Instruction *Base = Move.Main.front();
1078 if (Base->getParent() != Info.Inst->getParent())
1079 return false;
1080 if (!HVC.isSafeToMoveBeforeInBB(In: *Info.Inst, To: Base->getIterator(), IgnoreInsts: Move.Main))
1081 return false;
1082 Move.Main.push_back(x: Info.Inst);
1083 return true;
1084 };
1085
1086 MoveList StoreGroups;
1087
1088 for (auto I = Group.rbegin(), E = Group.rend(); I != E; ++I) {
1089 const AddrInfo &Info = *I;
1090 if (!Info.Inst->mayWriteToMemory())
1091 continue;
1092 if (StoreGroups.empty() || !tryAddTo(Info, StoreGroups.back()))
1093 StoreGroups.emplace_back(args: Info, args: Group.front().Inst, args: isHvx(AI: Info), args: false);
1094 }
1095
1096 // Erase singleton groups.
1097 erase_if(container&: StoreGroups, p: [](const MoveGroup &G) { return G.Main.size() <= 1; });
1098
1099 // Erase HVX groups on targets < HvxV62 (due to lack of predicated loads).
1100 if (!HVC.HST.useHVXV62Ops())
1101 erase_if(container&: StoreGroups, p: [](const MoveGroup &G) { return G.IsHvx; });
1102
1103 // Erase groups where every store is a full HVX vector. The reason is that
1104 // aligning predicated stores generates complex code that may be less
1105 // efficient than a sequence of unaligned vector stores.
1106 if (!VADoFullStores) {
1107 erase_if(container&: StoreGroups, p: [this](const MoveGroup &G) {
1108 return G.IsHvx && llvm::all_of(Range: G.Main, P: [this](Instruction *S) {
1109 auto MaybeInfo = this->getAddrInfo(In&: *S);
1110 assert(MaybeInfo.has_value());
1111 return HVC.HST.isHVXVectorType(
1112 VecTy: EVT::getEVT(Ty: MaybeInfo->ValTy, HandleUnknown: false));
1113 });
1114 });
1115 }
1116
1117 return StoreGroups;
1118}
1119
1120auto AlignVectors::moveTogether(MoveGroup &Move) const -> bool {
1121 // Move all instructions to be adjacent.
1122 assert(!Move.Main.empty() && "Move group should have non-empty Main");
1123 Instruction *Where = Move.Main.front();
1124
1125 if (Move.IsLoad) {
1126 // Move all the loads (and dependencies) to where the first load is.
1127 // Clone all deps to before Where, keeping order.
1128 Move.Clones = cloneBefore(To: Where->getIterator(), Insts&: Move.Deps);
1129 // Move all main instructions to after Where, keeping order.
1130 ArrayRef<Instruction *> Main(Move.Main);
1131 for (Instruction *M : Main) {
1132 if (M != Where)
1133 M->moveAfter(MovePos: Where);
1134 for (auto [Old, New] : Move.Clones)
1135 M->replaceUsesOfWith(From: Old, To: New);
1136 Where = M;
1137 }
1138 // Replace Deps with the clones.
1139 for (int i = 0, e = Move.Deps.size(); i != e; ++i)
1140 Move.Deps[i] = Move.Clones[Move.Deps[i]];
1141 } else {
1142 // Move all the stores to where the last store is.
1143 // NOTE: Deps are empty for "store" groups. If they need to be
1144 // non-empty, decide on the order.
1145 assert(Move.Deps.empty());
1146 // Move all main instructions to before Where, inverting order.
1147 ArrayRef<Instruction *> Main(Move.Main);
1148 for (Instruction *M : Main.drop_front(N: 1)) {
1149 M->moveBefore(InsertPos: Where->getIterator());
1150 Where = M;
1151 }
1152 }
1153
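  // Report whether the group contains more than one instruction (counting
  // cloned dependencies), i.e. whether anything was actually moved or cloned.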
1154 return Move.Main.size() + Move.Deps.size() > 1;
1155}
1156
1157template <typename T>
1158auto AlignVectors::cloneBefore(BasicBlock::iterator To, T &&Insts) const
1159 -> InstMap {
1160 InstMap Map;
1161
1162 for (Instruction *I : Insts) {
1163 assert(HVC.isSafeToClone(*I));
1164 Instruction *C = I->clone();
1165 C->setName(Twine("c.") + I->getName() + ".");
1166 C->insertBefore(InsertPos: To);
1167
1168 for (auto [Old, New] : Map)
1169 C->replaceUsesOfWith(From: Old, To: New);
1170 Map.insert(KV: std::make_pair(x&: I, y&: C));
1171 }
1172 return Map;
1173}
1174
1175auto AlignVectors::realignLoadGroup(IRBuilderBase &Builder,
1176 const ByteSpan &VSpan, int ScLen,
1177 Value *AlignVal, Value *AlignAddr) const
1178 -> void {
1179 LLVM_DEBUG(dbgs() << __func__ << "\n");
1180
1181 Type *SecTy = HVC.getByteTy(ElemCount: ScLen);
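  // Number of ScLen-byte sectors needed to cover the whole span (rounded up).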
1182 int NumSectors = (VSpan.extent() + ScLen - 1) / ScLen;
1183 bool DoAlign = !HVC.isZero(Val: AlignVal);
1184 BasicBlock::iterator BasePos = Builder.GetInsertPoint();
1185 BasicBlock *BaseBlock = Builder.GetInsertBlock();
1186
1187 ByteSpan ASpan;
1188 auto *True = Constant::getAllOnesValue(Ty: HVC.getBoolTy(ElemCount: ScLen));
1189 auto *Undef = UndefValue::get(T: SecTy);
1190
1191 // Created load does not have to be "Instruction" (e.g. "undef").
1192 SmallVector<Value *> Loads(NumSectors + DoAlign, nullptr);
1193
1194 // We could create all of the aligned loads, and generate the valigns
1195 // at the location of the first load, but for large load groups, this
1196 // could create highly suboptimal code (there have been groups of 140+
1197 // loads in real code).
1198 // Instead, place the loads/valigns as close to the users as possible.
1199 // In any case we need to have a mapping from the blocks of VSpan (the
1200 // span covered by the pre-existing loads) to ASpan (the span covered
1201 // by the aligned loads). There is a small problem, though: ASpan needs
1202 // to have pointers to the loads/valigns, but we don't have these loads
1203 // because we don't know where to put them yet. We find out by creating
1204 // a section of ASpan that corresponds to values (blocks) from VSpan,
1205 // and checking where the new load should be placed. We need to attach
1206 // this location information to each block in ASpan somehow, so we put
  // distinct values for Seg.Val in each ASpan.Blocks[i], and use a map
1208 // to store the location for each Seg.Val.
1209 // The distinct values happen to be Blocks[i].Seg.Val = &Blocks[i],
1210 // which helps with printing ByteSpans without crashing when printing
1211 // Segments with these temporary identifiers in place of Val.
1212
1213 // Populate the blocks first, to avoid reallocations of the vector
1214 // interfering with generating the placeholder addresses.
1215 for (int Index = 0; Index != NumSectors; ++Index)
1216 ASpan.Blocks.emplace_back(args: nullptr, args&: ScLen, args: Index * ScLen);
1217 for (int Index = 0; Index != NumSectors; ++Index) {
1218 ASpan.Blocks[Index].Seg.Val =
1219 reinterpret_cast<Value *>(&ASpan.Blocks[Index]);
1220 }
1221
1222 // Multiple values from VSpan can map to the same value in ASpan. Since we
1223 // try to create loads lazily, we need to find the earliest use for each
1224 // value from ASpan.
1225 DenseMap<void *, Instruction *> EarliestUser;
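  // Compare positions within the block; a null instruction compares as
  // "latest", so taking std::min with this comparator prefers any non-null
  // instruction over nullptr.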
1226 auto isEarlier = [](Instruction *A, Instruction *B) {
1227 if (B == nullptr)
1228 return true;
1229 if (A == nullptr)
1230 return false;
1231 assert(A->getParent() == B->getParent());
1232 return A->comesBefore(Other: B);
1233 };
1234 auto earliestUser = [&](const auto &Uses) {
1235 Instruction *User = nullptr;
1236 for (const Use &U : Uses) {
1237 auto *I = dyn_cast<Instruction>(Val: U.getUser());
1238 assert(I != nullptr && "Load used in a non-instruction?");
1239 // Make sure we only consider users in this block, but we need
1240 // to remember if there were users outside the block too. This is
1241 // because if no users are found, aligned loads will not be created.
1242 if (I->getParent() == BaseBlock) {
1243 if (!isa<PHINode>(Val: I))
1244 User = std::min(a: User, b: I, comp: isEarlier);
1245 } else {
1246 User = std::min(a: User, b: BaseBlock->getTerminator(), comp: isEarlier);
1247 }
1248 }
1249 return User;
1250 };
1251
1252 for (const ByteSpan::Block &B : VSpan) {
1253 ByteSpan ASection = ASpan.section(Start: B.Pos, Length: B.Seg.Size);
1254 for (const ByteSpan::Block &S : ASection) {
1255 auto &EU = EarliestUser[S.Seg.Val];
1256 EU = std::min(a: EU, b: earliestUser(B.Seg.Val->uses()), comp: isEarlier);
1257 }
1258 }
1259
1260 LLVM_DEBUG({
1261 dbgs() << "ASpan:\n" << ASpan << '\n';
1262 dbgs() << "Earliest users of ASpan:\n";
1263 for (auto &[Val, User] : EarliestUser) {
1264 dbgs() << Val << "\n ->" << *User << '\n';
1265 }
1266 });
1267
1268 auto createLoad = [&](IRBuilderBase &Builder, const ByteSpan &VSpan,
1269 int Index, bool MakePred) {
1270 Value *Ptr =
1271 createAdjustedPointer(Builder, Ptr: AlignAddr, ValTy: SecTy, Adjust: Index * ScLen);
1272 Value *Predicate =
1273 MakePred ? makeTestIfUnaligned(Builder, AlignVal, Alignment: ScLen) : nullptr;
1274
1275 // If vector shifting is potentially needed, accumulate metadata
1276 // from source sections of twice the load width.
1277 int Start = (Index - DoAlign) * ScLen;
1278 int Width = (1 + DoAlign) * ScLen;
1279 return this->createLoad(Builder, ValTy: SecTy, Ptr, Predicate, Alignment: ScLen, Mask: True, PassThru: Undef,
1280 MDSources: VSpan.section(Start, Length: Width).values());
1281 };
1282
1283 auto moveBefore = [this](BasicBlock::iterator In, BasicBlock::iterator To) {
1284 // Move In and its upward dependencies to before To.
1285 assert(In->getParent() == To->getParent());
1286 DepList Deps = getUpwardDeps(In: &*In, Base: &*To);
1287 In->moveBefore(InsertPos: To);
1288 // DepList is sorted with respect to positions in the basic block.
1289 InstMap Map = cloneBefore(To: In, Insts&: Deps);
1290 for (auto [Old, New] : Map)
1291 In->replaceUsesOfWith(From: Old, To: New);
1292 };
1293
1294 // Generate necessary loads at appropriate locations.
1295 LLVM_DEBUG(dbgs() << "Creating loads for ASpan sectors\n");
1296 for (int Index = 0; Index != NumSectors + 1; ++Index) {
1297 // In ASpan, each block will be either a single aligned load, or a
1298 // valign of a pair of loads. In the latter case, an aligned load j
1299 // will belong to the current valign, and the one in the previous
1300 // block (for j > 0).
1301 // Place the load at a location which will dominate the valign, assuming
1302 // the valign will be placed right before the earliest user.
1303 Instruction *PrevAt =
1304 DoAlign && Index > 0 ? EarliestUser[&ASpan[Index - 1]] : nullptr;
1305 Instruction *ThisAt =
1306 Index < NumSectors ? EarliestUser[&ASpan[Index]] : nullptr;
1307 if (auto *Where = std::min(a: PrevAt, b: ThisAt, comp: isEarlier)) {
1308 Builder.SetInsertPoint(Where);
1309 Loads[Index] =
1310 createLoad(Builder, VSpan, Index, DoAlign && Index == NumSectors);
1311 // We know it's safe to put the load at BasePos, but we'd prefer to put
1312 // it at "Where". To see if the load is safe to be placed at Where, put
1313 // it there first and then check if it's safe to move it to BasePos.
1314 // If not, then the load needs to be placed at BasePos.
1315 // We can't do this check proactively because we need the load to exist
1316 // in order to check legality.
1317 if (auto *Load = dyn_cast<Instruction>(Val: Loads[Index])) {
1318 if (!HVC.isSafeToMoveBeforeInBB(In: *Load, To: BasePos))
1319 moveBefore(Load->getIterator(), BasePos);
1320 }
1321 LLVM_DEBUG(dbgs() << "Loads[" << Index << "]:" << *Loads[Index] << '\n');
1322 }
1323 }
1324
1325 // Generate valigns if needed, and fill in proper values in ASpan
1326 LLVM_DEBUG(dbgs() << "Creating values for ASpan sectors\n");
1327 for (int Index = 0; Index != NumSectors; ++Index) {
1328 ASpan[Index].Seg.Val = nullptr;
1329 if (auto *Where = EarliestUser[&ASpan[Index]]) {
1330 Builder.SetInsertPoint(Where);
1331 Value *Val = Loads[Index];
1332 assert(Val != nullptr);
1333 if (DoAlign) {
1334 Value *NextLoad = Loads[Index + 1];
1335 assert(NextLoad != nullptr);
1336 Val = HVC.vralignb(Builder, Lo: Val, Hi: NextLoad, Amt: AlignVal);
1337 }
1338 ASpan[Index].Seg.Val = Val;
1339 LLVM_DEBUG(dbgs() << "ASpan[" << Index << "]:" << *Val << '\n');
1340 }
1341 }
1342
1343 for (const ByteSpan::Block &B : VSpan) {
1344 ByteSpan ASection = ASpan.section(Start: B.Pos, Length: B.Seg.Size).shift(Offset: -B.Pos);
1345 Value *Accum = UndefValue::get(T: HVC.getByteTy(ElemCount: B.Seg.Size));
1346 Builder.SetInsertPoint(cast<Instruction>(Val: B.Seg.Val));
1347
1348 // We're generating a reduction, where each instruction depends on
1349 // the previous one, so we need to order them according to the position
1350 // of their inputs in the code.
1351 std::vector<ByteSpan::Block *> ABlocks;
1352 for (ByteSpan::Block &S : ASection) {
1353 if (S.Seg.Val != nullptr)
1354 ABlocks.push_back(x: &S);
1355 }
1356 llvm::sort(C&: ABlocks,
1357 Comp: [&](const ByteSpan::Block *A, const ByteSpan::Block *B) {
1358 return isEarlier(cast<Instruction>(Val: A->Seg.Val),
1359 cast<Instruction>(Val: B->Seg.Val));
1360 });
1361 for (ByteSpan::Block *S : ABlocks) {
1362 // The processing of the data loaded by the aligned loads
1363 // needs to be inserted after the data is available.
1364 Instruction *SegI = cast<Instruction>(Val: S->Seg.Val);
1365 Builder.SetInsertPoint(&*std::next(x: SegI->getIterator()));
1366 Value *Pay = HVC.vbytes(Builder, Val: getPayload(Val: S->Seg.Val));
1367 Accum =
1368 HVC.insertb(Builder, Dest: Accum, Src: Pay, Start: S->Seg.Start, Length: S->Seg.Size, Where: S->Pos);
1369 }
1370 // Instead of casting everything to bytes for the vselect, cast to the
1371 // original value type. This will avoid complications with casting masks.
1372 // For example, in cases when the original mask applied to i32, it could
1373 // be converted to a mask applicable to i8 via pred_typecast intrinsic,
1374 // but if the mask is not exactly of HVX length, extra handling would be
1375 // needed to make it work.
1376 Type *ValTy = getPayload(Val: B.Seg.Val)->getType();
1377 Value *Cast = Builder.CreateBitCast(V: Accum, DestTy: ValTy, Name: "cst");
1378 Value *Sel = Builder.CreateSelect(C: getMask(Val: B.Seg.Val), True: Cast,
1379 False: getPassThrough(Val: B.Seg.Val), Name: "sel");
1380 B.Seg.Val->replaceAllUsesWith(V: Sel);
1381 }
1382}
1383
1384auto AlignVectors::realignStoreGroup(IRBuilderBase &Builder,
1385 const ByteSpan &VSpan, int ScLen,
1386 Value *AlignVal, Value *AlignAddr) const
1387 -> void {
1388 LLVM_DEBUG(dbgs() << __func__ << "\n");
1389
1390 Type *SecTy = HVC.getByteTy(ElemCount: ScLen);
1391 int NumSectors = (VSpan.extent() + ScLen - 1) / ScLen;
1392 bool DoAlign = !HVC.isZero(Val: AlignVal);
1393
1394 // Stores.
1395 ByteSpan ASpanV, ASpanM;
1396
1397 // Return a vector value corresponding to the input value Val:
1398 // either <1 x Val> for scalar Val, or Val itself for vector Val.
1399 auto MakeVec = [](IRBuilderBase &Builder, Value *Val) -> Value * {
1400 Type *Ty = Val->getType();
1401 if (Ty->isVectorTy())
1402 return Val;
1403 auto *VecTy = VectorType::get(ElementType: Ty, NumElements: 1, /*Scalable=*/false);
1404 return Builder.CreateBitCast(V: Val, DestTy: VecTy, Name: "cst");
1405 };
1406
1407 // Create an extra "undef" sector at the beginning and at the end.
1408 // They will be used as the left/right filler in the vlalign step.
1409 for (int Index = (DoAlign ? -1 : 0); Index != NumSectors + DoAlign; ++Index) {
1410 // For stores, the size of each section is an aligned vector length.
1411 // Adjust the store offsets relative to the section start offset.
1412 ByteSpan VSection =
1413 VSpan.section(Start: Index * ScLen, Length: ScLen).shift(Offset: -Index * ScLen);
1414 Value *Undef = UndefValue::get(T: SecTy);
1415 Value *Zero = Constant::getNullValue(Ty: SecTy);
1416 Value *AccumV = Undef;
1417 Value *AccumM = Zero;
1418 for (ByteSpan::Block &S : VSection) {
1419 Value *Pay = getPayload(Val: S.Seg.Val);
1420 Value *Mask = HVC.rescale(Builder, Mask: MakeVec(Builder, getMask(Val: S.Seg.Val)),
1421 FromTy: Pay->getType(), ToTy: HVC.getByteTy());
1422 Value *PartM = HVC.insertb(Builder, Dest: Zero, Src: HVC.vbytes(Builder, Val: Mask),
1423 Start: S.Seg.Start, Length: S.Seg.Size, Where: S.Pos);
1424 AccumM = Builder.CreateOr(LHS: AccumM, RHS: PartM);
1425
1426 Value *PartV = HVC.insertb(Builder, Dest: Undef, Src: HVC.vbytes(Builder, Val: Pay),
1427 Start: S.Seg.Start, Length: S.Seg.Size, Where: S.Pos);
1428
1429 AccumV = Builder.CreateSelect(
1430 C: Builder.CreateICmp(P: CmpInst::ICMP_NE, LHS: PartM, RHS: Zero), True: PartV, False: AccumV);
1431 }
1432 ASpanV.Blocks.emplace_back(args&: AccumV, args&: ScLen, args: Index * ScLen);
1433 ASpanM.Blocks.emplace_back(args&: AccumM, args&: ScLen, args: Index * ScLen);
1434 }
1435
1436 LLVM_DEBUG({
1437 dbgs() << "ASpanV before vlalign:\n" << ASpanV << '\n';
1438 dbgs() << "ASpanM before vlalign:\n" << ASpanM << '\n';
1439 });
1440
1441 // vlalign
1442 if (DoAlign) {
1443 for (int Index = 1; Index != NumSectors + 2; ++Index) {
1444 Value *PrevV = ASpanV[Index - 1].Seg.Val, *ThisV = ASpanV[Index].Seg.Val;
1445 Value *PrevM = ASpanM[Index - 1].Seg.Val, *ThisM = ASpanM[Index].Seg.Val;
1446 assert(isSectorTy(PrevV->getType()) && isSectorTy(PrevM->getType()));
1447 ASpanV[Index - 1].Seg.Val = HVC.vlalignb(Builder, Lo: PrevV, Hi: ThisV, Amt: AlignVal);
1448 ASpanM[Index - 1].Seg.Val = HVC.vlalignb(Builder, Lo: PrevM, Hi: ThisM, Amt: AlignVal);
1449 }
1450 }
1451
1452 LLVM_DEBUG({
1453 dbgs() << "ASpanV after vlalign:\n" << ASpanV << '\n';
1454 dbgs() << "ASpanM after vlalign:\n" << ASpanM << '\n';
1455 });
1456
1457 auto createStore = [&](IRBuilderBase &Builder, const ByteSpan &ASpanV,
1458 const ByteSpan &ASpanM, int Index, bool MakePred) {
1459 Value *Val = ASpanV[Index].Seg.Val;
1460 Value *Mask = ASpanM[Index].Seg.Val; // bytes
1461 if (HVC.isUndef(Val) || HVC.isZero(Val: Mask))
1462 return;
1463 Value *Ptr =
1464 createAdjustedPointer(Builder, Ptr: AlignAddr, ValTy: SecTy, Adjust: Index * ScLen);
1465 Value *Predicate =
1466 MakePred ? makeTestIfUnaligned(Builder, AlignVal, Alignment: ScLen) : nullptr;
1467
1468 // If vector shifting is potentially needed, accumulate metadata
1469 // from source sections of twice the store width.
1470 int Start = (Index - DoAlign) * ScLen;
1471 int Width = (1 + DoAlign) * ScLen;
1472 this->createStore(Builder, Val, Ptr, Predicate, Alignment: ScLen,
1473 Mask: HVC.vlsb(Builder, Val: Mask),
1474 MDSources: VSpan.section(Start, Length: Width).values());
1475 };
1476
1477 for (int Index = 0; Index != NumSectors + DoAlign; ++Index) {
1478 createStore(Builder, ASpanV, ASpanM, Index, DoAlign && Index == NumSectors);
1479 }
1480}
1481
1482auto AlignVectors::realignGroup(const MoveGroup &Move) -> bool {
1483 LLVM_DEBUG(dbgs() << "Realigning group:\n" << Move << '\n');
1484
1485 // TODO: Needs support for masked loads/stores of "scalar" vectors.
1486 if (!Move.IsHvx)
1487 return false;
1488
1489 // Return the element with the maximum alignment from Range,
1490 // where GetValue obtains the value to compare from an element.
1491 auto getMaxOf = [](auto Range, auto GetValue) {
1492 return *llvm::max_element(Range, [&GetValue](auto &A, auto &B) {
1493 return GetValue(A) < GetValue(B);
1494 });
1495 };
1496
1497 AddrList &BaseInfos = AddrGroups[Move.Base];
1498
1499 // Conceptually, there is a vector of N bytes covering the addresses
1500 // starting from the minimum offset (i.e. Base.Addr+Start). This vector
1501 // represents a contiguous memory region that spans all accessed memory
1502 // locations.
1503 // The correspondence between loaded or stored values will be expressed
1504 // in terms of this vector. For example, the 0th element of the vector
1505 // from the Base address info will start at byte Start from the beginning
1506 // of this conceptual vector.
1507 //
1508 // This vector will be loaded/stored starting at the nearest down-aligned
1509 // address and the amount of the down-alignment will be AlignVal:
1510 // valign(load_vector(align_down(Base+Start)), AlignVal)
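  //
  // A hypothetical example: two 64-byte accesses at Base+3 and Base+64 give
  // Start = 3, so the conceptual vector covers bytes [3, 128) past Base, and
  // each access is described by its byte offset from Base+3 in that vector.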
1511
1512 std::set<Instruction *> TestSet(Move.Main.begin(), Move.Main.end());
1513 AddrList MoveInfos;
1514
1515 llvm::copy_if(
1516 Range&: BaseInfos, Out: std::back_inserter(x&: MoveInfos),
1517 P: [&TestSet](const AddrInfo &AI) { return TestSet.count(x: AI.Inst); });
1518
1519 // Maximum alignment present in the whole address group.
1520 const AddrInfo &WithMaxAlign =
1521 getMaxOf(MoveInfos, [](const AddrInfo &AI) { return AI.HaveAlign; });
1522 Align MaxGiven = WithMaxAlign.HaveAlign;
1523
1524  // Element with the minimum offset in the move address group.
1525 const AddrInfo &WithMinOffset =
1526 getMaxOf(MoveInfos, [](const AddrInfo &AI) { return -AI.Offset; });
1527
1528 const AddrInfo &WithMaxNeeded =
1529 getMaxOf(MoveInfos, [](const AddrInfo &AI) { return AI.NeedAlign; });
1530 Align MinNeeded = WithMaxNeeded.NeedAlign;
1531
1532 // Set the builder's insertion point right before the load group, or
1533 // immediately after the store group. (Instructions in a store group are
1534 // listed in reverse order.)
1535 Instruction *InsertAt = Move.Main.front();
1536 if (!Move.IsLoad) {
1537 // There should be a terminator (which store isn't, but check anyways).
1538 assert(InsertAt->getIterator() != InsertAt->getParent()->end());
1539 InsertAt = &*std::next(x: InsertAt->getIterator());
1540 }
1541
1542 IRBuilder Builder(InsertAt->getParent(), InsertAt->getIterator(),
1543 InstSimplifyFolder(HVC.DL));
1544 Value *AlignAddr = nullptr; // Actual aligned address.
1545 Value *AlignVal = nullptr; // Right-shift amount (for valign).
1546
1547 if (MinNeeded <= MaxGiven) {
1548 int Start = WithMinOffset.Offset;
1549 int OffAtMax = WithMaxAlign.Offset;
1550 // Shift the offset of the maximally aligned instruction (OffAtMax)
1551 // back by just enough multiples of the required alignment to cover the
1552 // distance from Start to OffAtMax.
1553 // Calculate the address adjustment amount based on the address with the
1554 // maximum alignment. This is to allow a simple gep instruction instead
1555 // of potential bitcasts to i8*.
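    // Continuing the hypothetical example above: if the access at Base+64 is
    // 64-byte aligned and MinNeeded = 64, then Start = 3, OffAtMax = 64,
    // Adjust = -alignTo(61, 64) = -64, so AlignAddr points 64 bytes before
    // that access (i.e. at conceptual offset 0), and Diff = 3 - (64 - 64) = 3
    // becomes the valign amount (0 <= Diff < MinNeeded).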
1556 int Adjust = -alignTo(Value: OffAtMax - Start, Align: MinNeeded.value());
1557 AlignAddr = createAdjustedPointer(Builder, Ptr: WithMaxAlign.Addr,
1558 ValTy: WithMaxAlign.ValTy, Adjust, CloneMap: Move.Clones);
1559 int Diff = Start - (OffAtMax + Adjust);
1560 AlignVal = HVC.getConstInt(Val: Diff);
1561 assert(Diff >= 0);
1562 assert(static_cast<decltype(MinNeeded.value())>(Diff) < MinNeeded.value());
1563 } else {
1564 // WithMinOffset is the lowest address in the group,
1565 // WithMinOffset.Addr = Base+Start.
1566 // Align instructions for both HVX (V6_valign) and scalar (S2_valignrb)
1567    // mask off unnecessary bits, so it's ok to just use the original pointer as
1568    // the alignment amount.
1569 // Do an explicit down-alignment of the address to avoid creating an
1570 // aligned instruction with an address that is not really aligned.
1571 AlignAddr =
1572 createAlignedPointer(Builder, Ptr: WithMinOffset.Addr, ValTy: WithMinOffset.ValTy,
1573 Alignment: MinNeeded.value(), CloneMap: Move.Clones);
1574 AlignVal =
1575 Builder.CreatePtrToInt(V: WithMinOffset.Addr, DestTy: HVC.getIntTy(), Name: "pti");
1576 if (auto *I = dyn_cast<Instruction>(Val: AlignVal)) {
1577 for (auto [Old, New] : Move.Clones)
1578 I->replaceUsesOfWith(From: Old, To: New);
1579 }
1580 }
1581
1582 ByteSpan VSpan;
1583 for (const AddrInfo &AI : MoveInfos) {
1584 VSpan.Blocks.emplace_back(args: AI.Inst, args: HVC.getSizeOf(Ty: AI.ValTy),
1585 args: AI.Offset - WithMinOffset.Offset);
1586 }
1587
1588 // The aligned loads/stores will use blocks that are either scalars,
1589 // or HVX vectors. Let "sector" be the unified term for such a block.
1590 // blend(scalar, vector) -> sector...
1591 int ScLen = Move.IsHvx ? HVC.HST.getVectorLength()
1592 : std::max<int>(a: MinNeeded.value(), b: 4);
1593 assert(!Move.IsHvx || ScLen == 64 || ScLen == 128);
1594 assert(Move.IsHvx || ScLen == 4 || ScLen == 8);
1595
1596 LLVM_DEBUG({
1597 dbgs() << "ScLen: " << ScLen << "\n";
1598 dbgs() << "AlignVal:" << *AlignVal << "\n";
1599 dbgs() << "AlignAddr:" << *AlignAddr << "\n";
1600 dbgs() << "VSpan:\n" << VSpan << '\n';
1601 });
1602
1603 if (Move.IsLoad)
1604 realignLoadGroup(Builder, VSpan, ScLen, AlignVal, AlignAddr);
1605 else
1606 realignStoreGroup(Builder, VSpan, ScLen, AlignVal, AlignAddr);
1607
1608 for (auto *Inst : Move.Main)
1609 Inst->eraseFromParent();
1610
1611 return true;
1612}
1613
1614auto AlignVectors::makeTestIfUnaligned(IRBuilderBase &Builder, Value *AlignVal,
1615 int Alignment) const -> Value * {
1616 auto *AlignTy = AlignVal->getType();
1617 Value *And = Builder.CreateAnd(
1618 LHS: AlignVal, RHS: ConstantInt::get(Ty: AlignTy, V: Alignment - 1), Name: "and");
1619 Value *Zero = ConstantInt::get(Ty: AlignTy, V: 0);
1620 return Builder.CreateICmpNE(LHS: And, RHS: Zero, Name: "isz");
1621}
1622
1623auto AlignVectors::isSectorTy(Type *Ty) const -> bool {
1624 if (!HVC.isByteVecTy(Ty))
1625 return false;
1626 int Size = HVC.getSizeOf(Ty);
1627 if (HVC.HST.isTypeForHVX(VecTy: Ty))
1628 return Size == static_cast<int>(HVC.HST.getVectorLength());
1629 return Size == 4 || Size == 8;
1630}
1631
1632auto AlignVectors::run() -> bool {
1633 LLVM_DEBUG(dbgs() << "\nRunning HVC::AlignVectors on " << HVC.F.getName()
1634 << '\n');
1635 if (!createAddressGroups())
1636 return false;
1637
1638 LLVM_DEBUG({
1639 dbgs() << "Address groups(" << AddrGroups.size() << "):\n";
1640 for (auto &[In, AL] : AddrGroups) {
1641 for (const AddrInfo &AI : AL)
1642 dbgs() << "---\n" << AI << '\n';
1643 }
1644 });
1645
1646 bool Changed = false;
1647 MoveList LoadGroups, StoreGroups;
1648
1649 for (auto &G : AddrGroups) {
1650 llvm::append_range(C&: LoadGroups, R: createLoadGroups(Group: G.second));
1651 llvm::append_range(C&: StoreGroups, R: createStoreGroups(Group: G.second));
1652 }
1653
1654 LLVM_DEBUG({
1655 dbgs() << "\nLoad groups(" << LoadGroups.size() << "):\n";
1656 for (const MoveGroup &G : LoadGroups)
1657 dbgs() << G << "\n";
1658 dbgs() << "Store groups(" << StoreGroups.size() << "):\n";
1659 for (const MoveGroup &G : StoreGroups)
1660 dbgs() << G << "\n";
1661 });
1662
1663 // Cumulative limit on the number of groups.
1664 unsigned CountLimit = VAGroupCountLimit;
1665 if (CountLimit == 0)
1666 return false;
1667
1668 if (LoadGroups.size() > CountLimit) {
1669 LoadGroups.resize(new_size: CountLimit);
1670 StoreGroups.clear();
1671 } else {
1672 unsigned StoreLimit = CountLimit - LoadGroups.size();
1673 if (StoreGroups.size() > StoreLimit)
1674 StoreGroups.resize(new_size: StoreLimit);
1675 }
1676
1677 for (auto &M : LoadGroups)
1678 Changed |= moveTogether(Move&: M);
1679 for (auto &M : StoreGroups)
1680 Changed |= moveTogether(Move&: M);
1681
1682 LLVM_DEBUG(dbgs() << "After moveTogether:\n" << HVC.F);
1683
1684 for (auto &M : LoadGroups)
1685 Changed |= realignGroup(Move: M);
1686 for (auto &M : StoreGroups)
1687 Changed |= realignGroup(Move: M);
1688
1689 return Changed;
1690}
1691
1692// --- End AlignVectors
1693
1694// --- Begin HvxIdioms
1695
1696auto HvxIdioms::getNumSignificantBits(Value *V, Instruction *In) const
1697 -> std::pair<unsigned, Signedness> {
1698 unsigned Bits = HVC.getNumSignificantBits(V, CtxI: In);
1699 // The significant bits are calculated including the sign bit. This may
1700 // add an extra bit for zero-extended values, e.g. (zext i32 to i64) may
1701 // result in 33 significant bits. To avoid extra words, skip the extra
1702 // sign bit, but keep information that the value is to be treated as
1703 // unsigned.
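  // For example (hypothetical input): (zext i16 %x to i32) is reported as
  // having 17 significant bits; NumToTest becomes 16, the upper 16 bits are
  // known zero, so the result is {16, Unsigned} rather than a 17-bit signed
  // value.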
1704 KnownBits Known = HVC.getKnownBits(V, CtxI: In);
1705 Signedness Sign = Signed;
1706 unsigned NumToTest = 0; // Number of bits used in test for unsignedness.
1707 if (isPowerOf2_32(Value: Bits))
1708 NumToTest = Bits;
1709 else if (Bits > 1 && isPowerOf2_32(Value: Bits - 1))
1710 NumToTest = Bits - 1;
1711
1712 if (NumToTest != 0 && Known.Zero.ashr(ShiftAmt: NumToTest).isAllOnes()) {
1713 Sign = Unsigned;
1714 Bits = NumToTest;
1715 }
1716
1717 // If the top bit of the nearest power-of-2 is zero, this value is
1718 // positive. It could be treated as either signed or unsigned.
1719 if (unsigned Pow2 = PowerOf2Ceil(A: Bits); Pow2 != Bits) {
1720 if (Known.Zero.ashr(ShiftAmt: Pow2 - 1).isAllOnes())
1721 Sign = Positive;
1722 }
1723 return {Bits, Sign};
1724}
1725
1726auto HvxIdioms::canonSgn(SValue X, SValue Y) const
1727 -> std::pair<SValue, SValue> {
1728 // Canonicalize the signedness of X and Y, so that the result is one of:
1729 // S, S
1730 // U/P, S
1731 // U/P, U/P
1732 if (X.Sgn == Signed && Y.Sgn != Signed)
1733 std::swap(a&: X, b&: Y);
1734 return {X, Y};
1735}
1736
1737// Match
1738// (X * Y) [>> N], or
1739// ((X * Y) + (1 << M)) >> N
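// For instance, a rounding Q15 multiply ((X * Y) + (1 << 14)) >> 15 is the
// second form with N = 15 and M = 14, and would be reported below with
// Frac = 15 and RoundAt = 14 (illustrative values only).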
1740auto HvxIdioms::matchFxpMul(Instruction &In) const -> std::optional<FxpOp> {
1741 using namespace PatternMatch;
1742 auto *Ty = In.getType();
1743
1744 if (!Ty->isVectorTy() || !Ty->getScalarType()->isIntegerTy())
1745 return std::nullopt;
1746
1747 unsigned Width = cast<IntegerType>(Val: Ty->getScalarType())->getBitWidth();
1748
1749 FxpOp Op;
1750 Value *Exp = &In;
1751
1752 // Fixed-point multiplication is always shifted right (except when the
1753 // fraction is 0 bits).
1754 auto m_Shr = [](auto &&V, auto &&S) {
1755 return m_CombineOr(m_LShr(V, S), m_AShr(V, S));
1756 };
1757
1758 uint64_t Qn = 0;
1759 if (Value *T; match(V: Exp, P: m_Shr(m_Value(V&: T), m_ConstantInt(V&: Qn)))) {
1760 Op.Frac = Qn;
1761 Exp = T;
1762 } else {
1763 Op.Frac = 0;
1764 }
1765
1766 if (Op.Frac > Width)
1767 return std::nullopt;
1768
1769 // Check if there is rounding added.
1770 uint64_t CV;
1771 if (Value *T;
1772 Op.Frac > 0 && match(V: Exp, P: m_Add(L: m_Value(V&: T), R: m_ConstantInt(V&: CV)))) {
1773 if (CV != 0 && !isPowerOf2_64(Value: CV))
1774 return std::nullopt;
1775 if (CV != 0)
1776 Op.RoundAt = Log2_64(Value: CV);
1777 Exp = T;
1778 }
1779
1780 // Check if the rest is a multiplication.
1781 if (match(V: Exp, P: m_Mul(L: m_Value(V&: Op.X.Val), R: m_Value(V&: Op.Y.Val)))) {
1782 Op.Opcode = Instruction::Mul;
1783 // FIXME: The information below is recomputed.
1784 Op.X.Sgn = getNumSignificantBits(V: Op.X.Val, In: &In).second;
1785 Op.Y.Sgn = getNumSignificantBits(V: Op.Y.Val, In: &In).second;
1786 Op.ResTy = cast<VectorType>(Val: Ty);
1787 return Op;
1788 }
1789
1790 return std::nullopt;
1791}
1792
1793auto HvxIdioms::processFxpMul(Instruction &In, const FxpOp &Op) const
1794 -> Value * {
1795 assert(Op.X.Val->getType() == Op.Y.Val->getType());
1796
1797 auto *VecTy = dyn_cast<VectorType>(Val: Op.X.Val->getType());
1798 if (VecTy == nullptr)
1799 return nullptr;
1800 auto *ElemTy = cast<IntegerType>(Val: VecTy->getElementType());
1801 unsigned ElemWidth = ElemTy->getBitWidth();
1802
1803 // TODO: This can be relaxed after legalization is done pre-isel.
1804 if ((HVC.length(Ty: VecTy) * ElemWidth) % (8 * HVC.HST.getVectorLength()) != 0)
1805 return nullptr;
1806
1807 // There are no special intrinsics that should be used for multiplying
1808 // signed 8-bit values, so just skip them. Normal codegen should handle
1809 // this just fine.
1810 if (ElemWidth <= 8)
1811 return nullptr;
1812 // Similarly, if this is just a multiplication that can be handled without
1813 // intervention, then leave it alone.
1814 if (ElemWidth <= 32 && Op.Frac == 0)
1815 return nullptr;
1816
1817 auto [BitsX, SignX] = getNumSignificantBits(V: Op.X.Val, In: &In);
1818 auto [BitsY, SignY] = getNumSignificantBits(V: Op.Y.Val, In: &In);
1819
1820 // TODO: Add multiplication of vectors by scalar registers (up to 4 bytes).
1821
1822 Value *X = Op.X.Val, *Y = Op.Y.Val;
1823 IRBuilder Builder(In.getParent(), In.getIterator(),
1824 InstSimplifyFolder(HVC.DL));
1825
1826 auto roundUpWidth = [](unsigned Width) -> unsigned {
1827 if (Width <= 32 && !isPowerOf2_32(Value: Width)) {
1828 // If the element width is not a power of 2, round it up
1829 // to the next one. Do this for widths not exceeding 32.
1830 return PowerOf2Ceil(A: Width);
1831 }
1832 if (Width > 32 && Width % 32 != 0) {
1833 // For wider elements, round it up to the multiple of 32.
1834 return alignTo(Value: Width, Align: 32u);
1835 }
1836 return Width;
1837 };
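  // For illustration: roundUpWidth(17) == 32 and roundUpWidth(40) == 64,
  // i.e. narrow widths go up to the next power of 2 and wider ones to the
  // next multiple of 32.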
1838
1839 BitsX = roundUpWidth(BitsX);
1840 BitsY = roundUpWidth(BitsY);
1841
1842 // For elementwise multiplication vectors must have the same lengths, so
1843 // resize the elements of both inputs to the same width, the max of the
1844 // calculated significant bits.
1845 unsigned Width = std::max(a: BitsX, b: BitsY);
1846
1847 auto *ResizeTy = VectorType::get(ElementType: HVC.getIntTy(Width), Other: VecTy);
1848 if (Width < ElemWidth) {
1849 X = Builder.CreateTrunc(V: X, DestTy: ResizeTy, Name: "trn");
1850 Y = Builder.CreateTrunc(V: Y, DestTy: ResizeTy, Name: "trn");
1851 } else if (Width > ElemWidth) {
1852 X = SignX == Signed ? Builder.CreateSExt(V: X, DestTy: ResizeTy, Name: "sxt")
1853 : Builder.CreateZExt(V: X, DestTy: ResizeTy, Name: "zxt");
1854 Y = SignY == Signed ? Builder.CreateSExt(V: Y, DestTy: ResizeTy, Name: "sxt")
1855 : Builder.CreateZExt(V: Y, DestTy: ResizeTy, Name: "zxt");
1856 };
1857
1858 assert(X->getType() == Y->getType() && X->getType() == ResizeTy);
1859
1860 unsigned VecLen = HVC.length(Ty: ResizeTy);
1861 unsigned ChopLen = (8 * HVC.HST.getVectorLength()) / std::min(a: Width, b: 32u);
1862
1863 SmallVector<Value *> Results;
1864 FxpOp ChopOp = Op;
1865 ChopOp.ResTy = VectorType::get(ElementType: Op.ResTy->getElementType(), NumElements: ChopLen, Scalable: false);
1866
1867 for (unsigned V = 0; V != VecLen / ChopLen; ++V) {
1868 ChopOp.X.Val = HVC.subvector(Builder, Val: X, Start: V * ChopLen, Length: ChopLen);
1869 ChopOp.Y.Val = HVC.subvector(Builder, Val: Y, Start: V * ChopLen, Length: ChopLen);
1870 Results.push_back(Elt: processFxpMulChopped(Builder, In, Op: ChopOp));
1871 if (Results.back() == nullptr)
1872 break;
1873 }
1874
1875 if (Results.empty() || Results.back() == nullptr)
1876 return nullptr;
1877
1878 Value *Cat = HVC.concat(Builder, Vecs: Results);
1879 Value *Ext = SignX == Signed || SignY == Signed
1880 ? Builder.CreateSExt(V: Cat, DestTy: VecTy, Name: "sxt")
1881 : Builder.CreateZExt(V: Cat, DestTy: VecTy, Name: "zxt");
1882 return Ext;
1883}
1884
1885inline bool HvxIdioms::matchScatter(Instruction &In) const {
1886 IntrinsicInst *II = dyn_cast<IntrinsicInst>(Val: &In);
1887 if (!II)
1888 return false;
1889 return (II->getIntrinsicID() == Intrinsic::masked_scatter);
1890}
1891
1892inline bool HvxIdioms::matchGather(Instruction &In) const {
1893 IntrinsicInst *II = dyn_cast<IntrinsicInst>(Val: &In);
1894 if (!II)
1895 return false;
1896 return (II->getIntrinsicID() == Intrinsic::masked_gather);
1897}
1898
1899inline bool HvxIdioms::matchMLoad(Instruction &In) const {
1900 IntrinsicInst *II = dyn_cast<IntrinsicInst>(Val: &In);
1901 if (!II)
1902 return false;
1903 return (II->getIntrinsicID() == Intrinsic::masked_load);
1904}
1905
1906inline bool HvxIdioms::matchMStore(Instruction &In) const {
1907 IntrinsicInst *II = dyn_cast<IntrinsicInst>(Val: &In);
1908 if (!II)
1909 return false;
1910 return (II->getIntrinsicID() == Intrinsic::masked_store);
1911}
1912
1913Instruction *locateDestination(Instruction *In, HvxIdioms::DstQualifier &Qual);
1914
1915// Binary instructions we want to handle as users of gather/scatter.
1916inline bool isArithmetic(unsigned Opc) {
1917 switch (Opc) {
1918 case Instruction::Add:
1919 case Instruction::Sub:
1920 case Instruction::Mul:
1921 case Instruction::And:
1922 case Instruction::Or:
1923 case Instruction::Xor:
1924 case Instruction::AShr:
1925 case Instruction::LShr:
1926 case Instruction::Shl:
1927 case Instruction::UDiv:
1928 return true;
1929 }
1930 return false;
1931}
1932
1933// TODO: Maybe use MemoryLocation for this. See getLocOrNone above.
1934inline Value *getPointer(Value *Ptr) {
1935 assert(Ptr && "Unable to extract pointer");
1936 if (isa<AllocaInst>(Val: Ptr) || isa<Argument>(Val: Ptr) || isa<GlobalValue>(Val: Ptr))
1937 return Ptr;
1938 if (isa<LoadInst>(Val: Ptr) || isa<StoreInst>(Val: Ptr))
1939 return getLoadStorePointerOperand(V: Ptr);
1940 if (IntrinsicInst *II = dyn_cast<IntrinsicInst>(Val: Ptr)) {
1941 if (II->getIntrinsicID() == Intrinsic::masked_store)
1942 return II->getOperand(i_nocapture: 1);
1943 }
1944 return nullptr;
1945}
1946
1947static Instruction *selectDestination(Instruction *In,
1948 HvxIdioms::DstQualifier &Qual) {
1949 Instruction *Destination = nullptr;
1950 if (!In)
1951 return Destination;
1952 if (isa<StoreInst>(Val: In)) {
1953 Destination = In;
1954 Qual = HvxIdioms::LdSt;
1955 } else if (IntrinsicInst *II = dyn_cast<IntrinsicInst>(Val: In)) {
1956 if (II->getIntrinsicID() == Intrinsic::masked_gather) {
1957 Destination = In;
1958 Qual = HvxIdioms::LLVM_Gather;
1959 } else if (II->getIntrinsicID() == Intrinsic::masked_scatter) {
1960 Destination = In;
1961 Qual = HvxIdioms::LLVM_Scatter;
1962 } else if (II->getIntrinsicID() == Intrinsic::masked_store) {
1963 Destination = In;
1964 Qual = HvxIdioms::LdSt;
1965 } else if (II->getIntrinsicID() ==
1966 Intrinsic::hexagon_V6_vgather_vscattermh) {
1967 Destination = In;
1968 Qual = HvxIdioms::HEX_Gather_Scatter;
1969 } else if (II->getIntrinsicID() == Intrinsic::hexagon_V6_vscattermh_128B) {
1970 Destination = In;
1971 Qual = HvxIdioms::HEX_Scatter;
1972 } else if (II->getIntrinsicID() == Intrinsic::hexagon_V6_vgathermh_128B) {
1973 Destination = In;
1974 Qual = HvxIdioms::HEX_Gather;
1975 }
1976 } else if (isa<ZExtInst>(Val: In)) {
1977 return locateDestination(In, Qual);
1978 } else if (isa<CastInst>(Val: In)) {
1979 return locateDestination(In, Qual);
1980 } else if (isa<CallInst>(Val: In)) {
1981 Destination = In;
1982 Qual = HvxIdioms::Call;
1983 } else if (isa<GetElementPtrInst>(Val: In)) {
1984 return locateDestination(In, Qual);
1985 } else if (isArithmetic(Opc: In->getOpcode())) {
1986 Destination = In;
1987 Qual = HvxIdioms::Arithmetic;
1988 } else {
1989 LLVM_DEBUG(dbgs() << "Unhandled destination : " << *In << "\n");
1990 }
1991 return Destination;
1992}
1993
1994// This method attempts to find the destination (user) for a given intrinsic.
1995// Given that these are produced only by Ripple, the number of options is
1996// limited. The simplest case is an explicit store, which is in fact redundant
1997// (since the HVX gather creates its own store during packetization).
1998// Nevertheless, we need to figure out the address we are storing to. The
1999// other cases are more complicated, but still few.
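// As a hypothetical illustration, for
//   %g = call <64 x i16> @llvm.masked.gather(...)
//   %z = zext <64 x i16> %g to <64 x i32>
//   store <64 x i32> %z, ptr %p
// the walk looks through the zext and returns the store (Qual == LdSt),
// since it is the only user with a memory destination.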
2000Instruction *locateDestination(Instruction *In, HvxIdioms::DstQualifier &Qual) {
2001 Instruction *Destination = nullptr;
2002 if (!In)
2003 return Destination;
2004 // Get all possible destinations
2005 SmallVector<Instruction *> Users;
2006 // Iterate over the uses of the instruction
2007 for (auto &U : In->uses()) {
2008 if (auto *UI = dyn_cast<Instruction>(Val: U.getUser())) {
2009 Destination = selectDestination(In: UI, Qual);
2010 if (Destination)
2011 Users.push_back(Elt: Destination);
2012 }
2013 }
2014 // Now see which of the users (if any) is a memory destination.
2015 for (auto *I : Users)
2016 if (getPointer(Ptr: I))
2017 return I;
2018 return Destination;
2019}
2020
2021// The two intrinsics we handle here have the GEP argument in different positions.
2022inline GetElementPtrInst *locateGepFromIntrinsic(Instruction *In) {
2023 assert(In && "Bad instruction");
2024 IntrinsicInst *IIn = dyn_cast<IntrinsicInst>(Val: In);
2025 assert((IIn && (IIn->getIntrinsicID() == Intrinsic::masked_gather ||
2026 IIn->getIntrinsicID() == Intrinsic::masked_scatter)) &&
2027 "Not a gather Intrinsic");
2028 GetElementPtrInst *GEPIndex = nullptr;
2029 if (IIn->getIntrinsicID() == Intrinsic::masked_gather)
2030 GEPIndex = dyn_cast<GetElementPtrInst>(Val: IIn->getOperand(i_nocapture: 0));
2031 else
2032 GEPIndex = dyn_cast<GetElementPtrInst>(Val: IIn->getOperand(i_nocapture: 1));
2033 return GEPIndex;
2034}
2035
2036// Given the intrinsic, find its GEP argument and extract the base address it
2037// uses. The method relies on the way Ripple typically forms the GEP for
2038// scatter/gather.
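// A hypothetical shape this expects is
//   %p = getelementptr i16, ptr %base, <64 x i32> %offsets
// where %base is reached either directly through a load, or through a
// zext / shufflevector / insertelement splat of a load, alloca, argument,
// or global.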
2039static Value *locateAddressFromIntrinsic(Instruction *In) {
2040 GetElementPtrInst *GEPIndex = locateGepFromIntrinsic(In);
2041 if (!GEPIndex) {
2042 LLVM_DEBUG(dbgs() << " No GEP in intrinsic\n");
2043 return nullptr;
2044 }
2045 Value *BaseAddress = GEPIndex->getPointerOperand();
2046 auto *IndexLoad = dyn_cast<LoadInst>(Val: BaseAddress);
2047 if (IndexLoad)
2048 return IndexLoad;
2049
2050 auto *IndexZEx = dyn_cast<ZExtInst>(Val: BaseAddress);
2051 if (IndexZEx) {
2052 IndexLoad = dyn_cast<LoadInst>(Val: IndexZEx->getOperand(i_nocapture: 0));
2053 if (IndexLoad)
2054 return IndexLoad;
2055 IntrinsicInst *II = dyn_cast<IntrinsicInst>(Val: IndexZEx->getOperand(i_nocapture: 0));
2056 if (II && II->getIntrinsicID() == Intrinsic::masked_gather)
2057 return locateAddressFromIntrinsic(In: II);
2058 }
2059 auto *BaseShuffle = dyn_cast<ShuffleVectorInst>(Val: BaseAddress);
2060 if (BaseShuffle) {
2061 IndexLoad = dyn_cast<LoadInst>(Val: BaseShuffle->getOperand(i_nocapture: 0));
2062 if (IndexLoad)
2063 return IndexLoad;
2064 auto *IE = dyn_cast<InsertElementInst>(Val: BaseShuffle->getOperand(i_nocapture: 0));
2065 if (IE) {
2066 auto *Src = IE->getOperand(i_nocapture: 1);
2067 IndexLoad = dyn_cast<LoadInst>(Val: Src);
2068 if (IndexLoad)
2069 return IndexLoad;
2070 auto *Alloca = dyn_cast<AllocaInst>(Val: Src);
2071 if (Alloca)
2072 return Alloca;
2073 if (isa<Argument>(Val: Src)) {
2074 return Src;
2075 }
2076 if (isa<GlobalValue>(Val: Src)) {
2077 return Src;
2078 }
2079 }
2080 }
2081 LLVM_DEBUG(dbgs() << " Unable to locate Address from intrinsic\n");
2082 return nullptr;
2083}
2084
2085static Type *getIndexType(Value *In) {
2086 if (!In)
2087 return nullptr;
2088
2089 if (isa<LoadInst>(Val: In) || isa<StoreInst>(Val: In))
2090 return getLoadStoreType(I: In);
2091
2092 if (IntrinsicInst *II = dyn_cast<IntrinsicInst>(Val: In)) {
2093 if (II->getIntrinsicID() == Intrinsic::masked_load)
2094 return II->getType();
2095 if (II->getIntrinsicID() == Intrinsic::masked_store)
2096 return II->getOperand(i_nocapture: 0)->getType();
2097 }
2098 return In->getType();
2099}
2100
2101static Value *locateIndexesFromGEP(Value *In) {
2102 if (!In)
2103 return nullptr;
2104 if (isa<LoadInst>(Val: In))
2105 return In;
2106 if (IntrinsicInst *II = dyn_cast<IntrinsicInst>(Val: In)) {
2107 if (II->getIntrinsicID() == Intrinsic::masked_load)
2108 return In;
2109 if (II->getIntrinsicID() == Intrinsic::masked_gather)
2110 return In;
2111 }
2112 if (auto *IndexZEx = dyn_cast<ZExtInst>(Val: In))
2113 return locateIndexesFromGEP(In: IndexZEx->getOperand(i_nocapture: 0));
2114 if (auto *IndexSEx = dyn_cast<SExtInst>(Val: In))
2115 return locateIndexesFromGEP(In: IndexSEx->getOperand(i_nocapture: 0));
2116 if (auto *BaseShuffle = dyn_cast<ShuffleVectorInst>(Val: In))
2117 return locateIndexesFromGEP(In: BaseShuffle->getOperand(i_nocapture: 0));
2118 if (auto *IE = dyn_cast<InsertElementInst>(Val: In))
2119 return locateIndexesFromGEP(In: IE->getOperand(i_nocapture: 1));
2120 if (auto *cstDataVector = dyn_cast<ConstantDataVector>(Val: In))
2121 return cstDataVector;
2122 if (auto *GEPIndex = dyn_cast<GetElementPtrInst>(Val: In))
2123 return GEPIndex->getOperand(i_nocapture: 0);
2124 return nullptr;
2125}
2126
2127// Given the intrinsic, find its GEP argument and extract the offsets from the
2128// base address it uses.
2129static Value *locateIndexesFromIntrinsic(Instruction *In) {
2130 GetElementPtrInst *GEPIndex = locateGepFromIntrinsic(In);
2131 if (!GEPIndex) {
2132 LLVM_DEBUG(dbgs() << " No GEP in intrinsic\n");
2133 return nullptr;
2134 }
2135 Value *Indexes = GEPIndex->getOperand(i_nocapture: 1);
2136 if (auto *IndexLoad = locateIndexesFromGEP(In: Indexes))
2137 return IndexLoad;
2138
2139 LLVM_DEBUG(dbgs() << " Unable to locate Index from intrinsic\n");
2140 return nullptr;
2141}
2142
2143// Because of the awkward definition of many Hexagon intrinsics, we often have
2144// to reinterpret an HVX-native <64 x i16> as <32 x i32>, which in practice is
2145// a NOP for all use cases, so this only exists to make the IR builder happy.
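// A minimal sketch of the emitted IR (names are illustrative only):
//   %s = shufflevector <64 x i16> %v, <64 x i16> %v, <64 x i32> <identity>
//   %c = bitcast <64 x i16> %s to <32 x i32>
// i.e. an identity shuffle followed by a bitcast to the 32 x i32 HVX type.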
2146inline Value *getReinterpretiveCast_i16_to_i32(const HexagonVectorCombine &HVC,
2147 IRBuilderBase &Builder,
2148 LLVMContext &Ctx, Value *I) {
2149  assert(I && "Unable to reinterpret cast");
2150 Type *NT = HVC.getHvxTy(ElemTy: HVC.getIntTy(Width: 32), Pair: false);
2151 std::vector<unsigned> shuffleMask;
2152 for (unsigned i = 0; i < 64; ++i)
2153 shuffleMask.push_back(x: i);
2154 Constant *Mask = llvm::ConstantDataVector::get(Context&: Ctx, Elts: shuffleMask);
2155 Value *CastShuffle =
2156 Builder.CreateShuffleVector(V1: I, V2: I, Mask, Name: "identity_shuffle");
2157 return Builder.CreateBitCast(V: CastShuffle, DestTy: NT, Name: "cst64_i16_to_32_i32");
2158}
2159
2160// Recast <128 x i8> as <32 x i32>
2161inline Value *getReinterpretiveCast_i8_to_i32(const HexagonVectorCombine &HVC,
2162 IRBuilderBase &Builder,
2163 LLVMContext &Ctx, Value *I) {
2164  assert(I && "Unable to reinterpret cast");
2165 Type *NT = HVC.getHvxTy(ElemTy: HVC.getIntTy(Width: 32), Pair: false);
2166 std::vector<unsigned> shuffleMask;
2167 for (unsigned i = 0; i < 128; ++i)
2168 shuffleMask.push_back(x: i);
2169 Constant *Mask = llvm::ConstantDataVector::get(Context&: Ctx, Elts: shuffleMask);
2170 Value *CastShuffle =
2171 Builder.CreateShuffleVector(V1: I, V2: I, Mask, Name: "identity_shuffle");
2172 return Builder.CreateBitCast(V: CastShuffle, DestTy: NT, Name: "cst128_i8_to_32_i32");
2173}
2174
2175// Create <32 x i32> mask reinterpreted as <128 x i1> with a given pattern
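// For example, the 0x00ff00ff pattern used by the callers splats that word
// across a <32 x i32> vector and feeds it to V6_vandvrt with ~0, producing
// a <128 x i1> predicate in which every other byte lane (the low byte of
// each halfword) is set.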
2176inline Value *get_i32_Mask(const HexagonVectorCombine &HVC,
2177 IRBuilderBase &Builder, LLVMContext &Ctx,
2178 unsigned int pattern) {
2179 std::vector<unsigned int> byteMask;
2180 for (unsigned i = 0; i < 32; ++i)
2181 byteMask.push_back(x: pattern);
2182
2183 return Builder.CreateIntrinsic(
2184 RetTy: HVC.getBoolTy(ElemCount: 128), ID: HVC.HST.getIntrinsicId(Opc: Hexagon::V6_vandvrt),
2185 Args: {llvm::ConstantDataVector::get(Context&: Ctx, Elts: byteMask), HVC.getConstInt(Val: ~0)},
2186 FMFSource: nullptr);
2187}
2188
2189Value *HvxIdioms::processVScatter(Instruction &In) const {
2190 auto *InpTy = dyn_cast<VectorType>(Val: In.getOperand(i: 0)->getType());
2191  assert(InpTy && "Cannot handle non-vector type for llvm.scatter/gather");
2192 unsigned InpSize = HVC.getSizeOf(Ty: InpTy);
2193 auto *F = In.getFunction();
2194 LLVMContext &Ctx = F->getContext();
2195 auto *ElemTy = dyn_cast<IntegerType>(Val: InpTy->getElementType());
2196 assert(ElemTy && "llvm.scatter needs integer type argument");
2197 unsigned ElemWidth = HVC.DL.getTypeAllocSize(Ty: ElemTy);
2198 LLVM_DEBUG({
2199 unsigned Elements = HVC.length(InpTy);
2200 dbgs() << "\n[Process scatter](" << In << ")\n" << *In.getParent() << "\n";
2201 dbgs() << " Input type(" << *InpTy << ") elements(" << Elements
2202 << ") VecLen(" << InpSize << ") type(" << *ElemTy << ") ElemWidth("
2203 << ElemWidth << ")\n";
2204 });
2205
2206 IRBuilder Builder(In.getParent(), In.getIterator(),
2207 InstSimplifyFolder(HVC.DL));
2208
2209 auto *ValueToScatter = In.getOperand(i: 0);
2210 LLVM_DEBUG(dbgs() << " ValueToScatter : " << *ValueToScatter << "\n");
2211
2212 if (HVC.HST.getVectorLength() != InpSize) {
2213 LLVM_DEBUG(dbgs() << "Unhandled vector size(" << InpSize
2214 << ") for vscatter\n");
2215 return nullptr;
2216 }
2217
2218 // Base address of indexes.
2219 auto *IndexLoad = locateAddressFromIntrinsic(In: &In);
2220 if (!IndexLoad)
2221 return nullptr;
2222 LLVM_DEBUG(dbgs() << " IndexLoad : " << *IndexLoad << "\n");
2223
2224 // Address of destination. Must be in VTCM.
2225 auto *Ptr = getPointer(Ptr: IndexLoad);
2226 if (!Ptr)
2227 return nullptr;
2228 LLVM_DEBUG(dbgs() << " Ptr : " << *Ptr << "\n");
2229 // Indexes/offsets
2230 auto *Indexes = locateIndexesFromIntrinsic(In: &In);
2231 if (!Indexes)
2232 return nullptr;
2233 LLVM_DEBUG(dbgs() << " Indexes : " << *Indexes << "\n");
2234 Value *CastedDst = Builder.CreateBitOrPointerCast(V: Ptr, DestTy: Type::getInt32Ty(C&: Ctx),
2235 Name: "cst_ptr_to_i32");
2236 LLVM_DEBUG(dbgs() << " CastedDst : " << *CastedDst << "\n");
2237 // Adjust Indexes
2238 auto *cstDataVector = dyn_cast<ConstantDataVector>(Val: Indexes);
2239 Value *CastIndex = nullptr;
2240 if (cstDataVector) {
2241    // Our indexes are represented as a constant. We need them in a reg.
2242 Type *IndexVectorType = HVC.getHvxTy(ElemTy: HVC.getIntTy(Width: 32), Pair: false);
2243 AllocaInst *IndexesAlloca = Builder.CreateAlloca(Ty: IndexVectorType);
2244 [[maybe_unused]] auto *StoreIndexes =
2245 Builder.CreateStore(Val: cstDataVector, Ptr: IndexesAlloca);
2246 LLVM_DEBUG(dbgs() << " StoreIndexes : " << *StoreIndexes << "\n");
2247 CastIndex =
2248 Builder.CreateLoad(Ty: IndexVectorType, Ptr: IndexesAlloca, Name: "reload_index");
2249 } else {
2250 if (ElemWidth == 2)
2251 CastIndex = getReinterpretiveCast_i16_to_i32(HVC, Builder, Ctx, I: Indexes);
2252 else
2253 CastIndex = Indexes;
2254 }
2255 LLVM_DEBUG(dbgs() << " Cast index : " << *CastIndex << ")\n");
2256
2257 if (ElemWidth == 1) {
2258 // v128i8 There is no native instruction for this.
2259    // Do this as two Hi/Lo scatters with masking.
2260 Type *NT = HVC.getHvxTy(ElemTy: HVC.getIntTy(Width: 32), Pair: false);
2261 // Extend indexes. We assume that indexes are in 128i8 format - need to
2262 // expand them to Hi/Lo 64i16
2263 Value *CastIndexes = Builder.CreateBitCast(V: CastIndex, DestTy: NT, Name: "cast_to_32i32");
2264 auto V6_vunpack = HVC.HST.getIntrinsicId(Opc: Hexagon::V6_vunpackub);
2265 auto *UnpackedIndexes = Builder.CreateIntrinsic(
2266 RetTy: HVC.getHvxTy(ElemTy: HVC.getIntTy(Width: 32), Pair: true), ID: V6_vunpack, Args: CastIndexes, FMFSource: nullptr);
2267 LLVM_DEBUG(dbgs() << " UnpackedIndexes : " << *UnpackedIndexes << ")\n");
2268
2269 auto V6_hi = HVC.HST.getIntrinsicId(Opc: Hexagon::V6_hi);
2270 auto V6_lo = HVC.HST.getIntrinsicId(Opc: Hexagon::V6_lo);
2271 [[maybe_unused]] Value *IndexHi =
2272 HVC.createHvxIntrinsic(Builder, IntID: V6_hi, RetTy: NT, Args: UnpackedIndexes);
2273 [[maybe_unused]] Value *IndexLo =
2274 HVC.createHvxIntrinsic(Builder, IntID: V6_lo, RetTy: NT, Args: UnpackedIndexes);
2275 LLVM_DEBUG(dbgs() << " UnpackedIndHi : " << *IndexHi << ")\n");
2276 LLVM_DEBUG(dbgs() << " UnpackedIndLo : " << *IndexLo << ")\n");
2277 // Now unpack values to scatter
2278 Value *CastSrc =
2279 getReinterpretiveCast_i8_to_i32(HVC, Builder, Ctx, I: ValueToScatter);
2280 LLVM_DEBUG(dbgs() << " CastSrc : " << *CastSrc << ")\n");
2281 auto *UnpackedValueToScatter = Builder.CreateIntrinsic(
2282 RetTy: HVC.getHvxTy(ElemTy: HVC.getIntTy(Width: 32), Pair: true), ID: V6_vunpack, Args: CastSrc, FMFSource: nullptr);
2283 LLVM_DEBUG(dbgs() << " UnpackedValToScat: " << *UnpackedValueToScatter
2284 << ")\n");
2285
2286 [[maybe_unused]] Value *UVSHi =
2287 HVC.createHvxIntrinsic(Builder, IntID: V6_hi, RetTy: NT, Args: UnpackedValueToScatter);
2288 [[maybe_unused]] Value *UVSLo =
2289 HVC.createHvxIntrinsic(Builder, IntID: V6_lo, RetTy: NT, Args: UnpackedValueToScatter);
2290 LLVM_DEBUG(dbgs() << " UVSHi : " << *UVSHi << ")\n");
2291 LLVM_DEBUG(dbgs() << " UVSLo : " << *UVSLo << ")\n");
2292
2293 // Create the mask for individual bytes
2294 auto *QByteMask = get_i32_Mask(HVC, Builder, Ctx, pattern: 0x00ff00ff);
2295 LLVM_DEBUG(dbgs() << " QByteMask : " << *QByteMask << "\n");
2296 [[maybe_unused]] auto *ResHi = Builder.CreateIntrinsic(
2297 RetTy: Type::getVoidTy(C&: Ctx), ID: Intrinsic::hexagon_V6_vscattermhq_128B,
2298 Args: {QByteMask, CastedDst, HVC.getConstInt(DEFAULT_HVX_VTCM_PAGE_SIZE),
2299 IndexHi, UVSHi},
2300 FMFSource: nullptr);
2301 LLVM_DEBUG(dbgs() << " ResHi : " << *ResHi << ")\n");
2302 return Builder.CreateIntrinsic(
2303 RetTy: Type::getVoidTy(C&: Ctx), ID: Intrinsic::hexagon_V6_vscattermhq_128B,
2304 Args: {QByteMask, CastedDst, HVC.getConstInt(DEFAULT_HVX_VTCM_PAGE_SIZE),
2305 IndexLo, UVSLo},
2306 FMFSource: nullptr);
2307 } else if (ElemWidth == 2) {
2308 Value *CastSrc =
2309 getReinterpretiveCast_i16_to_i32(HVC, Builder, Ctx, I: ValueToScatter);
2310 LLVM_DEBUG(dbgs() << " CastSrc : " << *CastSrc << ")\n");
2311 return Builder.CreateIntrinsic(
2312 RetTy: Type::getVoidTy(C&: Ctx), ID: Intrinsic::hexagon_V6_vscattermh_128B,
2313 Args: {CastedDst, HVC.getConstInt(DEFAULT_HVX_VTCM_PAGE_SIZE), CastIndex,
2314 CastSrc},
2315 FMFSource: nullptr);
2316 } else if (ElemWidth == 4) {
2317 return Builder.CreateIntrinsic(
2318 RetTy: Type::getVoidTy(C&: Ctx), ID: Intrinsic::hexagon_V6_vscattermw_128B,
2319 Args: {CastedDst, HVC.getConstInt(DEFAULT_HVX_VTCM_PAGE_SIZE), CastIndex,
2320 ValueToScatter},
2321 FMFSource: nullptr);
2322 } else {
2323 LLVM_DEBUG(dbgs() << "Unhandled element type for vscatter\n");
2324 return nullptr;
2325 }
2326}
2327
2328Value *HvxIdioms::processVGather(Instruction &In) const {
2329 [[maybe_unused]] auto *InpTy =
2330 dyn_cast<VectorType>(Val: In.getOperand(i: 0)->getType());
2331  assert(InpTy && "Cannot handle non-vector type for llvm.gather");
2332 [[maybe_unused]] auto *ElemTy =
2333 dyn_cast<PointerType>(Val: InpTy->getElementType());
2334 assert(ElemTy && "llvm.gather needs vector of ptr argument");
2335 auto *F = In.getFunction();
2336 LLVMContext &Ctx = F->getContext();
2337 LLVM_DEBUG(dbgs() << "\n[Process gather](" << In << ")\n"
2338 << *In.getParent() << "\n");
2339 LLVM_DEBUG(dbgs() << " Input type(" << *InpTy << ") elements("
2340 << HVC.length(InpTy) << ") VecLen(" << HVC.getSizeOf(InpTy)
2341 << ") type(" << *ElemTy << ") Access alignment("
2342 << *In.getOperand(1) << ") AddressSpace("
2343 << ElemTy->getAddressSpace() << ")\n");
2344
2345 // TODO: Handle masking of elements.
2346 assert(dyn_cast<VectorType>(In.getOperand(2)->getType()) &&
2347 "llvm.gather needs vector for mask");
2348 IRBuilder Builder(In.getParent(), In.getIterator(),
2349 InstSimplifyFolder(HVC.DL));
2350
2351  // See who is using the result. The difference between the LLVM and HVX vgather
2352  // intrinsics makes it impossible to handle all cases with temp storage. Alloca
2353 // in VTCM is not yet supported, so for now we just bail out for those cases.
2354 HvxIdioms::DstQualifier Qual = HvxIdioms::Undefined;
2355 Instruction *Dst = locateDestination(In: &In, Qual);
2356 if (!Dst) {
2357 LLVM_DEBUG(dbgs() << " Unable to locate vgather destination\n");
2358 return nullptr;
2359 }
2360 LLVM_DEBUG(dbgs() << " Destination : " << *Dst << " Qual(" << Qual
2361 << ")\n");
2362
2363 // Address of destination. Must be in VTCM.
2364 auto *Ptr = getPointer(Ptr: Dst);
2365 if (!Ptr) {
2366 LLVM_DEBUG(dbgs() << "Could not locate vgather destination ptr\n");
2367 return nullptr;
2368 }
2369
2370 // Result type. Assume it is a vector type.
2371 auto *DstType = cast<VectorType>(Val: getIndexType(In: Dst));
2372 assert(DstType && "Cannot handle non vector dst type for llvm.gather");
2373
2374 // Base address for sources to be loaded
2375 auto *IndexLoad = locateAddressFromIntrinsic(In: &In);
2376 if (!IndexLoad)
2377 return nullptr;
2378 LLVM_DEBUG(dbgs() << " IndexLoad : " << *IndexLoad << "\n");
2379
2380 // Gather indexes/offsets
2381 auto *Indexes = locateIndexesFromIntrinsic(In: &In);
2382 if (!Indexes)
2383 return nullptr;
2384 LLVM_DEBUG(dbgs() << " Indexes : " << *Indexes << "\n");
2385
2386 Instruction *Gather = nullptr;
2387 Type *NT = HVC.getHvxTy(ElemTy: HVC.getIntTy(Width: 32), Pair: false);
2388 if (Qual == HvxIdioms::LdSt || Qual == HvxIdioms::Arithmetic) {
2389 // We fully assume the address space is in VTCM. We also assume that all
2390 // pointers in Operand(0) have the same base(!).
2391 // This is the most basic case of all the above.
2392 unsigned OutputSize = HVC.getSizeOf(Ty: DstType);
2393 auto *DstElemTy = cast<IntegerType>(Val: DstType->getElementType());
2394 unsigned ElemWidth = HVC.DL.getTypeAllocSize(Ty: DstElemTy);
2395 LLVM_DEBUG(dbgs() << " Buffer type : " << *Ptr->getType()
2396 << " Address space ("
2397 << Ptr->getType()->getPointerAddressSpace() << ")\n"
2398 << " Result type : " << *DstType
2399 << "\n Size in bytes : " << OutputSize
2400 << " element type(" << *DstElemTy
2401 << ")\n ElemWidth : " << ElemWidth << " bytes\n");
2402
2403 auto *IndexType = cast<VectorType>(Val: getIndexType(In: Indexes));
2404 assert(IndexType && "Cannot handle non vector index type for llvm.gather");
2405 unsigned IndexWidth = HVC.DL.getTypeAllocSize(Ty: IndexType->getElementType());
2406 LLVM_DEBUG(dbgs() << " IndexWidth(" << IndexWidth << ")\n");
2407
2408 // Intrinsic takes i32 instead of pointer so cast.
2409 Value *CastedPtr = Builder.CreateBitOrPointerCast(
2410 V: IndexLoad, DestTy: Type::getInt32Ty(C&: Ctx), Name: "cst_ptr_to_i32");
2411 // [llvm_ptr_ty, llvm_i32_ty, llvm_i32_ty, ...]
2412 // int_hexagon_V6_vgathermh [... , llvm_v16i32_ty]
2413 // int_hexagon_V6_vgathermh_128B [... , llvm_v32i32_ty]
2414 // int_hexagon_V6_vgathermhw [... , llvm_v32i32_ty]
2415 // int_hexagon_V6_vgathermhw_128B [... , llvm_v64i32_ty]
2416 // int_hexagon_V6_vgathermw [... , llvm_v16i32_ty]
2417 // int_hexagon_V6_vgathermw_128B [... , llvm_v32i32_ty]
2418 if (HVC.HST.getVectorLength() == OutputSize) {
2419 if (ElemWidth == 1) {
2420 // v128i8 There is no native instruction for this.
2421 // Do this as two Hi/Lo gathers with masking.
2422 // Unpack indexes. We assume that indexes are in 128i8 format - need to
2423 // expand them to Hi/Lo 64i16
2424 Value *CastIndexes =
2425 Builder.CreateBitCast(V: Indexes, DestTy: NT, Name: "cast_to_32i32");
2426 auto V6_vunpack = HVC.HST.getIntrinsicId(Opc: Hexagon::V6_vunpackub);
2427 auto *UnpackedIndexes =
2428 Builder.CreateIntrinsic(RetTy: HVC.getHvxTy(ElemTy: HVC.getIntTy(Width: 32), Pair: true),
2429 ID: V6_vunpack, Args: CastIndexes, FMFSource: nullptr);
2430 LLVM_DEBUG(dbgs() << " UnpackedIndexes : " << *UnpackedIndexes
2431 << ")\n");
2432
2433 auto V6_hi = HVC.HST.getIntrinsicId(Opc: Hexagon::V6_hi);
2434 auto V6_lo = HVC.HST.getIntrinsicId(Opc: Hexagon::V6_lo);
2435 [[maybe_unused]] Value *IndexHi =
2436 HVC.createHvxIntrinsic(Builder, IntID: V6_hi, RetTy: NT, Args: UnpackedIndexes);
2437 [[maybe_unused]] Value *IndexLo =
2438 HVC.createHvxIntrinsic(Builder, IntID: V6_lo, RetTy: NT, Args: UnpackedIndexes);
2439 LLVM_DEBUG(dbgs() << " UnpackedIndHi : " << *IndexHi << ")\n");
2440 LLVM_DEBUG(dbgs() << " UnpackedIndLo : " << *IndexLo << ")\n");
2441 // Create the mask for individual bytes
2442 auto *QByteMask = get_i32_Mask(HVC, Builder, Ctx, pattern: 0x00ff00ff);
2443 LLVM_DEBUG(dbgs() << " QByteMask : " << *QByteMask << "\n");
2444 // We use our destination allocation as a temp storage
2445 // This is unlikely to work properly for masked gather.
2446 auto V6_vgather = HVC.HST.getIntrinsicId(Opc: Hexagon::V6_vgathermhq);
2447 [[maybe_unused]] auto GatherHi = Builder.CreateIntrinsic(
2448 RetTy: Type::getVoidTy(C&: Ctx), ID: V6_vgather,
2449 Args: {Ptr, QByteMask, CastedPtr,
2450 HVC.getConstInt(DEFAULT_HVX_VTCM_PAGE_SIZE), IndexHi},
2451 FMFSource: nullptr);
2452 LLVM_DEBUG(dbgs() << " GatherHi : " << *GatherHi << ")\n");
2453 // Rematerialize the result
2454 [[maybe_unused]] Value *LoadedResultHi = Builder.CreateLoad(
2455 Ty: HVC.getHvxTy(ElemTy: HVC.getIntTy(Width: 32), Pair: false), Ptr, Name: "temp_result_hi");
2456 LLVM_DEBUG(dbgs() << " LoadedResultHi : " << *LoadedResultHi << "\n");
2457        // Same for the low part. Here we use Gather to return a non-null result
2458        // from this function and continue to iterate. We are also deleting the
2459        // Dst store below.
2460 Gather = Builder.CreateIntrinsic(
2461 RetTy: Type::getVoidTy(C&: Ctx), ID: V6_vgather,
2462 Args: {Ptr, QByteMask, CastedPtr,
2463 HVC.getConstInt(DEFAULT_HVX_VTCM_PAGE_SIZE), IndexLo},
2464 FMFSource: nullptr);
2465 LLVM_DEBUG(dbgs() << " GatherLo : " << *Gather << ")\n");
2466 Value *LoadedResultLo = Builder.CreateLoad(
2467 Ty: HVC.getHvxTy(ElemTy: HVC.getIntTy(Width: 32), Pair: false), Ptr, Name: "temp_result_lo");
2468 LLVM_DEBUG(dbgs() << " LoadedResultLo : " << *LoadedResultLo << "\n");
2469        // Now we have properly sized bytes in every other position, e.g.
2470        // B b A a c a A b B c f F g G h H is represented as
2471        // B . b . A . a . c . a . A . b . B . c . f . F . g . G . h . H
2472        // Use vpackeb to collect them.
2473 auto V6_vpackeb = HVC.HST.getIntrinsicId(Opc: Hexagon::V6_vpackeb);
2474 [[maybe_unused]] auto Res = Builder.CreateIntrinsic(
2475 RetTy: NT, ID: V6_vpackeb, Args: {LoadedResultHi, LoadedResultLo}, FMFSource: nullptr);
2476 LLVM_DEBUG(dbgs() << " ScaledRes : " << *Res << "\n");
2477 [[maybe_unused]] auto *StoreRes = Builder.CreateStore(Val: Res, Ptr);
2478 LLVM_DEBUG(dbgs() << " StoreRes : " << *StoreRes << "\n");
2479 } else if (ElemWidth == 2) {
2480 // v32i16
2481 if (IndexWidth == 2) {
2482 // Reinterprete 64i16 as 32i32. Only needed for syntactic IR match.
2483 Value *CastIndex =
2484 getReinterpretiveCast_i16_to_i32(HVC, Builder, Ctx, I: Indexes);
2485 LLVM_DEBUG(dbgs() << " Cast index: " << *CastIndex << ")\n");
2486 // shift all i16 left by 1 to match short addressing mode instead of
2487 // byte.
2488 auto V6_vaslh = HVC.HST.getIntrinsicId(Opc: Hexagon::V6_vaslh);
2489 Value *AdjustedIndex = HVC.createHvxIntrinsic(
2490 Builder, IntID: V6_vaslh, RetTy: NT, Args: {CastIndex, HVC.getConstInt(Val: 1)});
2491 LLVM_DEBUG(dbgs()
2492 << " Shifted half index: " << *AdjustedIndex << ")\n");
2493
2494 auto V6_vgather = HVC.HST.getIntrinsicId(Opc: Hexagon::V6_vgathermh);
2495 // The 3rd argument is the size of the region to gather from. Probably
2496 // want to set it to max VTCM size.
2497 Gather = Builder.CreateIntrinsic(
2498 RetTy: Type::getVoidTy(C&: Ctx), ID: V6_vgather,
2499 Args: {Ptr, CastedPtr, HVC.getConstInt(DEFAULT_HVX_VTCM_PAGE_SIZE),
2500 AdjustedIndex},
2501 FMFSource: nullptr);
2502          LLVM_DEBUG(for (auto &U : Dst->uses()) {
2503            if (auto *UI = dyn_cast<Instruction>(Val: U.getUser()))
2504              dbgs() << " dst used by: " << *UI << "\n";
2505          });
2506          LLVM_DEBUG(for (auto &U : In.uses()) {
2507            if (auto *UI = dyn_cast<Instruction>(Val: U.getUser()))
2508              dbgs() << " In used by : " << *UI << "\n";
2509          });
2510 // Create temp load from result in case the result is used by any
2511 // other instruction.
2512 Value *LoadedResult = Builder.CreateLoad(
2513 Ty: HVC.getHvxTy(ElemTy: HVC.getIntTy(Width: 16), Pair: false), Ptr, Name: "temp_result");
2514 LLVM_DEBUG(dbgs() << " LoadedResult : " << *LoadedResult << "\n");
2515 In.replaceAllUsesWith(V: LoadedResult);
2516 } else {
2517          LLVM_DEBUG(dbgs() << " Unhandled index type for vgather\n");
2518 return nullptr;
2519 }
2520 } else if (ElemWidth == 4) {
2521 if (IndexWidth == 4) {
2522 // v32i32
2523 auto V6_vaslh = HVC.HST.getIntrinsicId(Opc: Hexagon::V6_vaslh);
2524 Value *AdjustedIndex = HVC.createHvxIntrinsic(
2525 Builder, IntID: V6_vaslh, RetTy: NT, Args: {Indexes, HVC.getConstInt(Val: 2)});
2526 LLVM_DEBUG(dbgs()
2527 << " Shifted word index: " << *AdjustedIndex << ")\n");
2528 Gather = Builder.CreateIntrinsic(
2529 RetTy: Type::getVoidTy(C&: Ctx), ID: Intrinsic::hexagon_V6_vgathermw_128B,
2530 Args: {Ptr, CastedPtr, HVC.getConstInt(DEFAULT_HVX_VTCM_PAGE_SIZE),
2531 AdjustedIndex},
2532 FMFSource: nullptr);
2533 } else {
2534 LLVM_DEBUG(dbgs() << " Unhandled index type for vgather\n");
2535 return nullptr;
2536 }
2537 } else {
2538 LLVM_DEBUG(dbgs() << " Unhandled element type for vgather\n");
2539 return nullptr;
2540 }
2541 } else if (HVC.HST.getVectorLength() == OutputSize * 2) {
2542 // This is half of the reg width, duplicate low in high
2543 LLVM_DEBUG(dbgs() << " Unhandled half of register size\n");
2544 return nullptr;
2545 } else if (HVC.HST.getVectorLength() * 2 == OutputSize) {
2546      LLVM_DEBUG(dbgs() << " Unhandled twice the register size\n");
2547 return nullptr;
2548 }
2549    // Erase the original intrinsic and the store that consumes it.
2550 // HVX will create a pseudo for gather that is expanded to gather + store
2551 // during packetization.
2552 Dst->eraseFromParent();
2553 } else if (Qual == HvxIdioms::LLVM_Scatter) {
2554 // Gather feeds directly into scatter.
2555 LLVM_DEBUG({
2556 auto *DstInpTy = cast<VectorType>(Dst->getOperand(1)->getType());
2557      assert(DstInpTy && "Cannot handle non-vector type for llvm.scatter");
2558 unsigned DstInpSize = HVC.getSizeOf(DstInpTy);
2559 unsigned DstElements = HVC.length(DstInpTy);
2560 auto *DstElemTy = cast<PointerType>(DstInpTy->getElementType());
2561 assert(DstElemTy && "llvm.scatter needs vector of ptr argument");
2562 dbgs() << " Gather feeds into scatter\n Values to scatter : "
2563 << *Dst->getOperand(0) << "\n";
2564 dbgs() << " Dst type(" << *DstInpTy << ") elements(" << DstElements
2565 << ") VecLen(" << DstInpSize << ") type(" << *DstElemTy
2566 << ") Access alignment(" << *Dst->getOperand(2) << ")\n";
2567 });
2568 // Address of source
2569 auto *Src = getPointer(Ptr: IndexLoad);
2570 if (!Src)
2571 return nullptr;
2572 LLVM_DEBUG(dbgs() << " Src : " << *Src << "\n");
2573
2574 if (!isa<PointerType>(Val: Src->getType())) {
2575 LLVM_DEBUG(dbgs() << " Source is not a pointer type...\n");
2576 return nullptr;
2577 }
2578
2579 Value *CastedSrc = Builder.CreateBitOrPointerCast(
2580 V: Src, DestTy: Type::getInt32Ty(C&: Ctx), Name: "cst_ptr_to_i32");
2581 LLVM_DEBUG(dbgs() << " CastedSrc: " << *CastedSrc << "\n");
2582
2583 auto *DstLoad = locateAddressFromIntrinsic(In: Dst);
2584 if (!DstLoad) {
2585 LLVM_DEBUG(dbgs() << " Unable to locate DstLoad\n");
2586 return nullptr;
2587 }
2588 LLVM_DEBUG(dbgs() << " DstLoad : " << *DstLoad << "\n");
2589
2590 Value *Ptr = getPointer(Ptr: DstLoad);
2591 if (!Ptr)
2592 return nullptr;
2593 LLVM_DEBUG(dbgs() << " Ptr : " << *Ptr << "\n");
2594 Value *CastIndex =
2595 getReinterpretiveCast_i16_to_i32(HVC, Builder, Ctx, I: IndexLoad);
2596 LLVM_DEBUG(dbgs() << " Cast index: " << *CastIndex << ")\n");
2597 // Shift all i16 left by 1 to match short addressing mode instead of
2598 // byte.
2599 auto V6_vaslh = HVC.HST.getIntrinsicId(Opc: Hexagon::V6_vaslh);
2600 Value *AdjustedIndex = HVC.createHvxIntrinsic(
2601 Builder, IntID: V6_vaslh, RetTy: NT, Args: {CastIndex, HVC.getConstInt(Val: 1)});
2602 LLVM_DEBUG(dbgs() << " Shifted half index: " << *AdjustedIndex << ")\n");
2603
2604 return Builder.CreateIntrinsic(
2605 RetTy: Type::getVoidTy(C&: Ctx), ID: Intrinsic::hexagon_V6_vgathermh_128B,
2606 Args: {Ptr, CastedSrc, HVC.getConstInt(DEFAULT_HVX_VTCM_PAGE_SIZE),
2607 AdjustedIndex},
2608 FMFSource: nullptr);
2609 } else if (Qual == HvxIdioms::HEX_Gather_Scatter) {
2610    // Gather feeds into a previously inserted pseudo intrinsic.
2611    // These cannot be in the same packet, so we need to generate another
2612 // pseudo that is expanded to .tmp + store V6_vgathermh_pseudo
2613 // V6_vgathermh_pseudo (ins IntRegs:$_dst_, s4_0Imm:$Ii, IntRegs:$Rt,
2614 // ModRegs:$Mu, HvxVR:$Vv)
2615 if (isa<AllocaInst>(Val: IndexLoad)) {
2616 auto *cstDataVector = dyn_cast<ConstantDataVector>(Val: Indexes);
2617 if (cstDataVector) {
2618        // Our indexes are represented as a constant. We need them in a reg.
2619        // This most likely will not work properly, since alloca gives us a DDR
2620        // stack location. This will be fixed once we teach the compiler about VTCM.
2621 AllocaInst *IndexesAlloca = Builder.CreateAlloca(Ty: NT);
2622 [[maybe_unused]] auto *StoreIndexes =
2623 Builder.CreateStore(Val: cstDataVector, Ptr: IndexesAlloca);
2624 LLVM_DEBUG(dbgs() << " StoreIndexes : " << *StoreIndexes << "\n");
2625 Value *LoadedIndex =
2626 Builder.CreateLoad(Ty: NT, Ptr: IndexesAlloca, Name: "reload_index");
2627 AllocaInst *ResultAlloca = Builder.CreateAlloca(Ty: NT);
2628 LLVM_DEBUG(dbgs() << " ResultAlloca : " << *ResultAlloca << "\n");
2629
2630 Value *CastedSrc = Builder.CreateBitOrPointerCast(
2631 V: IndexLoad, DestTy: Type::getInt32Ty(C&: Ctx), Name: "cst_ptr_to_i32");
2632 LLVM_DEBUG(dbgs() << " CastedSrc : " << *CastedSrc << "\n");
2633
2634 Gather = Builder.CreateIntrinsic(
2635 RetTy: Type::getVoidTy(C&: Ctx), ID: Intrinsic::hexagon_V6_vgathermh_128B,
2636 Args: {ResultAlloca, CastedSrc,
2637 HVC.getConstInt(DEFAULT_HVX_VTCM_PAGE_SIZE), LoadedIndex},
2638 FMFSource: nullptr);
2639 Value *LoadedResult = Builder.CreateLoad(
2640 Ty: HVC.getHvxTy(ElemTy: HVC.getIntTy(Width: 16), Pair: false), Ptr: ResultAlloca, Name: "temp_result");
2641 LLVM_DEBUG(dbgs() << " LoadedResult : " << *LoadedResult << "\n");
2642 LLVM_DEBUG(dbgs() << " Gather : " << *Gather << "\n");
2643 In.replaceAllUsesWith(V: LoadedResult);
2644 }
2645 } else {
2646 // Address of source
2647 auto *Src = getPointer(Ptr: IndexLoad);
2648 if (!Src)
2649 return nullptr;
2650 LLVM_DEBUG(dbgs() << " Src : " << *Src << "\n");
2651
2652 Value *CastedSrc = Builder.CreateBitOrPointerCast(
2653 V: Src, DestTy: Type::getInt32Ty(C&: Ctx), Name: "cst_ptr_to_i32");
2654 LLVM_DEBUG(dbgs() << " CastedSrc: " << *CastedSrc << "\n");
2655
2656 auto *DstLoad = locateAddressFromIntrinsic(In: Dst);
2657 if (!DstLoad)
2658 return nullptr;
2659 LLVM_DEBUG(dbgs() << " DstLoad : " << *DstLoad << "\n");
2660 auto *Ptr = getPointer(Ptr: DstLoad);
2661 if (!Ptr)
2662 return nullptr;
2663 LLVM_DEBUG(dbgs() << " Ptr : " << *Ptr << "\n");
2664
2665 Gather = Builder.CreateIntrinsic(
2666 RetTy: Type::getVoidTy(C&: Ctx), ID: Intrinsic::hexagon_V6_vgather_vscattermh,
2667 Args: {Ptr, CastedSrc, HVC.getConstInt(DEFAULT_HVX_VTCM_PAGE_SIZE),
2668 Indexes},
2669 FMFSource: nullptr);
2670 }
2671 return Gather;
2672 } else if (Qual == HvxIdioms::HEX_Scatter) {
2673    // This is the case when the result of a gather is used as an argument to
2674    // Intrinsic::hexagon_V6_vscattermh_128B. Most likely we just inserted it
2675    // ourselves. We have to create an alloca, store to it, and replace all
2676    // uses with that.
2677 AllocaInst *ResultAlloca = Builder.CreateAlloca(Ty: NT);
2678 Value *CastedSrc = Builder.CreateBitOrPointerCast(
2679 V: IndexLoad, DestTy: Type::getInt32Ty(C&: Ctx), Name: "cst_ptr_to_i32");
2680 LLVM_DEBUG(dbgs() << " CastedSrc : " << *CastedSrc << "\n");
2681 Value *CastIndex =
2682 getReinterpretiveCast_i16_to_i32(HVC, Builder, Ctx, I: Indexes);
2683 LLVM_DEBUG(dbgs() << " Cast index : " << *CastIndex << ")\n");
2684
2685 Gather = Builder.CreateIntrinsic(
2686 RetTy: Type::getVoidTy(C&: Ctx), ID: Intrinsic::hexagon_V6_vgathermh_128B,
2687 Args: {ResultAlloca, CastedSrc, HVC.getConstInt(DEFAULT_HVX_VTCM_PAGE_SIZE),
2688 CastIndex},
2689 FMFSource: nullptr);
2690 Value *LoadedResult = Builder.CreateLoad(
2691 Ty: HVC.getHvxTy(ElemTy: HVC.getIntTy(Width: 16), Pair: false), Ptr: ResultAlloca, Name: "temp_result");
2692 LLVM_DEBUG(dbgs() << " LoadedResult : " << *LoadedResult << "\n");
2693 In.replaceAllUsesWith(V: LoadedResult);
2694 } else if (Qual == HvxIdioms::HEX_Gather) {
2695    // Gather feeds into another gather that has already been replaced with
2696    // hexagon_V6_vgathermh_128B.
2697 if (isa<AllocaInst>(Val: IndexLoad)) {
2698 auto *cstDataVector = dyn_cast<ConstantDataVector>(Val: Indexes);
2699 if (cstDataVector) {
2700        // Our indexes are represented as a constant. We need them in a reg.
2701 AllocaInst *IndexesAlloca = Builder.CreateAlloca(Ty: NT);
2702
2703 [[maybe_unused]] auto *StoreIndexes =
2704 Builder.CreateStore(Val: cstDataVector, Ptr: IndexesAlloca);
2705 LLVM_DEBUG(dbgs() << " StoreIndexes : " << *StoreIndexes << "\n");
2706 Value *LoadedIndex =
2707 Builder.CreateLoad(Ty: NT, Ptr: IndexesAlloca, Name: "reload_index");
2708 AllocaInst *ResultAlloca = Builder.CreateAlloca(Ty: NT);
2709 LLVM_DEBUG(dbgs() << " ResultAlloca : " << *ResultAlloca
2710 << "\n AddressSpace: "
2711 << ResultAlloca->getAddressSpace() << "\n";);
2712
2713 Value *CastedSrc = Builder.CreateBitOrPointerCast(
2714 V: IndexLoad, DestTy: Type::getInt32Ty(C&: Ctx), Name: "cst_ptr_to_i32");
2715 LLVM_DEBUG(dbgs() << " CastedSrc : " << *CastedSrc << "\n");
2716
2717 Gather = Builder.CreateIntrinsic(
2718 RetTy: Type::getVoidTy(C&: Ctx), ID: Intrinsic::hexagon_V6_vgathermh_128B,
2719 Args: {ResultAlloca, CastedSrc,
2720 HVC.getConstInt(DEFAULT_HVX_VTCM_PAGE_SIZE), LoadedIndex},
2721 FMFSource: nullptr);
2722 Value *LoadedResult = Builder.CreateLoad(
2723 Ty: HVC.getHvxTy(ElemTy: HVC.getIntTy(Width: 16), Pair: false), Ptr: ResultAlloca, Name: "temp_result");
2724 LLVM_DEBUG(dbgs() << " LoadedResult : " << *LoadedResult << "\n");
2725 LLVM_DEBUG(dbgs() << " Gather : " << *Gather << "\n");
2726 In.replaceAllUsesWith(V: LoadedResult);
2727 }
2728 }
2729 } else if (Qual == HvxIdioms::LLVM_Gather) {
2730 // Gather feeds into another gather
2731    errs() << " Unimplemented vgather to vgather sequence\n";
2732 return nullptr;
2733 } else
2734 llvm_unreachable("Unhandled Qual enum");
2735
2736 return Gather;
2737}
2738
2739// Go through all PHI incoming values and find the minimal alignment for the
2740// non-GEP members.
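// A hypothetical case: for
//   %p = phi ptr [ %arg, %entry ], [ %p.next, %loop ]
// where %p.next is a GEP based on %p, only %arg contributes, so the result
// is the known alignment of %arg (the GEP increments are considered
// separately in getAlignmentImpl).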
2741std::optional<uint64_t> HvxIdioms::getPHIBaseMinAlignment(Instruction &In,
2742 PHINode *PN) const {
2743 if (!PN)
2744 return std::nullopt;
2745
2746 SmallVector<Value *, 16> Worklist;
2747 SmallPtrSet<Value *, 16> Visited;
2748 uint64_t minPHIAlignment = Value::MaximumAlignment;
2749 Worklist.push_back(Elt: PN);
2750
2751 while (!Worklist.empty()) {
2752 Value *V = Worklist.back();
2753 Worklist.pop_back();
2754 if (!Visited.insert(Ptr: V).second)
2755 continue;
2756
2757 if (PHINode *PN = dyn_cast<PHINode>(Val: V)) {
2758 for (unsigned i = 0; i < PN->getNumIncomingValues(); ++i) {
2759 Worklist.push_back(Elt: PN->getIncomingValue(i));
2760 }
2761 } else if (isa<GetElementPtrInst>(Val: V)) {
2762 // Ignore geps for now.
2763 continue;
2764 } else {
2765 Align KnownAlign = getKnownAlignment(V, DL: HVC.DL, CxtI: &In, AC: &HVC.AC, DT: &HVC.DT);
2766 if (KnownAlign.value() < minPHIAlignment)
2767 minPHIAlignment = KnownAlign.value();
2768 }
2769 }
2770 if (minPHIAlignment != Value::MaximumAlignment)
2771 return minPHIAlignment;
2772 return std::nullopt;
2773}
2774
2775// Helper function to discover alignment for a ptr.
2776std::optional<uint64_t> HvxIdioms::getAlignment(Instruction &In,
2777 Value *ptr) const {
2778 SmallPtrSet<Value *, 16> Visited;
2779 return getAlignmentImpl(In, ptr, Visited);
2780}
2781
2782std::optional<uint64_t>
2783HvxIdioms::getAlignmentImpl(Instruction &In, Value *ptr,
2784 SmallPtrSet<Value *, 16> &Visited) const {
2785 LLVM_DEBUG(dbgs() << "[getAlignment] for : " << *ptr << "\n");
2786 // Prevent infinite recursion
2787 if (!Visited.insert(Ptr: ptr).second)
2788 return std::nullopt;
2789 // Try AssumptionCache.
2790 Align KnownAlign = getKnownAlignment(V: ptr, DL: HVC.DL, CxtI: &In, AC: &HVC.AC, DT: &HVC.DT);
2791 // This is the most formal and reliable source of information.
2792 if (KnownAlign.value() > 1) {
2793 LLVM_DEBUG(dbgs() << " VC align(" << KnownAlign.value() << ")\n");
2794 return KnownAlign.value();
2795 }
2796
2797 // If it is a PHI, try to iterate through its inputs.
2798 if (PHINode *PN = dyn_cast<PHINode>(Val: ptr)) {
2799 // See if we have a common base whose alignment we know.
2800 auto baseAlignmentOpt = getPHIBaseMinAlignment(In, PN);
2801 if (!baseAlignmentOpt)
2802 return std::nullopt;
2803
2804 uint64_t minBaseAlignment = *baseAlignmentOpt;
2805 // If it is 1, there is no point in looking any further.
2806 if (minBaseAlignment == 1)
2807 return 1;
2808 // Now see if all the other incoming PHI values are just loop-carried constants.
2809 uint64_t minPHIAlignment = minBaseAlignment;
2810 LLVM_DEBUG(dbgs() << " It is a PHI with(" << PN->getNumIncomingValues()
2811 << ")nodes and min base aligned to (" << minBaseAlignment
2812 << ")\n");
2813 for (unsigned i = 0; i < PN->getNumIncomingValues(); ++i) {
2814 Value *IV = PN->getIncomingValue(i);
2815 // We have already looked at all other values.
2816 if (!isa<GetElementPtrInst>(Val: IV))
2817 continue;
2818 uint64_t MemberAlignment = Value::MaximumAlignment;
2819 if (auto res = getAlignment(In&: *PN, ptr: IV))
2820 MemberAlignment = *res;
2821 else
2822 return std::nullopt;
2823 // Adjust total PHI alignment.
2824 if (minPHIAlignment > MemberAlignment)
2825 minPHIAlignment = MemberAlignment;
2826 }
2827 LLVM_DEBUG(dbgs() << " total PHI alignment(" << minPHIAlignment << ")\n");
2828 return minPHIAlignment;
2829 }
2830
2831 if (auto *GEP = dyn_cast<GetElementPtrInst>(Val: ptr)) {
2832 auto *GEPPtr = GEP->getPointerOperand();
2833 // Only handle the case where this is the induction variable with a constant
2834 // offset. The implicit assumption is that the induction variable is a PHI.
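// Illustrative example (assumption, not original text): with
//   %p.next = getelementptr i8, ptr %p, i32 256
// stepping the pointer PHI %p, accumulateConstantOffset yields 256; the PHI
// handling above then takes the minimum of that step and the base alignment
// (e.g. min(128, 256) = 128) as the alignment bound.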
2835 if (&In == GEPPtr) {
2836 APInt Offset(HVC.DL.getPointerSizeInBits(
2837 AS: GEPPtr->getType()->getPointerAddressSpace()),
2838 0);
2839 if (GEP->accumulateConstantOffset(DL: HVC.DL, Offset)) {
2840 LLVM_DEBUG(dbgs() << " Induction GEP with const step of ("
2841 << Offset.getZExtValue() << ")\n");
2842 return Offset.getZExtValue();
2843 }
2844 }
2845 }
2846
2847 return std::nullopt;
2848}
2849
2850Value *HvxIdioms::processMStore(Instruction &In) const {
2851 [[maybe_unused]] auto *InpTy =
2852 dyn_cast<VectorType>(Val: In.getOperand(i: 0)->getType());
2853 assert(InpTy && "Cannot handle non-vector type for llvm.masked.store");
2854
2855 LLVM_DEBUG(dbgs() << "\n[Process mstore](" << In << ")\n"
2856 << *In.getParent() << "\n");
2857 LLVM_DEBUG(dbgs() << " Input type(" << *InpTy << ") elements("
2858 << HVC.length(InpTy) << ") VecLen(" << HVC.getSizeOf(InpTy)
2859 << ") type(" << *InpTy->getElementType() << ") of size("
2860 << InpTy->getScalarSizeInBits() << ")bits\n");
2861 auto *CI = dyn_cast<CallBase>(Val: &In);
2862 assert(CI && "Expected llvm.masked.store to be a call");
2863 Align HaveAlign = CI->getParamAlign(ArgNo: 1).valueOrOne();
2864
2865 uint64_t KA = 1;
2866 if (auto res = getAlignment(In, ptr: In.getOperand(i: 1))) // ptr operand
2867 KA = *res;
2868 LLVM_DEBUG(dbgs() << " HaveAlign(" << HaveAlign.value() << ") KnownAlign("
2869 << KA << ")\n");
2870 // Normalize 0 -> ABI alignment of the stored value type (operand 0).
2871 Type *ValTy = In.getOperand(i: 0)->getType();
2872 Align EffA =
2873 (KA > 0) ? Align(KA) : Align(HVC.DL.getABITypeAlign(Ty: ValTy).value());
2874
2875 if (EffA < HaveAlign)
2876 return nullptr;
2877
2878 // Attach/replace the param attribute on pointer param #1.
2879 AttrBuilder AttrB(CI->getContext());
2880 AttrB.addAlignmentAttr(Align: EffA);
2881 CI->setAttributes(
2882 CI->getAttributes().addParamAttributes(C&: CI->getContext(), ArgNo: 1, B: AttrB));
2883 return CI;
2884}
2885
2886Value *HvxIdioms::processMLoad(Instruction &In) const {
2887 [[maybe_unused]] auto *InpTy = dyn_cast<VectorType>(Val: In.getType());
2888 assert(InpTy && "Cannot handle non-vector type for llvm.masked.load");
2889 LLVM_DEBUG(dbgs() << "\n[Process mload](" << In << ")\n"
2890 << *In.getParent() << "\n");
2891 LLVM_DEBUG(dbgs() << " Input type(" << *InpTy << ") elements("
2892 << HVC.length(InpTy) << ") VecLen(" << HVC.getSizeOf(InpTy)
2893 << ") type(" << *InpTy->getElementType() << ") of size("
2894 << InpTy->getScalarSizeInBits() << ")bits\n");
2895 auto *CI = dyn_cast<CallBase>(Val: &In);
2896 assert(CI && "Expected to be a call to llvm.masked.load");
2897 // The pointer is operand #0, and its param attribute index is also 0.
2898 Align HaveAlign = CI->getParamAlign(ArgNo: 0).valueOrOne();
2899
2900 // Compute best-known alignment KA from analysis.
2901 uint64_t KA = 1;
2902 if (auto res = getAlignment(In, ptr: In.getOperand(i: 0))) // ptr operand
2903 KA = *res;
2904
2905 // Normalize 0 -> ABI alignment of the loaded value type.
2906 Type *ValTy = In.getType();
2907 Align EffA =
2908 (KA > 0) ? Align(KA) : Align(HVC.DL.getABITypeAlign(Ty: ValTy).value());
2909 if (EffA < HaveAlign)
2910 return nullptr;
2911 LLVM_DEBUG(dbgs() << " HaveAlign(" << HaveAlign.value() << ") KnownAlign("
2912 << KA << ")\n");
2913
2914 // Attach/replace the param attribute on pointer param #0.
2915 AttrBuilder AttrB(CI->getContext());
2916 AttrB.addAlignmentAttr(Align: EffA);
2917 CI->setAttributes(
2918 CI->getAttributes().addParamAttributes(C&: CI->getContext(), ArgNo: 0, B: AttrB));
2919 return CI;
2920}
2921
2922auto HvxIdioms::processFxpMulChopped(IRBuilderBase &Builder, Instruction &In,
2923 const FxpOp &Op) const -> Value * {
2924 assert(Op.X.Val->getType() == Op.Y.Val->getType());
2925 auto *InpTy = cast<VectorType>(Val: Op.X.Val->getType());
2926 unsigned Width = InpTy->getScalarSizeInBits();
2927 bool Rounding = Op.RoundAt.has_value();
2928
2929 if (!Op.RoundAt || *Op.RoundAt == Op.Frac - 1) {
2930 // The fixed-point intrinsics do signed multiplication.
2931 if (Width == Op.Frac + 1 && Op.X.Sgn != Unsigned && Op.Y.Sgn != Unsigned) {
2932 Value *QMul = nullptr;
2933 if (Width == 16) {
2934 QMul = createMulQ15(Builder, X: Op.X, Y: Op.Y, Rounding);
2935 } else if (Width == 32) {
2936 QMul = createMulQ31(Builder, X: Op.X, Y: Op.Y, Rounding);
2937 }
2938 if (QMul != nullptr)
2939 return QMul;
2940 }
2941 }
2942
2943 assert(Width >= 32 || isPowerOf2_32(Width)); // Width <= 32 => Width is 2^n
2944 assert(Width < 32 || Width % 32 == 0); // Width > 32 => Width is 32*k
2945
2946 // If Width < 32, then it should really be 16.
2947 if (Width < 32) {
2948 if (Width < 16)
2949 return nullptr;
2950 // Getting here with Op.Frac == 0 isn't wrong, but it is suboptimal: here we
2951 // generate a full-precision product, which is unnecessary if there is
2952 // no shift.
2953 assert(Width == 16);
2954 assert(Op.Frac != 0 && "Unshifted mul should have been skipped");
2955 if (Op.Frac == 16) {
2956 // Multiply high
2957 if (Value *MulH = createMulH16(Builder, X: Op.X, Y: Op.Y))
2958 return MulH;
2959 }
2960 // Do full-precision multiply and shift.
2961 Value *Prod32 = createMul16(Builder, X: Op.X, Y: Op.Y);
2962 if (Rounding) {
2963 Value *RoundVal =
2964 ConstantInt::get(Ty: Prod32->getType(), V: 1ull << *Op.RoundAt);
2965 Prod32 = Builder.CreateAdd(LHS: Prod32, RHS: RoundVal, Name: "add");
2966 }
2967
2968 Value *ShiftAmt = ConstantInt::get(Ty: Prod32->getType(), V: Op.Frac);
2969 Value *Shifted = Op.X.Sgn == Signed || Op.Y.Sgn == Signed
2970 ? Builder.CreateAShr(LHS: Prod32, RHS: ShiftAmt, Name: "asr")
2971 : Builder.CreateLShr(LHS: Prod32, RHS: ShiftAmt, Name: "lsr");
2972 return Builder.CreateTrunc(V: Shifted, DestTy: InpTy, Name: "trn");
2973 }
2974
2975 // Width >= 32
2976
2977 // Break up the arguments Op.X and Op.Y into vectors of smaller widths
2978 // in preparation for doing the multiplication in 32-bit parts.
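// Illustrative sketch of the decomposition (assumption, not original text):
// for 64-bit elements split into 32-bit words, X = X1*2^32 + X0 and
// Y = Y1*2^32 + Y0, so
//   X*Y = X0*Y0 + (X0*Y1 + X1*Y0)*2^32 + X1*Y1*2^64,
// which is what createMulLong assembles from 32x32->64 partial products.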
2979 auto WordX = HVC.splitVectorElements(Builder, Vec: Op.X.Val, /*ToWidth=*/32);
2980 auto WordY = HVC.splitVectorElements(Builder, Vec: Op.Y.Val, /*ToWidth=*/32);
2981 auto WordP = createMulLong(Builder, WordX, SgnX: Op.X.Sgn, WordY, SgnY: Op.Y.Sgn);
2982
2983 auto *HvxWordTy = cast<VectorType>(Val: WordP.front()->getType());
2984
2985 // Add the optional rounding to the proper word.
2986 if (Op.RoundAt.has_value()) {
2987 Value *Zero = Constant::getNullValue(Ty: WordX[0]->getType());
2988 SmallVector<Value *> RoundV(WordP.size(), Zero);
2989 RoundV[*Op.RoundAt / 32] =
2990 ConstantInt::get(Ty: HvxWordTy, V: 1ull << (*Op.RoundAt % 32));
2991 WordP = createAddLong(Builder, WordX: WordP, WordY: RoundV);
2992 }
2993
2994 // TODO: factor this into a createRightShiftLong helper?
2995
2996 // Shift all products right by Op.Frac.
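// Illustrative example (assumption): with Op.Frac == 40, SkipWords == 1 and
// ShiftAmt == 8, each destination word combines the upper 24 bits of word
// Src with the low 8 bits of word Src+1 via the fshr funnel shift below;
// only the most significant word falls back to a plain arithmetic shift.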
2997 unsigned SkipWords = Op.Frac / 32;
2998 Constant *ShiftAmt = ConstantInt::get(Ty: HvxWordTy, V: Op.Frac % 32);
2999
3000 for (int Dst = 0, End = WordP.size() - SkipWords; Dst != End; ++Dst) {
3001 int Src = Dst + SkipWords;
3002 Value *Lo = WordP[Src];
3003 if (Src + 1 < End) {
3004 Value *Hi = WordP[Src + 1];
3005 WordP[Dst] = Builder.CreateIntrinsic(RetTy: HvxWordTy, ID: Intrinsic::fshr,
3006 Args: {Hi, Lo, ShiftAmt},
3007 /*FMFSource*/ nullptr, Name: "int");
3008 } else {
3009 // The shift of the most significant word.
3010 WordP[Dst] = Builder.CreateAShr(LHS: Lo, RHS: ShiftAmt, Name: "asr");
3011 }
3012 }
3013 if (SkipWords != 0)
3014 WordP.resize(N: WordP.size() - SkipWords);
3015
3016 return HVC.joinVectorElements(Builder, Values: WordP, ToType: Op.ResTy);
3017}
3018
3019auto HvxIdioms::createMulQ15(IRBuilderBase &Builder, SValue X, SValue Y,
3020 bool Rounding) const -> Value * {
3021 assert(X.Val->getType() == Y.Val->getType());
3022 assert(X.Val->getType()->getScalarType() == HVC.getIntTy(16));
3023 assert(HVC.HST.isHVXVectorType(EVT::getEVT(X.Val->getType(), false)));
3024
3025 // There is no non-rounding intrinsic for i16.
3026 if (!Rounding || X.Sgn == Unsigned || Y.Sgn == Unsigned)
3027 return nullptr;
3028
3029 auto V6_vmpyhvsrs = HVC.HST.getIntrinsicId(Opc: Hexagon::V6_vmpyhvsrs);
3030 return HVC.createHvxIntrinsic(Builder, IntID: V6_vmpyhvsrs, RetTy: X.Val->getType(),
3031 Args: {X.Val, Y.Val});
3032}
3033
3034auto HvxIdioms::createMulQ31(IRBuilderBase &Builder, SValue X, SValue Y,
3035 bool Rounding) const -> Value * {
3036 Type *InpTy = X.Val->getType();
3037 assert(InpTy == Y.Val->getType());
3038 assert(InpTy->getScalarType() == HVC.getIntTy(32));
3039 assert(HVC.HST.isHVXVectorType(EVT::getEVT(InpTy, false)));
3040
3041 if (X.Sgn == Unsigned || Y.Sgn == Unsigned)
3042 return nullptr;
3043
3044 auto V6_vmpyewuh = HVC.HST.getIntrinsicId(Opc: Hexagon::V6_vmpyewuh);
3045 auto V6_vmpyo_acc = Rounding
3046 ? HVC.HST.getIntrinsicId(Opc: Hexagon::V6_vmpyowh_rnd_sacc)
3047 : HVC.HST.getIntrinsicId(Opc: Hexagon::V6_vmpyowh_sacc);
3048 Value *V1 =
3049 HVC.createHvxIntrinsic(Builder, IntID: V6_vmpyewuh, RetTy: InpTy, Args: {X.Val, Y.Val});
3050 return HVC.createHvxIntrinsic(Builder, IntID: V6_vmpyo_acc, RetTy: InpTy,
3051 Args: {V1, X.Val, Y.Val});
3052}
3053
3054auto HvxIdioms::createAddCarry(IRBuilderBase &Builder, Value *X, Value *Y,
3055 Value *CarryIn) const
3056 -> std::pair<Value *, Value *> {
3057 assert(X->getType() == Y->getType());
3058 auto VecTy = cast<VectorType>(Val: X->getType());
3059 if (VecTy == HvxI32Ty && HVC.HST.useHVXV62Ops()) {
3060 SmallVector<Value *> Args = {X, Y};
3061 Intrinsic::ID AddCarry;
3062 if (CarryIn == nullptr && HVC.HST.useHVXV66Ops()) {
3063 AddCarry = HVC.HST.getIntrinsicId(Opc: Hexagon::V6_vaddcarryo);
3064 } else {
3065 AddCarry = HVC.HST.getIntrinsicId(Opc: Hexagon::V6_vaddcarry);
3066 if (CarryIn == nullptr)
3067 CarryIn = Constant::getNullValue(Ty: HVC.getBoolTy(ElemCount: HVC.length(Ty: VecTy)));
3068 Args.push_back(Elt: CarryIn);
3069 }
3070 Value *Ret = HVC.createHvxIntrinsic(Builder, IntID: AddCarry,
3071 /*RetTy=*/nullptr, Args);
3072 Value *Result = Builder.CreateExtractValue(Agg: Ret, Idxs: {0}, Name: "ext");
3073 Value *CarryOut = Builder.CreateExtractValue(Agg: Ret, Idxs: {1}, Name: "ext");
3074 return {Result, CarryOut};
3075 }
3076
3077 // In other cases, do a regular add, and unsigned compare-less-than.
3078 // The carry-out can originate in two places: adding the carry-in or adding
3079 // the two input values.
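// Illustrative example (assumption): with i32 lanes, X == 0xFFFFFFFF and a
// set carry-in, Result1 wraps to 0, so Result1 < X flags CarryOut1; adding Y
// may wrap once more, which Result2 < Y flags as CarryOut2. At most one of
// the two can be set per lane, so OR-ing them gives the carry-out.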
3080 Value *Result1 = X; // Result1 = X + CarryIn
3081 if (CarryIn != nullptr) {
3082 unsigned Width = VecTy->getScalarSizeInBits();
3083 uint32_t Mask = 1;
3084 if (Width < 32) {
3085 for (unsigned i = 0, e = 32 / Width; i != e; ++i)
3086 Mask = (Mask << Width) | 1;
3087 }
3088 auto V6_vandqrt = HVC.HST.getIntrinsicId(Opc: Hexagon::V6_vandqrt);
3089 Value *ValueIn =
3090 HVC.createHvxIntrinsic(Builder, IntID: V6_vandqrt, /*RetTy=*/nullptr,
3091 Args: {CarryIn, HVC.getConstInt(Val: Mask)});
3092 Result1 = Builder.CreateAdd(LHS: X, RHS: ValueIn, Name: "add");
3093 }
3094
3095 Value *CarryOut1 = Builder.CreateCmp(Pred: CmpInst::ICMP_ULT, LHS: Result1, RHS: X, Name: "cmp");
3096 Value *Result2 = Builder.CreateAdd(LHS: Result1, RHS: Y, Name: "add");
3097 Value *CarryOut2 = Builder.CreateCmp(Pred: CmpInst::ICMP_ULT, LHS: Result2, RHS: Y, Name: "cmp");
3098 return {Result2, Builder.CreateOr(LHS: CarryOut1, RHS: CarryOut2, Name: "orb")};
3099}
3100
3101auto HvxIdioms::createMul16(IRBuilderBase &Builder, SValue X, SValue Y) const
3102 -> Value * {
3103 Intrinsic::ID V6_vmpyh = 0;
3104 std::tie(args&: X, args&: Y) = canonSgn(X, Y);
3105
3106 if (X.Sgn == Signed) {
3107 V6_vmpyh = HVC.HST.getIntrinsicId(Opc: Hexagon::V6_vmpyhv);
3108 } else if (Y.Sgn == Signed) {
3109 // In vmpyhus the second operand is unsigned
3110 V6_vmpyh = HVC.HST.getIntrinsicId(Opc: Hexagon::V6_vmpyhus);
3111 } else {
3112 V6_vmpyh = HVC.HST.getIntrinsicId(Opc: Hexagon::V6_vmpyuhv);
3113 }
3114
3115 // i16*i16 -> i32 / interleaved
3116 Value *P =
3117 HVC.createHvxIntrinsic(Builder, IntID: V6_vmpyh, RetTy: HvxP32Ty, Args: {Y.Val, X.Val});
3118 // Deinterleave
3119 return HVC.vshuff(Builder, Val0: HVC.sublo(Builder, Val: P), Val1: HVC.subhi(Builder, Val: P));
3120}
3121
3122auto HvxIdioms::createMulH16(IRBuilderBase &Builder, SValue X, SValue Y) const
3123 -> Value * {
3124 Type *HvxI16Ty = HVC.getHvxTy(ElemTy: HVC.getIntTy(Width: 16), /*Pair=*/false);
3125
3126 if (HVC.HST.useHVXV69Ops()) {
3127 if (X.Sgn != Signed && Y.Sgn != Signed) {
3128 auto V6_vmpyuhvs = HVC.HST.getIntrinsicId(Opc: Hexagon::V6_vmpyuhvs);
3129 return HVC.createHvxIntrinsic(Builder, IntID: V6_vmpyuhvs, RetTy: HvxI16Ty,
3130 Args: {X.Val, Y.Val});
3131 }
3132 }
3133
3134 Type *HvxP16Ty = HVC.getHvxTy(ElemTy: HVC.getIntTy(Width: 16), /*Pair=*/true);
3135 Value *Pair16 =
3136 Builder.CreateBitCast(V: createMul16(Builder, X, Y), DestTy: HvxP16Ty, Name: "cst");
3137 unsigned Len = HVC.length(Ty: HvxP16Ty) / 2;
3138
3139 SmallVector<int, 128> PickOdd(Len);
3140 for (int i = 0; i != static_cast<int>(Len); ++i)
3141 PickOdd[i] = 2 * i + 1;
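// Illustrative example (assumption): on a 128-byte HVX vector Len == 64 and
// PickOdd == {1, 3, 5, ...}, so the shuffle below keeps the odd i16 lane of
// each 32-bit product, i.e. its high half -- the "multiply high" result.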
3142
3143 return Builder.CreateShuffleVector(
3144 V1: HVC.sublo(Builder, Val: Pair16), V2: HVC.subhi(Builder, Val: Pair16), Mask: PickOdd, Name: "shf");
3145}
3146
3147auto HvxIdioms::createMul32(IRBuilderBase &Builder, SValue X, SValue Y) const
3148 -> std::pair<Value *, Value *> {
3149 assert(X.Val->getType() == Y.Val->getType());
3150 assert(X.Val->getType() == HvxI32Ty);
3151
3152 Intrinsic::ID V6_vmpy_parts;
3153 std::tie(args&: X, args&: Y) = canonSgn(X, Y);
3154
3155 if (X.Sgn == Signed) {
3156 V6_vmpy_parts = Intrinsic::hexagon_V6_vmpyss_parts;
3157 } else if (Y.Sgn == Signed) {
3158 V6_vmpy_parts = Intrinsic::hexagon_V6_vmpyus_parts;
3159 } else {
3160 V6_vmpy_parts = Intrinsic::hexagon_V6_vmpyuu_parts;
3161 }
3162
3163 Value *Parts = HVC.createHvxIntrinsic(Builder, IntID: V6_vmpy_parts, RetTy: nullptr,
3164 Args: {X.Val, Y.Val}, ArgTys: {HvxI32Ty});
3165 Value *Hi = Builder.CreateExtractValue(Agg: Parts, Idxs: {0}, Name: "ext");
3166 Value *Lo = Builder.CreateExtractValue(Agg: Parts, Idxs: {1}, Name: "ext");
3167 return {Lo, Hi};
3168}
3169
3170auto HvxIdioms::createAddLong(IRBuilderBase &Builder, ArrayRef<Value *> WordX,
3171 ArrayRef<Value *> WordY) const
3172 -> SmallVector<Value *> {
3173 assert(WordX.size() == WordY.size());
3174 unsigned Idx = 0, Length = WordX.size();
3175 SmallVector<Value *> Sum(Length);
3176
3177 while (Idx != Length) {
3178 if (HVC.isZero(Val: WordX[Idx]))
3179 Sum[Idx] = WordY[Idx];
3180 else if (HVC.isZero(Val: WordY[Idx]))
3181 Sum[Idx] = WordX[Idx];
3182 else
3183 break;
3184 ++Idx;
3185 }
3186
3187 Value *Carry = nullptr;
3188 for (; Idx != Length; ++Idx) {
3189 std::tie(args&: Sum[Idx], args&: Carry) =
3190 createAddCarry(Builder, X: WordX[Idx], Y: WordY[Idx], CarryIn: Carry);
3191 }
3192
3193 // This drops the final carry beyond the highest word.
3194 return Sum;
3195}
3196
3197auto HvxIdioms::createMulLong(IRBuilderBase &Builder, ArrayRef<Value *> WordX,
3198 Signedness SgnX, ArrayRef<Value *> WordY,
3199 Signedness SgnY) const -> SmallVector<Value *> {
3200 SmallVector<SmallVector<Value *>> Products(WordX.size() + WordY.size());
3201
3202 // WordX[i] * WordY[j] produces words i+j and i+j+1 of the result,
3203 // that is, halves 2(i+j), 2(i+j)+1, 2(i+j)+2, 2(i+j)+3.
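// Illustrative example (assumption): for two 64-bit inputs split into words
// {X0, X1} and {Y0, Y1} (e == f == 2), the partial products land as
//   X0*Y0 -> Products[0] (lo), Products[1] (hi)
//   X0*Y1 and X1*Y0 -> Products[1] (lo), Products[2] (hi)
//   X1*Y1 -> Products[2] (lo), Products[3] (hi)
// and the reduction loop below adds each column down to a single word,
// propagating carries into the higher columns.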
3204 for (int i = 0, e = WordX.size(); i != e; ++i) {
3205 for (int j = 0, f = WordY.size(); j != f; ++j) {
3206 // Check the 4 halves that this multiplication can generate.
3207 Signedness SX = (i + 1 == e) ? SgnX : Unsigned;
3208 Signedness SY = (j + 1 == f) ? SgnY : Unsigned;
3209 auto [Lo, Hi] = createMul32(Builder, X: {.Val: WordX[i], .Sgn: SX}, Y: {.Val: WordY[j], .Sgn: SY});
3210 Products[i + j + 0].push_back(Elt: Lo);
3211 Products[i + j + 1].push_back(Elt: Hi);
3212 }
3213 }
3214
3215 Value *Zero = Constant::getNullValue(Ty: WordX[0]->getType());
3216
3217 auto pop_back_or_zero = [Zero](auto &Vector) -> Value * {
3218 if (Vector.empty())
3219 return Zero;
3220 auto Last = Vector.back();
3221 Vector.pop_back();
3222 return Last;
3223 };
3224
3225 for (int i = 0, e = Products.size(); i != e; ++i) {
3226 while (Products[i].size() > 1) {
3227 Value *Carry = nullptr; // no carry-in
3228 for (int j = i; j != e; ++j) {
3229 auto &ProdJ = Products[j];
3230 auto [Sum, CarryOut] = createAddCarry(Builder, X: pop_back_or_zero(ProdJ),
3231 Y: pop_back_or_zero(ProdJ), CarryIn: Carry);
3232 ProdJ.insert(I: ProdJ.begin(), Elt: Sum);
3233 Carry = CarryOut;
3234 }
3235 }
3236 }
3237
3238 SmallVector<Value *> WordP;
3239 for (auto &P : Products) {
3240 assert(P.size() == 1 && "Should have been added together");
3241 WordP.push_back(Elt: P.front());
3242 }
3243
3244 return WordP;
3245}
3246
3247auto HvxIdioms::run() -> bool {
3248 bool Changed = false;
3249
3250 for (BasicBlock &B : HVC.F) {
3251 for (auto It = B.rbegin(); It != B.rend(); ++It) {
3252 if (auto Fxm = matchFxpMul(In&: *It)) {
3253 Value *New = processFxpMul(In&: *It, Op: *Fxm);
3254 // Always report "changed" for now.
3255 Changed = true;
3256 if (!New)
3257 continue;
3258 bool StartOver = !isa<Instruction>(Val: New);
3259 It->replaceAllUsesWith(V: New);
3260 RecursivelyDeleteTriviallyDeadInstructions(V: &*It, TLI: &HVC.TLI);
3261 It = StartOver ? B.rbegin()
3262 : cast<Instruction>(Val: New)->getReverseIterator();
3263 Changed = true;
3264 } else if (matchGather(In&: *It)) {
3265 Value *New = processVGather(In&: *It);
3266 if (!New)
3267 continue;
3268 LLVM_DEBUG(dbgs() << " Gather : " << *New << "\n");
3269 // We replace the original intrinsic with a new pseudo call.
3270 It->eraseFromParent();
3271 It = cast<Instruction>(Val: New)->getReverseIterator();
3272 RecursivelyDeleteTriviallyDeadInstructions(V: &*It, TLI: &HVC.TLI);
3273 Changed = true;
3274 } else if (matchScatter(In&: *It)) {
3275 Value *New = processVScatter(In&: *It);
3276 if (!New)
3277 continue;
3278 LLVM_DEBUG(dbgs() << " Scatter : " << *New << "\n");
3279 // We replace the original intrinsic with a new pseudo call.
3280 It->eraseFromParent();
3281 It = cast<Instruction>(Val: New)->getReverseIterator();
3282 RecursivelyDeleteTriviallyDeadInstructions(V: &*It, TLI: &HVC.TLI);
3283 Changed = true;
3284 } else if (matchMLoad(In&: *It)) {
3285 Value *New = processMLoad(In&: *It);
3286 if (!New)
3287 continue;
3288 LLVM_DEBUG(dbgs() << " MLoad : " << *New << "\n");
3289 Changed = true;
3290 } else if (matchMStore(In&: *It)) {
3291 Value *New = processMStore(In&: *It);
3292 if (!New)
3293 continue;
3294 LLVM_DEBUG(dbgs() << " MStore : " << *New << "\n");
3295 Changed = true;
3296 }
3297 }
3298 }
3299
3300 return Changed;
3301}
3302
3303// --- End HvxIdioms
3304
3305auto HexagonVectorCombine::run() -> bool {
3306 if (DumpModule)
3307 dbgs() << "Module before HexagonVectorCombine\n" << *F.getParent();
3308
3309 bool Changed = false;
3310 if (HST.useHVXOps()) {
3311 if (VAEnabled)
3312 Changed |= AlignVectors(*this).run();
3313 if (VIEnabled)
3314 Changed |= HvxIdioms(*this).run();
3315 }
3316
3317 if (DumpModule) {
3318 dbgs() << "Module " << (Changed ? "(modified)" : "(unchanged)")
3319 << " after HexagonVectorCombine\n"
3320 << *F.getParent();
3321 }
3322 return Changed;
3323}
3324
3325auto HexagonVectorCombine::getIntTy(unsigned Width) const -> IntegerType * {
3326 return IntegerType::get(C&: F.getContext(), NumBits: Width);
3327}
3328
3329auto HexagonVectorCombine::getByteTy(int ElemCount) const -> Type * {
3330 assert(ElemCount >= 0);
3331 IntegerType *ByteTy = Type::getInt8Ty(C&: F.getContext());
3332 if (ElemCount == 0)
3333 return ByteTy;
3334 return VectorType::get(ElementType: ByteTy, NumElements: ElemCount, /*Scalable=*/false);
3335}
3336
3337auto HexagonVectorCombine::getBoolTy(int ElemCount) const -> Type * {
3338 assert(ElemCount >= 0);
3339 IntegerType *BoolTy = Type::getInt1Ty(C&: F.getContext());
3340 if (ElemCount == 0)
3341 return BoolTy;
3342 return VectorType::get(ElementType: BoolTy, NumElements: ElemCount, /*Scalable=*/false);
3343}
3344
3345auto HexagonVectorCombine::getConstInt(int Val, unsigned Width) const
3346 -> ConstantInt * {
3347 return ConstantInt::getSigned(Ty: getIntTy(Width), V: Val);
3348}
3349
3350auto HexagonVectorCombine::isZero(const Value *Val) const -> bool {
3351 if (auto *C = dyn_cast<Constant>(Val))
3352 return C->isZeroValue();
3353 return false;
3354}
3355
3356auto HexagonVectorCombine::getIntValue(const Value *Val) const
3357 -> std::optional<APInt> {
3358 if (auto *CI = dyn_cast<ConstantInt>(Val))
3359 return CI->getValue();
3360 return std::nullopt;
3361}
3362
3363auto HexagonVectorCombine::isUndef(const Value *Val) const -> bool {
3364 return isa<UndefValue>(Val);
3365}
3366
3367auto HexagonVectorCombine::isTrue(const Value *Val) const -> bool {
3368 return Val == ConstantInt::getTrue(Ty: Val->getType());
3369}
3370
3371auto HexagonVectorCombine::isFalse(const Value *Val) const -> bool {
3372 return isZero(Val);
3373}
3374
3375auto HexagonVectorCombine::getHvxTy(Type *ElemTy, bool Pair) const
3376 -> VectorType * {
3377 EVT ETy = EVT::getEVT(Ty: ElemTy, HandleUnknown: false);
3378 assert(ETy.isSimple() && "Invalid HVX element type");
3379 // Do not allow boolean types here: they don't have a fixed length.
3380 assert(HST.isHVXElementType(ETy.getSimpleVT(), /*IncludeBool=*/false) &&
3381 "Invalid HVX element type");
3382 unsigned HwLen = HST.getVectorLength();
3383 unsigned NumElems = (8 * HwLen) / ETy.getSizeInBits();
3384 return VectorType::get(ElementType: ElemTy, NumElements: Pair ? 2 * NumElems : NumElems,
3385 /*Scalable=*/false);
3386}
3387
3388auto HexagonVectorCombine::getSizeOf(const Value *Val, SizeKind Kind) const
3389 -> int {
3390 return getSizeOf(Ty: Val->getType(), Kind);
3391}
3392
3393auto HexagonVectorCombine::getSizeOf(const Type *Ty, SizeKind Kind) const
3394 -> int {
3395 auto *NcTy = const_cast<Type *>(Ty);
3396 switch (Kind) {
3397 case Store:
3398 return DL.getTypeStoreSize(Ty: NcTy).getFixedValue();
3399 case Alloc:
3400 return DL.getTypeAllocSize(Ty: NcTy).getFixedValue();
3401 }
3402 llvm_unreachable("Unhandled SizeKind enum");
3403}
3404
3405auto HexagonVectorCombine::getTypeAlignment(Type *Ty) const -> int {
3406 // The actual type may be shorter than the HVX vector, so determine
3407 // the alignment based on subtarget info.
3408 if (HST.isTypeForHVX(VecTy: Ty))
3409 return HST.getVectorLength();
3410 return DL.getABITypeAlign(Ty).value();
3411}
3412
3413auto HexagonVectorCombine::length(Value *Val) const -> size_t {
3414 return length(Ty: Val->getType());
3415}
3416
3417auto HexagonVectorCombine::length(Type *Ty) const -> size_t {
3418 auto *VecTy = dyn_cast<VectorType>(Val: Ty);
3419 assert(VecTy && "Must be a vector type");
3420 return VecTy->getElementCount().getFixedValue();
3421}
3422
3423auto HexagonVectorCombine::simplify(Value *V) const -> Value * {
3424 if (auto *In = dyn_cast<Instruction>(Val: V)) {
3425 SimplifyQuery Q(DL, &TLI, &DT, &AC, In);
3426 return simplifyInstruction(I: In, Q);
3427 }
3428 return nullptr;
3429}
3430
3431// Insert bytes [Start..Start+Length) of Src into Dst at byte Where.
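// Illustrative example (assumption): with Start == 4, Length == 8 and
// Where == 16, the routine copies Src bytes [4..12) over Dst bytes [16..24)
// and leaves the rest of Dst unchanged; both vectors are first resized to a
// common power-of-2 length so a single shuffle mask can do the insertion.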
3432auto HexagonVectorCombine::insertb(IRBuilderBase &Builder, Value *Dst,
3433 Value *Src, int Start, int Length,
3434 int Where) const -> Value * {
3435 assert(isByteVecTy(Dst->getType()) && isByteVecTy(Src->getType()));
3436 int SrcLen = getSizeOf(Val: Src);
3437 int DstLen = getSizeOf(Val: Dst);
3438 assert(0 <= Start && Start + Length <= SrcLen);
3439 assert(0 <= Where && Where + Length <= DstLen);
3440
3441 int P2Len = PowerOf2Ceil(A: SrcLen | DstLen);
3442 auto *Poison = PoisonValue::get(T: getByteTy());
3443 Value *P2Src = vresize(Builder, Val: Src, NewSize: P2Len, Pad: Poison);
3444 Value *P2Dst = vresize(Builder, Val: Dst, NewSize: P2Len, Pad: Poison);
3445
3446 SmallVector<int, 256> SMask(P2Len);
3447 for (int i = 0; i != P2Len; ++i) {
3448 // If i is in [Where, Where+Length), pick Src[Start+(i-Where)].
3449 // Otherwise, pick Dst[i].
3450 SMask[i] =
3451 (Where <= i && i < Where + Length) ? P2Len + Start + (i - Where) : i;
3452 }
3453
3454 Value *P2Insert = Builder.CreateShuffleVector(V1: P2Dst, V2: P2Src, Mask: SMask, Name: "shf");
3455 return vresize(Builder, Val: P2Insert, NewSize: DstLen, Pad: Poison);
3456}
3457
3458auto HexagonVectorCombine::vlalignb(IRBuilderBase &Builder, Value *Lo,
3459 Value *Hi, Value *Amt) const -> Value * {
3460 assert(Lo->getType() == Hi->getType() && "Argument type mismatch");
3461 if (isZero(Val: Amt))
3462 return Hi;
3463 int VecLen = getSizeOf(Val: Hi);
3464 if (auto IntAmt = getIntValue(Val: Amt))
3465 return getElementRange(Builder, Lo, Hi, Start: VecLen - IntAmt->getSExtValue(),
3466 Length: VecLen);
3467
3468 if (HST.isTypeForHVX(VecTy: Hi->getType())) {
3469 assert(static_cast<unsigned>(VecLen) == HST.getVectorLength() &&
3470 "Expecting an exact HVX type");
3471 return createHvxIntrinsic(Builder, IntID: HST.getIntrinsicId(Opc: Hexagon::V6_vlalignb),
3472 RetTy: Hi->getType(), Args: {Hi, Lo, Amt});
3473 }
3474
3475 if (VecLen == 4) {
3476 Value *Pair = concat(Builder, Vecs: {Lo, Hi});
3477 Value *Shift =
3478 Builder.CreateLShr(LHS: Builder.CreateShl(LHS: Pair, RHS: Amt, Name: "shl"), RHS: 32, Name: "lsr");
3479 Value *Trunc =
3480 Builder.CreateTrunc(V: Shift, DestTy: Type::getInt32Ty(C&: F.getContext()), Name: "trn");
3481 return Builder.CreateBitCast(V: Trunc, DestTy: Hi->getType(), Name: "cst");
3482 }
3483 if (VecLen == 8) {
3484 Value *Sub = Builder.CreateSub(LHS: getConstInt(Val: VecLen), RHS: Amt, Name: "sub");
3485 return vralignb(Builder, Lo, Hi, Amt: Sub);
3486 }
3487 llvm_unreachable("Unexpected vector length");
3488}
3489
3490auto HexagonVectorCombine::vralignb(IRBuilderBase &Builder, Value *Lo,
3491 Value *Hi, Value *Amt) const -> Value * {
3492 assert(Lo->getType() == Hi->getType() && "Argument type mismatch");
3493 if (isZero(Val: Amt))
3494 return Lo;
3495 int VecLen = getSizeOf(Val: Lo);
3496 if (auto IntAmt = getIntValue(Val: Amt))
3497 return getElementRange(Builder, Lo, Hi, Start: IntAmt->getSExtValue(), Length: VecLen);
3498
3499 if (HST.isTypeForHVX(VecTy: Lo->getType())) {
3500 assert(static_cast<unsigned>(VecLen) == HST.getVectorLength() &&
3501 "Expecting an exact HVX type");
3502 return createHvxIntrinsic(Builder, IntID: HST.getIntrinsicId(Opc: Hexagon::V6_valignb),
3503 RetTy: Lo->getType(), Args: {Hi, Lo, Amt});
3504 }
3505
3506 if (VecLen == 4) {
3507 Value *Pair = concat(Builder, Vecs: {Lo, Hi});
3508 Value *Shift = Builder.CreateLShr(LHS: Pair, RHS: Amt, Name: "lsr");
3509 Value *Trunc =
3510 Builder.CreateTrunc(V: Shift, DestTy: Type::getInt32Ty(C&: F.getContext()), Name: "trn");
3511 return Builder.CreateBitCast(V: Trunc, DestTy: Lo->getType(), Name: "cst");
3512 }
3513 if (VecLen == 8) {
3514 Type *Int64Ty = Type::getInt64Ty(C&: F.getContext());
3515 Value *Lo64 = Builder.CreateBitCast(V: Lo, DestTy: Int64Ty, Name: "cst");
3516 Value *Hi64 = Builder.CreateBitCast(V: Hi, DestTy: Int64Ty, Name: "cst");
3517 Value *Call = Builder.CreateIntrinsic(ID: Intrinsic::hexagon_S2_valignrb,
3518 Args: {Hi64, Lo64, Amt},
3519 /*FMFSource=*/nullptr, Name: "cup");
3520 return Builder.CreateBitCast(V: Call, DestTy: Lo->getType(), Name: "cst");
3521 }
3522 llvm_unreachable("Unexpected vector length");
3523}
3524
3525// Concatenates a sequence of vectors of the same type.
3526auto HexagonVectorCombine::concat(IRBuilderBase &Builder,
3527 ArrayRef<Value *> Vecs) const -> Value * {
3528 assert(!Vecs.empty());
3529 SmallVector<int, 256> SMask;
3530 std::vector<Value *> Work[2];
3531 int ThisW = 0, OtherW = 1;
3532
3533 Work[ThisW].assign(first: Vecs.begin(), last: Vecs.end());
3534 while (Work[ThisW].size() > 1) {
3535 auto *Ty = cast<VectorType>(Val: Work[ThisW].front()->getType());
3536 SMask.resize(N: length(Ty) * 2);
3537 std::iota(first: SMask.begin(), last: SMask.end(), value: 0);
3538
3539 Work[OtherW].clear();
3540 if (Work[ThisW].size() % 2 != 0)
3541 Work[ThisW].push_back(x: UndefValue::get(T: Ty));
3542 for (int i = 0, e = Work[ThisW].size(); i < e; i += 2) {
3543 Value *Joined = Builder.CreateShuffleVector(
3544 V1: Work[ThisW][i], V2: Work[ThisW][i + 1], Mask: SMask, Name: "shf");
3545 Work[OtherW].push_back(x: Joined);
3546 }
3547 std::swap(a&: ThisW, b&: OtherW);
3548 }
3549
3550 // Since there may have been some undefs appended to make shuffle operands
3551 // have the same type, perform the last shuffle to only pick the original
3552 // elements.
3553 SMask.resize(N: Vecs.size() * length(Ty: Vecs.front()->getType()));
3554 std::iota(first: SMask.begin(), last: SMask.end(), value: 0);
3555 Value *Total = Work[ThisW].front();
3556 return Builder.CreateShuffleVector(V: Total, Mask: SMask, Name: "shf");
3557}
3558
3559auto HexagonVectorCombine::vresize(IRBuilderBase &Builder, Value *Val,
3560 int NewSize, Value *Pad) const -> Value * {
3561 assert(isa<VectorType>(Val->getType()));
3562 auto *ValTy = cast<VectorType>(Val: Val->getType());
3563 assert(ValTy->getElementType() == Pad->getType());
3564
3565 int CurSize = length(Ty: ValTy);
3566 if (CurSize == NewSize)
3567 return Val;
3568 // Truncate?
3569 if (CurSize > NewSize)
3570 return getElementRange(Builder, Lo: Val, /*Ignored*/ Hi: Val, Start: 0, Length: NewSize);
3571 // Extend.
3572 SmallVector<int, 128> SMask(NewSize);
3573 std::iota(first: SMask.begin(), last: SMask.begin() + CurSize, value: 0);
3574 std::fill(first: SMask.begin() + CurSize, last: SMask.end(), value: CurSize);
3575 Value *PadVec = Builder.CreateVectorSplat(NumElts: CurSize, V: Pad, Name: "spt");
3576 return Builder.CreateShuffleVector(V1: Val, V2: PadVec, Mask: SMask, Name: "shf");
3577}
3578
3579auto HexagonVectorCombine::rescale(IRBuilderBase &Builder, Value *Mask,
3580 Type *FromTy, Type *ToTy) const -> Value * {
3581 // Mask is a vector <N x i1>, where each element corresponds to an
3582 // element of FromTy. Remap it so that each element will correspond
3583 // to an element of ToTy.
3584 assert(isa<VectorType>(Mask->getType()));
3585
3586 Type *FromSTy = FromTy->getScalarType();
3587 Type *ToSTy = ToTy->getScalarType();
3588 if (FromSTy == ToSTy)
3589 return Mask;
3590
3591 int FromSize = getSizeOf(Ty: FromSTy);
3592 int ToSize = getSizeOf(Ty: ToSTy);
3593 assert(FromSize % ToSize == 0 || ToSize % FromSize == 0);
3594
3595 auto *MaskTy = cast<VectorType>(Val: Mask->getType());
3596 int FromCount = length(Ty: MaskTy);
3597 int ToCount = (FromCount * FromSize) / ToSize;
3598 assert((FromCount * FromSize) % ToSize == 0);
3599
3600 auto *FromITy = getIntTy(Width: FromSize * 8);
3601 auto *ToITy = getIntTy(Width: ToSize * 8);
3602
3603 // Mask <N x i1> -> sext to <N x FromTy> -> bitcast to <M x ToTy> ->
3604 // -> trunc to <M x i1>.
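// Illustrative example (assumption): remapping a <64 x i1> mask from i16
// elements to i8 elements doubles the lane count: each i1 is sign-extended
// to i16, the bits are reinterpreted as pairs of i8, and the truncation back
// to i1 yields a <128 x i1> mask with every source lane duplicated.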
3605 Value *Ext = Builder.CreateSExt(
3606 V: Mask, DestTy: VectorType::get(ElementType: FromITy, NumElements: FromCount, /*Scalable=*/false), Name: "sxt");
3607 Value *Cast = Builder.CreateBitCast(
3608 V: Ext, DestTy: VectorType::get(ElementType: ToITy, NumElements: ToCount, /*Scalable=*/false), Name: "cst");
3609 return Builder.CreateTrunc(
3610 V: Cast, DestTy: VectorType::get(ElementType: getBoolTy(), NumElements: ToCount, /*Scalable=*/false), Name: "trn");
3611}
3612
3613// Bitcast to bytes, and return least significant bits.
3614auto HexagonVectorCombine::vlsb(IRBuilderBase &Builder, Value *Val) const
3615 -> Value * {
3616 Type *ScalarTy = Val->getType()->getScalarType();
3617 if (ScalarTy == getBoolTy())
3618 return Val;
3619
3620 Value *Bytes = vbytes(Builder, Val);
3621 if (auto *VecTy = dyn_cast<VectorType>(Val: Bytes->getType()))
3622 return Builder.CreateTrunc(V: Bytes, DestTy: getBoolTy(ElemCount: getSizeOf(Ty: VecTy)), Name: "trn");
3623 // If Bytes is a scalar (i.e. Val was a scalar byte), return i1, not
3624 // <1 x i1>.
3625 return Builder.CreateTrunc(V: Bytes, DestTy: getBoolTy(), Name: "trn");
3626}
3627
3628// Bitcast to bytes for non-bool. For bool, convert i1 -> i8.
3629auto HexagonVectorCombine::vbytes(IRBuilderBase &Builder, Value *Val) const
3630 -> Value * {
3631 Type *ScalarTy = Val->getType()->getScalarType();
3632 if (ScalarTy == getByteTy())
3633 return Val;
3634
3635 if (ScalarTy != getBoolTy())
3636 return Builder.CreateBitCast(V: Val, DestTy: getByteTy(ElemCount: getSizeOf(Val)), Name: "cst");
3637 // For bool, return a sext from i1 to i8.
3638 if (auto *VecTy = dyn_cast<VectorType>(Val: Val->getType()))
3639 return Builder.CreateSExt(V: Val, DestTy: VectorType::get(ElementType: getByteTy(), Other: VecTy), Name: "sxt");
3640 return Builder.CreateSExt(V: Val, DestTy: getByteTy(), Name: "sxt");
3641}
3642
3643auto HexagonVectorCombine::subvector(IRBuilderBase &Builder, Value *Val,
3644 unsigned Start, unsigned Length) const
3645 -> Value * {
3646 assert(Start + Length <= length(Val));
3647 return getElementRange(Builder, Lo: Val, /*Ignored*/ Hi: Val, Start, Length);
3648}
3649
3650auto HexagonVectorCombine::sublo(IRBuilderBase &Builder, Value *Val) const
3651 -> Value * {
3652 size_t Len = length(Val);
3653 assert(Len % 2 == 0 && "Length should be even");
3654 return subvector(Builder, Val, Start: 0, Length: Len / 2);
3655}
3656
3657auto HexagonVectorCombine::subhi(IRBuilderBase &Builder, Value *Val) const
3658 -> Value * {
3659 size_t Len = length(Val);
3660 assert(Len % 2 == 0 && "Length should be even");
3661 return subvector(Builder, Val, Start: Len / 2, Length: Len / 2);
3662}
3663
3664auto HexagonVectorCombine::vdeal(IRBuilderBase &Builder, Value *Val0,
3665 Value *Val1) const -> Value * {
3666 assert(Val0->getType() == Val1->getType());
3667 int Len = length(Val: Val0);
3668 SmallVector<int, 128> Mask(2 * Len);
3669
3670 for (int i = 0; i != Len; ++i) {
3671 Mask[i] = 2 * i; // Even
3672 Mask[i + Len] = 2 * i + 1; // Odd
3673 }
3674 return Builder.CreateShuffleVector(V1: Val0, V2: Val1, Mask, Name: "shf");
3675}
3676
3677auto HexagonVectorCombine::vshuff(IRBuilderBase &Builder, Value *Val0,
3678 Value *Val1) const -> Value * { //
3679 assert(Val0->getType() == Val1->getType());
3680 int Len = length(Val: Val0);
3681 SmallVector<int, 128> Mask(2 * Len);
3682
3683 for (int i = 0; i != Len; ++i) {
3684 Mask[2 * i + 0] = i; // Val0
3685 Mask[2 * i + 1] = i + Len; // Val1
3686 }
3687 return Builder.CreateShuffleVector(V1: Val0, V2: Val1, Mask, Name: "shf");
3688}
3689
3690auto HexagonVectorCombine::createHvxIntrinsic(IRBuilderBase &Builder,
3691 Intrinsic::ID IntID, Type *RetTy,
3692 ArrayRef<Value *> Args,
3693 ArrayRef<Type *> ArgTys,
3694 ArrayRef<Value *> MDSources) const
3695 -> Value * {
3696 auto getCast = [&](IRBuilderBase &Builder, Value *Val,
3697 Type *DestTy) -> Value * {
3698 Type *SrcTy = Val->getType();
3699 if (SrcTy == DestTy)
3700 return Val;
3701
3702 // Non-HVX type. It should be a scalar, and it should already have
3703 // a valid type.
3704 assert(HST.isTypeForHVX(SrcTy, /*IncludeBool=*/true));
3705
3706 Type *BoolTy = Type::getInt1Ty(C&: F.getContext());
3707 if (cast<VectorType>(Val: SrcTy)->getElementType() != BoolTy)
3708 return Builder.CreateBitCast(V: Val, DestTy, Name: "cst");
3709
3710 // Predicate HVX vector.
3711 unsigned HwLen = HST.getVectorLength();
3712 Intrinsic::ID TC = HwLen == 64 ? Intrinsic::hexagon_V6_pred_typecast
3713 : Intrinsic::hexagon_V6_pred_typecast_128B;
3714 return Builder.CreateIntrinsic(ID: TC, Types: {DestTy, Val->getType()}, Args: {Val},
3715 /*FMFSource=*/nullptr, Name: "cup");
3716 };
3717
3718 Function *IntrFn =
3719 Intrinsic::getOrInsertDeclaration(M: F.getParent(), id: IntID, Tys: ArgTys);
3720 FunctionType *IntrTy = IntrFn->getFunctionType();
3721
3722 SmallVector<Value *, 4> IntrArgs;
3723 for (int i = 0, e = Args.size(); i != e; ++i) {
3724 Value *A = Args[i];
3725 Type *T = IntrTy->getParamType(i);
3726 if (A->getType() != T) {
3727 IntrArgs.push_back(Elt: getCast(Builder, A, T));
3728 } else {
3729 IntrArgs.push_back(Elt: A);
3730 }
3731 }
3732 StringRef MaybeName = !IntrTy->getReturnType()->isVoidTy() ? "cup" : "";
3733 CallInst *Call = Builder.CreateCall(Callee: IntrFn, Args: IntrArgs, Name: MaybeName);
3734
3735 MemoryEffects ME = Call->getAttributes().getMemoryEffects();
3736 if (!ME.doesNotAccessMemory() && !ME.onlyAccessesInaccessibleMem())
3737 propagateMetadata(I: Call, VL: MDSources);
3738
3739 Type *CallTy = Call->getType();
3740 if (RetTy == nullptr || CallTy == RetTy)
3741 return Call;
3742 // Scalar types should have RetTy matching the call return type.
3743 assert(HST.isTypeForHVX(CallTy, /*IncludeBool=*/true));
3744 return getCast(Builder, Call, RetTy);
3745}
3746
3747auto HexagonVectorCombine::splitVectorElements(IRBuilderBase &Builder,
3748 Value *Vec,
3749 unsigned ToWidth) const
3750 -> SmallVector<Value *> {
3751 // Break a vector of wide elements into a series of vectors with narrow
3752 // elements:
3753 // (...c0:b0:a0, ...c1:b1:a1, ...c2:b2:a2, ...)
3754 // -->
3755 // (a0, a1, a2, ...) // lowest "ToWidth" bits
3756 // (b0, b1, b2, ...) // the next lowest...
3757 // (c0, c1, c2, ...) // ...
3758 // ...
3759 //
3760 // The number of elements in each resulting vector is the same as
3761 // in the original vector.
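// Illustrative example (assumption): splitting a <64 x i64> vector down to
// ToWidth == 32 yields two results of 64 x i32 each; Results[0] holds the
// low 32 bits of every element and Results[1] the high 32 bits, produced by
// a single vdeal of the bitcast input.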
3762
3763 auto *VecTy = cast<VectorType>(Val: Vec->getType());
3764 assert(VecTy->getElementType()->isIntegerTy());
3765 unsigned FromWidth = VecTy->getScalarSizeInBits();
3766 assert(isPowerOf2_32(ToWidth) && isPowerOf2_32(FromWidth));
3767 assert(ToWidth <= FromWidth && "Breaking up into wider elements?");
3768 unsigned NumResults = FromWidth / ToWidth;
3769
3770 SmallVector<Value *> Results(NumResults);
3771 Results[0] = Vec;
3772 unsigned Length = length(Ty: VecTy);
3773
3774 // Do it by splitting in half, since those operations correspond to deal
3775 // instructions.
3776 auto splitInHalf = [&](unsigned Begin, unsigned End, auto splitFunc) -> void {
3777 // Take V = Results[Begin], split it in L, H.
3778 // Store Results[Begin] = L, Results[(Begin+End)/2] = H
3779 // Call itself recursively split(Begin, Half), split(Half+1, End)
3780 if (Begin + 1 == End)
3781 return;
3782
3783 Value *Val = Results[Begin];
3784 unsigned Width = Val->getType()->getScalarSizeInBits();
3785
3786 auto *VTy = VectorType::get(ElementType: getIntTy(Width: Width / 2), NumElements: 2 * Length, Scalable: false);
3787 Value *VVal = Builder.CreateBitCast(V: Val, DestTy: VTy, Name: "cst");
3788
3789 Value *Res = vdeal(Builder, Val0: sublo(Builder, Val: VVal), Val1: subhi(Builder, Val: VVal));
3790
3791 unsigned Half = (Begin + End) / 2;
3792 Results[Begin] = sublo(Builder, Val: Res);
3793 Results[Half] = subhi(Builder, Val: Res);
3794
3795 splitFunc(Begin, Half, splitFunc);
3796 splitFunc(Half, End, splitFunc);
3797 };
3798
3799 splitInHalf(0, NumResults, splitInHalf);
3800 return Results;
3801}
3802
3803auto HexagonVectorCombine::joinVectorElements(IRBuilderBase &Builder,
3804 ArrayRef<Value *> Values,
3805 VectorType *ToType) const
3806 -> Value * {
3807 assert(ToType->getElementType()->isIntegerTy());
3808
3809 // If the list of values does not have power-of-2 elements, append copies
3810 // of the sign bit to it, to make the size be 2^n.
3811 // The reason for this is that the values will be joined in pairs, because
3812 // otherwise the shuffles will result in convoluted code. With pairwise
3813 // joins, the shuffles will hopefully be folded into a perfect shuffle.
3814 // The output will need to be sign-extended to a type with element width
3815 // being a power-of-2 anyways.
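// Illustrative example (assumption): joining three 32-bit word vectors into
// a vector of i128 elements needs four inputs, so a fourth vector holding
// the sign bits of the last one (arithmetic shift right by 31) is appended
// before the pairwise vshuff/bitcast reduction below.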
3816 SmallVector<Value *> Inputs(Values);
3817
3818 unsigned ToWidth = ToType->getScalarSizeInBits();
3819 unsigned Width = Inputs.front()->getType()->getScalarSizeInBits();
3820 assert(Width <= ToWidth);
3821 assert(isPowerOf2_32(Width) && isPowerOf2_32(ToWidth));
3822 unsigned Length = length(Ty: Inputs.front()->getType());
3823
3824 unsigned NeedInputs = ToWidth / Width;
3825 if (Inputs.size() != NeedInputs) {
3826 // Having too many inputs is ok: drop the high bits (usual wrap-around).
3827 // If there are too few, fill them with the sign bit.
3828 Value *Last = Inputs.back();
3829 Value *Sign = Builder.CreateAShr(
3830 LHS: Last, RHS: ConstantInt::get(Ty: Last->getType(), V: Width - 1), Name: "asr");
3831 Inputs.resize(N: NeedInputs, NV: Sign);
3832 }
3833
3834 while (Inputs.size() > 1) {
3835 Width *= 2;
3836 auto *VTy = VectorType::get(ElementType: getIntTy(Width), NumElements: Length, Scalable: false);
3837 for (int i = 0, e = Inputs.size(); i < e; i += 2) {
3838 Value *Res = vshuff(Builder, Val0: Inputs[i], Val1: Inputs[i + 1]);
3839 Inputs[i / 2] = Builder.CreateBitCast(V: Res, DestTy: VTy, Name: "cst");
3840 }
3841 Inputs.resize(N: Inputs.size() / 2);
3842 }
3843
3844 assert(Inputs.front()->getType() == ToType);
3845 return Inputs.front();
3846}
3847
3848auto HexagonVectorCombine::calculatePointerDifference(Value *Ptr0,
3849 Value *Ptr1) const
3850 -> std::optional<int> {
3851 // Try SCEV first.
3852 const SCEV *Scev0 = SE.getSCEV(V: Ptr0);
3853 const SCEV *Scev1 = SE.getSCEV(V: Ptr1);
3854 const SCEV *ScevDiff = SE.getMinusSCEV(LHS: Scev0, RHS: Scev1);
3855 if (auto *Const = dyn_cast<SCEVConstant>(Val: ScevDiff)) {
3856 APInt V = Const->getAPInt();
3857 if (V.isSignedIntN(N: 8 * sizeof(int)))
3858 return static_cast<int>(V.getSExtValue());
3859 }
3860
3861 struct Builder : IRBuilder<> {
3862 Builder(BasicBlock *B) : IRBuilder<>(B->getTerminator()) {}
3863 ~Builder() {
3864 for (Instruction *I : llvm::reverse(C&: ToErase))
3865 I->eraseFromParent();
3866 }
3867 SmallVector<Instruction *, 8> ToErase;
3868 };
3869
3870#define CallBuilder(B, F) \
3871 [&](auto &B_) { \
3872 Value *V = B_.F; \
3873 if (auto *I = dyn_cast<Instruction>(V)) \
3874 B_.ToErase.push_back(I); \
3875 return V; \
3876 }(B)
3877
3878 auto Simplify = [this](Value *V) {
3879 if (Value *S = simplify(V))
3880 return S;
3881 return V;
3882 };
3883
3884 auto StripBitCast = [](Value *V) {
3885 while (auto *C = dyn_cast<BitCastInst>(Val: V))
3886 V = C->getOperand(i_nocapture: 0);
3887 return V;
3888 };
3889
3890 Ptr0 = StripBitCast(Ptr0);
3891 Ptr1 = StripBitCast(Ptr1);
3892 if (!isa<GetElementPtrInst>(Val: Ptr0) || !isa<GetElementPtrInst>(Val: Ptr1))
3893 return std::nullopt;
3894
3895 auto *Gep0 = cast<GetElementPtrInst>(Val: Ptr0);
3896 auto *Gep1 = cast<GetElementPtrInst>(Val: Ptr1);
3897 if (Gep0->getPointerOperand() != Gep1->getPointerOperand())
3898 return std::nullopt;
3899 if (Gep0->getSourceElementType() != Gep1->getSourceElementType())
3900 return std::nullopt;
3901
3902 Builder B(Gep0->getParent());
3903 int Scale = getSizeOf(Ty: Gep0->getSourceElementType(), Kind: Alloc);
3904
3905 // FIXME: for now only check GEPs with a single index.
3906 if (Gep0->getNumOperands() != 2 || Gep1->getNumOperands() != 2)
3907 return std::nullopt;
3908
3909 Value *Idx0 = Gep0->getOperand(i_nocapture: 1);
3910 Value *Idx1 = Gep1->getOperand(i_nocapture: 1);
3911
3912 // First, try to simplify the subtraction directly.
3913 if (auto *Diff = dyn_cast<ConstantInt>(
3914 Val: Simplify(CallBuilder(B, CreateSub(Idx0, Idx1)))))
3915 return Diff->getSExtValue() * Scale;
3916
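// Fallback sketch (description of the intent, stated as an assumption):
// split each index into the bits computeKnownBits can pin down and the
// remaining unknown bits. If both the "unknown" difference and the "known"
// difference simplify to constants -- typically because the unknown parts
// are the same value and cancel -- their sum scaled by the element size is
// the pointer difference.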
3917 KnownBits Known0 = getKnownBits(V: Idx0, CtxI: Gep0);
3918 KnownBits Known1 = getKnownBits(V: Idx1, CtxI: Gep1);
3919 APInt Unknown = ~(Known0.Zero | Known0.One) | ~(Known1.Zero | Known1.One);
3920 if (Unknown.isAllOnes())
3921 return std::nullopt;
3922
3923 Value *MaskU = ConstantInt::get(Ty: Idx0->getType(), V: Unknown);
3924 Value *AndU0 = Simplify(CallBuilder(B, CreateAnd(Idx0, MaskU)));
3925 Value *AndU1 = Simplify(CallBuilder(B, CreateAnd(Idx1, MaskU)));
3926 Value *SubU = Simplify(CallBuilder(B, CreateSub(AndU0, AndU1)));
3927 int Diff0 = 0;
3928 if (auto *C = dyn_cast<ConstantInt>(Val: SubU)) {
3929 Diff0 = C->getSExtValue();
3930 } else {
3931 return std::nullopt;
3932 }
3933
3934 Value *MaskK = ConstantInt::get(Ty: MaskU->getType(), V: ~Unknown);
3935 Value *AndK0 = Simplify(CallBuilder(B, CreateAnd(Idx0, MaskK)));
3936 Value *AndK1 = Simplify(CallBuilder(B, CreateAnd(Idx1, MaskK)));
3937 Value *SubK = Simplify(CallBuilder(B, CreateSub(AndK0, AndK1)));
3938 int Diff1 = 0;
3939 if (auto *C = dyn_cast<ConstantInt>(Val: SubK)) {
3940 Diff1 = C->getSExtValue();
3941 } else {
3942 return std::nullopt;
3943 }
3944
3945 return (Diff0 + Diff1) * Scale;
3946
3947#undef CallBuilder
3948}
3949
3950auto HexagonVectorCombine::getNumSignificantBits(const Value *V,
3951 const Instruction *CtxI) const
3952 -> unsigned {
3953 return ComputeMaxSignificantBits(Op: V, DL, AC: &AC, CxtI: CtxI, DT: &DT);
3954}
3955
3956auto HexagonVectorCombine::getKnownBits(const Value *V,
3957 const Instruction *CtxI) const
3958 -> KnownBits {
3959 return computeKnownBits(V, DL, AC: &AC, CxtI: CtxI, DT: &DT);
3960}
3961
3962auto HexagonVectorCombine::isSafeToClone(const Instruction &In) const -> bool {
3963 if (In.mayHaveSideEffects() || In.isAtomic() || In.isVolatile() ||
3964 In.isFenceLike() || In.mayReadOrWriteMemory()) {
3965 return false;
3966 }
3967 if (isa<CallBase>(Val: In) || isa<AllocaInst>(Val: In))
3968 return false;
3969 return true;
3970}
3971
3972template <typename T>
3973auto HexagonVectorCombine::isSafeToMoveBeforeInBB(const Instruction &In,
3974 BasicBlock::const_iterator To,
3975 const T &IgnoreInsts) const
3976 -> bool {
3977 auto getLocOrNone =
3978 [this](const Instruction &I) -> std::optional<MemoryLocation> {
3979 if (const auto *II = dyn_cast<IntrinsicInst>(Val: &I)) {
3980 switch (II->getIntrinsicID()) {
3981 case Intrinsic::masked_load:
3982 return MemoryLocation::getForArgument(Call: II, ArgIdx: 0, TLI);
3983 case Intrinsic::masked_store:
3984 return MemoryLocation::getForArgument(Call: II, ArgIdx: 1, TLI);
3985 }
3986 }
3987 return MemoryLocation::getOrNone(Inst: &I);
3988 };
3989
3990 // The source and the destination must be in the same basic block.
3991 const BasicBlock &Block = *In.getParent();
3992 assert(Block.begin() == To || Block.end() == To || To->getParent() == &Block);
3993 // No PHIs.
3994 if (isa<PHINode>(Val: In) || (To != Block.end() && isa<PHINode>(Val: *To)))
3995 return false;
3996
3997 if (!mayHaveNonDefUseDependency(I: In))
3998 return true;
3999 bool MayWrite = In.mayWriteToMemory();
4000 auto MaybeLoc = getLocOrNone(In);
4001
4002 auto From = In.getIterator();
4003 if (From == To)
4004 return true;
4005 bool MoveUp = (To != Block.end() && To->comesBefore(Other: &In));
4006 auto Range =
4007 MoveUp ? std::make_pair(x&: To, y&: From) : std::make_pair(x: std::next(x: From), y&: To);
4008 for (auto It = Range.first; It != Range.second; ++It) {
4009 const Instruction &I = *It;
4010 if (llvm::is_contained(IgnoreInsts, &I))
4011 continue;
4012 // The assume intrinsic can be ignored.
4013 if (auto *II = dyn_cast<IntrinsicInst>(Val: &I)) {
4014 if (II->getIntrinsicID() == Intrinsic::assume)
4015 continue;
4016 }
4017 // Parts based on isSafeToMoveBefore from CodeMoverUtils.cpp.
4018 if (I.mayThrow())
4019 return false;
4020 if (auto *CB = dyn_cast<CallBase>(Val: &I)) {
4021 if (!CB->hasFnAttr(Kind: Attribute::WillReturn))
4022 return false;
4023 if (!CB->hasFnAttr(Kind: Attribute::NoSync))
4024 return false;
4025 }
4026 if (I.mayReadOrWriteMemory()) {
4027 auto MaybeLocI = getLocOrNone(I);
4028 if (MayWrite || I.mayWriteToMemory()) {
4029 if (!MaybeLoc || !MaybeLocI)
4030 return false;
4031 if (!AA.isNoAlias(*MaybeLoc, *MaybeLocI))
4032 return false;
4033 }
4034 }
4035 }
4036 return true;
4037}
4038
4039auto HexagonVectorCombine::isByteVecTy(Type *Ty) const -> bool {
4040 if (auto *VecTy = dyn_cast<VectorType>(Val: Ty))
4041 return VecTy->getElementType() == getByteTy();
4042 return false;
4043}
4044
4045auto HexagonVectorCombine::getElementRange(IRBuilderBase &Builder, Value *Lo,
4046 Value *Hi, int Start,
4047 int Length) const -> Value * {
4048 assert(0 <= Start && size_t(Start + Length) < length(Lo) + length(Hi));
4049 SmallVector<int, 128> SMask(Length);
4050 std::iota(first: SMask.begin(), last: SMask.end(), value: Start);
4051 return Builder.CreateShuffleVector(V1: Lo, V2: Hi, Mask: SMask, Name: "shf");
4052}
4053
4054// Pass management.
4055
4056namespace {
4057class HexagonVectorCombineLegacy : public FunctionPass {
4058public:
4059 static char ID;
4060
4061 HexagonVectorCombineLegacy() : FunctionPass(ID) {}
4062
4063 StringRef getPassName() const override { return "Hexagon Vector Combine"; }
4064
4065 void getAnalysisUsage(AnalysisUsage &AU) const override {
4066 AU.setPreservesCFG();
4067 AU.addRequired<AAResultsWrapperPass>();
4068 AU.addRequired<AssumptionCacheTracker>();
4069 AU.addRequired<DominatorTreeWrapperPass>();
4070 AU.addRequired<ScalarEvolutionWrapperPass>();
4071 AU.addRequired<TargetLibraryInfoWrapperPass>();
4072 AU.addRequired<TargetPassConfig>();
4073 FunctionPass::getAnalysisUsage(AU);
4074 }
4075
4076 bool runOnFunction(Function &F) override {
4077 if (skipFunction(F))
4078 return false;
4079 AliasAnalysis &AA = getAnalysis<AAResultsWrapperPass>().getAAResults();
4080 AssumptionCache &AC =
4081 getAnalysis<AssumptionCacheTracker>().getAssumptionCache(F);
4082 DominatorTree &DT = getAnalysis<DominatorTreeWrapperPass>().getDomTree();
4083 ScalarEvolution &SE = getAnalysis<ScalarEvolutionWrapperPass>().getSE();
4084 TargetLibraryInfo &TLI =
4085 getAnalysis<TargetLibraryInfoWrapperPass>().getTLI(F);
4086 auto &TM = getAnalysis<TargetPassConfig>().getTM<HexagonTargetMachine>();
4087 HexagonVectorCombine HVC(F, AA, AC, DT, SE, TLI, TM);
4088 return HVC.run();
4089 }
4090};
4091} // namespace
4092
4093char HexagonVectorCombineLegacy::ID = 0;
4094
4095INITIALIZE_PASS_BEGIN(HexagonVectorCombineLegacy, DEBUG_TYPE,
4096 "Hexagon Vector Combine", false, false)
4097INITIALIZE_PASS_DEPENDENCY(AAResultsWrapperPass)
4098INITIALIZE_PASS_DEPENDENCY(AssumptionCacheTracker)
4099INITIALIZE_PASS_DEPENDENCY(DominatorTreeWrapperPass)
4100INITIALIZE_PASS_DEPENDENCY(ScalarEvolutionWrapperPass)
4101INITIALIZE_PASS_DEPENDENCY(TargetLibraryInfoWrapperPass)
4102INITIALIZE_PASS_DEPENDENCY(TargetPassConfig)
4103INITIALIZE_PASS_END(HexagonVectorCombineLegacy, DEBUG_TYPE,
4104 "Hexagon Vector Combine", false, false)
4105
4106FunctionPass *llvm::createHexagonVectorCombineLegacyPass() {
4107 return new HexagonVectorCombineLegacy();
4108}
4109