//===- LoadStoreOpt.cpp ----------- Generic memory optimizations -*- C++ -*-==//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
/// \file
/// This file implements the LoadStoreOpt optimization pass.
//===----------------------------------------------------------------------===//

#include "llvm/CodeGen/GlobalISel/LoadStoreOpt.h"
#include "llvm/ADT/STLExtras.h"
#include "llvm/ADT/SmallPtrSet.h"
#include "llvm/ADT/Statistic.h"
#include "llvm/Analysis/AliasAnalysis.h"
#include "llvm/Analysis/MemoryLocation.h"
#include "llvm/Analysis/OptimizationRemarkEmitter.h"
#include "llvm/CodeGen/GlobalISel/GenericMachineInstrs.h"
#include "llvm/CodeGen/GlobalISel/LegalizerInfo.h"
#include "llvm/CodeGen/GlobalISel/MIPatternMatch.h"
#include "llvm/CodeGen/GlobalISel/Utils.h"
#include "llvm/CodeGen/LowLevelTypeUtils.h"
#include "llvm/CodeGen/MachineBasicBlock.h"
#include "llvm/CodeGen/MachineFrameInfo.h"
#include "llvm/CodeGen/MachineFunction.h"
#include "llvm/CodeGen/MachineInstr.h"
#include "llvm/CodeGen/MachineOptimizationRemarkEmitter.h"
#include "llvm/CodeGen/MachineRegisterInfo.h"
#include "llvm/CodeGen/Register.h"
#include "llvm/CodeGen/TargetLowering.h"
#include "llvm/CodeGen/TargetOpcodes.h"
#include "llvm/InitializePasses.h"
#include "llvm/Support/AtomicOrdering.h"
#include "llvm/Support/Casting.h"
#include "llvm/Support/Debug.h"
#include "llvm/Support/ErrorHandling.h"
#include <algorithm>

#define DEBUG_TYPE "loadstore-opt"

using namespace llvm;
using namespace ore;
using namespace MIPatternMatch;

STATISTIC(NumStoresMerged, "Number of stores merged");

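// The widest store, in bits, that this pass will try to form when merging
// (see initializeStoreMergeTargetInfo below).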
const unsigned MaxStoreSizeToForm = 128;

char LoadStoreOpt::ID = 0;
INITIALIZE_PASS_BEGIN(LoadStoreOpt, DEBUG_TYPE, "Generic memory optimizations",
                      false, false)
INITIALIZE_PASS_END(LoadStoreOpt, DEBUG_TYPE, "Generic memory optimizations",
                    false, false)

LoadStoreOpt::LoadStoreOpt(std::function<bool(const MachineFunction &)> F)
    : MachineFunctionPass(ID), DoNotRunPass(F) {}

LoadStoreOpt::LoadStoreOpt()
    : LoadStoreOpt([](const MachineFunction &) { return false; }) {}

void LoadStoreOpt::init(MachineFunction &MF) {
  this->MF = &MF;
  MRI = &MF.getRegInfo();
  AA = &getAnalysis<AAResultsWrapperPass>().getAAResults();
  TLI = MF.getSubtarget().getTargetLowering();
  LI = MF.getSubtarget().getLegalizerInfo();
  Builder.setMF(MF);
  IsPreLegalizer = !MF.getProperties().hasLegalized();
  InstsToErase.clear();
}

void LoadStoreOpt::getAnalysisUsage(AnalysisUsage &AU) const {
  AU.addRequired<AAResultsWrapperPass>();
  AU.setPreservesAll();
  getSelectionDAGFallbackAnalysisUsage(AU);
  MachineFunctionPass::getAnalysisUsage(AU);
}

BaseIndexOffset GISelAddressing::getPointerInfo(Register Ptr,
                                                MachineRegisterInfo &MRI) {
  BaseIndexOffset Info;
  Register PtrAddRHS;
  Register BaseReg;
  if (!mi_match(Ptr, MRI, m_GPtrAdd(m_Reg(BaseReg), m_Reg(PtrAddRHS)))) {
    Info.setBase(Ptr);
    Info.setOffset(0);
    return Info;
  }
  Info.setBase(BaseReg);
  auto RHSCst = getIConstantVRegValWithLookThrough(PtrAddRHS, MRI);
  if (RHSCst)
    Info.setOffset(RHSCst->Value.getSExtValue());

  // Just recognize a simple case for now. In future we'll need to match
  // indexing patterns for base + index + constant.
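  // For illustration (hypothetical virtual register names), a pointer built as
  //   %ptr:_(p0) = G_PTR_ADD %base, %idx
  // decomposes to Base = %base and Index = %idx, with Offset additionally set
  // when %idx is (a copy of) a G_CONSTANT.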
  Info.setIndex(PtrAddRHS);
  return Info;
}

bool GISelAddressing::aliasIsKnownForLoadStore(const MachineInstr &MI1,
                                               const MachineInstr &MI2,
                                               bool &IsAlias,
                                               MachineRegisterInfo &MRI) {
  auto *LdSt1 = dyn_cast<GLoadStore>(&MI1);
  auto *LdSt2 = dyn_cast<GLoadStore>(&MI2);
  if (!LdSt1 || !LdSt2)
    return false;

  BaseIndexOffset BasePtr0 = getPointerInfo(LdSt1->getPointerReg(), MRI);
  BaseIndexOffset BasePtr1 = getPointerInfo(LdSt2->getPointerReg(), MRI);

  if (!BasePtr0.getBase().isValid() || !BasePtr1.getBase().isValid())
    return false;

  LocationSize Size1 = LdSt1->getMemSize();
  LocationSize Size2 = LdSt2->getMemSize();

  int64_t PtrDiff;
  if (BasePtr0.getBase() == BasePtr1.getBase() && BasePtr0.hasValidOffset() &&
      BasePtr1.hasValidOffset()) {
    PtrDiff = BasePtr1.getOffset() - BasePtr0.getOffset();
    // If the size of the memory access is unknown, do not use it in the
    // analysis. One example of an unknown-size memory access is a load/store
    // of a scalable vector object on the stack.
    // BasePtr1 is PtrDiff away from BasePtr0. We can only decide whether the
    // two accesses alias in the following situations:
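    // For example, PtrDiff = 8 with a known 4-byte first access proves the
    // accesses are disjoint (4 <= 8), so IsAlias is set to false below.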
    if (PtrDiff >= 0 && Size1.hasValue() && !Size1.isScalable()) {
      // [----BasePtr0----]
      //                         [---BasePtr1--]
      // ========PtrDiff========>
      IsAlias = !((int64_t)Size1.getValue() <= PtrDiff);
      return true;
    }
    if (PtrDiff < 0 && Size2.hasValue() && !Size2.isScalable()) {
      //                     [----BasePtr0----]
      // [---BasePtr1--]
      // =====(-PtrDiff)====>
      IsAlias = !((PtrDiff + (int64_t)Size2.getValue()) <= 0);
      return true;
    }
    return false;
  }

  // If both BasePtr0 and BasePtr1 are FrameIndexes, we will not be
  // able to calculate their relative offset if at least one arises
  // from an alloca. However, these allocas cannot overlap and we
  // can infer there is no alias.
  auto *Base0Def = getDefIgnoringCopies(BasePtr0.getBase(), MRI);
  auto *Base1Def = getDefIgnoringCopies(BasePtr1.getBase(), MRI);
  if (!Base0Def || !Base1Def)
    return false; // Couldn't tell anything.

  if (Base0Def->getOpcode() != Base1Def->getOpcode())
    return false;

  if (Base0Def->getOpcode() == TargetOpcode::G_FRAME_INDEX) {
    MachineFrameInfo &MFI = Base0Def->getMF()->getFrameInfo();
    // Distinct frame indexes refer to distinct stack objects. Unless both are
    // fixed objects (whose slots may be assigned overlapping offsets), the
    // objects cannot overlap, so there is no alias.
    if (Base0Def != Base1Def &&
        (!MFI.isFixedObjectIndex(Base0Def->getOperand(1).getIndex()) ||
         !MFI.isFixedObjectIndex(Base1Def->getOperand(1).getIndex()))) {
      IsAlias = false;
      return true;
    }
  }


  // This implementation is a lot more primitive than the SDAG one for now.
  // FIXME: what about constant pools?
  if (Base0Def->getOpcode() == TargetOpcode::G_GLOBAL_VALUE) {
    auto GV0 = Base0Def->getOperand(1).getGlobal();
    auto GV1 = Base1Def->getOperand(1).getGlobal();
    if (GV0 != GV1) {
      IsAlias = false;
      return true;
    }
  }

  // Can't tell anything about aliasing.
  return false;
}

bool GISelAddressing::instMayAlias(const MachineInstr &MI,
                                   const MachineInstr &Other,
                                   MachineRegisterInfo &MRI,
                                   AliasAnalysis *AA) {
  struct MemUseCharacteristics {
    bool IsVolatile;
    bool IsAtomic;
    Register BasePtr;
    int64_t Offset;
    LocationSize NumBytes;
    MachineMemOperand *MMO;
  };

  auto getCharacteristics =
      [&](const MachineInstr *MI) -> MemUseCharacteristics {
    if (const auto *LS = dyn_cast<GLoadStore>(MI)) {
      Register BaseReg;
      int64_t Offset = 0;
      // No pre/post-inc addressing modes are considered here, unlike in SDAG.
      if (!mi_match(LS->getPointerReg(), MRI,
                    m_GPtrAdd(m_Reg(BaseReg), m_ICst(Offset)))) {
        BaseReg = LS->getPointerReg();
        Offset = 0;
      }

      LocationSize Size = LS->getMMO().getSize();
      return {LS->isVolatile(),       LS->isAtomic(), BaseReg,
              Offset /*base offset*/, Size,           &LS->getMMO()};
    }
    // FIXME: support recognizing lifetime instructions.
    // Default.
    return {false /*isvolatile*/,
            /*isAtomic*/ false,
            Register(),
            (int64_t)0 /*offset*/,
            LocationSize::beforeOrAfterPointer() /*size*/,
            (MachineMemOperand *)nullptr};
  };
  MemUseCharacteristics MUC0 = getCharacteristics(&MI),
                        MUC1 = getCharacteristics(&Other);

  // If they are to the same address, then they must be aliases.
  if (MUC0.BasePtr.isValid() && MUC0.BasePtr == MUC1.BasePtr &&
      MUC0.Offset == MUC1.Offset)
    return true;

  // If they are both volatile then they cannot be reordered.
  if (MUC0.IsVolatile && MUC1.IsVolatile)
    return true;

  // Be conservative about atomics for the moment
  // TODO: This is way overconservative for unordered atomics (see D66309)
  if (MUC0.IsAtomic && MUC1.IsAtomic)
    return true;

  // If one operation reads from invariant memory, and the other may store, they
  // cannot alias.
  if (MUC0.MMO && MUC1.MMO) {
    if ((MUC0.MMO->isInvariant() && MUC1.MMO->isStore()) ||
        (MUC1.MMO->isInvariant() && MUC0.MMO->isStore()))
      return false;
  }

  // If NumBytes is scalable and offset is not 0, conservatively return may
  // alias
  if ((MUC0.NumBytes.isScalable() && MUC0.Offset != 0) ||
      (MUC1.NumBytes.isScalable() && MUC1.Offset != 0))
    return true;

  const bool BothNotScalable =
      !MUC0.NumBytes.isScalable() && !MUC1.NumBytes.isScalable();

  // Try to prove that there is aliasing, or that there is no aliasing. Either
  // way, we can return now. If nothing can be proved, proceed with more tests.
  bool IsAlias;
  if (BothNotScalable &&
      GISelAddressing::aliasIsKnownForLoadStore(MI, Other, IsAlias, MRI))
    return IsAlias;

  // The following all rely on MMO0 and MMO1 being valid.
  if (!MUC0.MMO || !MUC1.MMO)
    return true;

  // FIXME: port the alignment based alias analysis from SDAG's isAlias().
  int64_t SrcValOffset0 = MUC0.MMO->getOffset();
  int64_t SrcValOffset1 = MUC1.MMO->getOffset();
  LocationSize Size0 = MUC0.NumBytes;
  LocationSize Size1 = MUC1.NumBytes;
  if (AA && MUC0.MMO->getValue() && MUC1.MMO->getValue() && Size0.hasValue() &&
      Size1.hasValue()) {
    // Use alias analysis information.
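    // Worked example (hypothetical numbers): two fixed 8-byte accesses at MMO
    // offsets 16 and 24 from the same IR value give MinOffset = 16,
    // Overlap0 = 8 and Overlap1 = 16, i.e. each location is extended back to
    // the lowest offset before being handed to alias analysis.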
    int64_t MinOffset = std::min(SrcValOffset0, SrcValOffset1);
    int64_t Overlap0 =
        Size0.getValue().getKnownMinValue() + SrcValOffset0 - MinOffset;
    int64_t Overlap1 =
        Size1.getValue().getKnownMinValue() + SrcValOffset1 - MinOffset;
    LocationSize Loc0 =
        Size0.isScalable() ? Size0 : LocationSize::precise(Overlap0);
    LocationSize Loc1 =
        Size1.isScalable() ? Size1 : LocationSize::precise(Overlap1);

    if (AA->isNoAlias(
            MemoryLocation(MUC0.MMO->getValue(), Loc0, MUC0.MMO->getAAInfo()),
            MemoryLocation(MUC1.MMO->getValue(), Loc1, MUC1.MMO->getAAInfo())))
      return false;
  }

  // Otherwise we have to assume they alias.
  return true;
}

/// Returns true if the instruction creates an unavoidable hazard that
/// forces a boundary between store merge candidates.
static bool isInstHardMergeHazard(MachineInstr &MI) {
  return MI.hasUnmodeledSideEffects() || MI.hasOrderedMemoryRef();
}

bool LoadStoreOpt::mergeStores(SmallVectorImpl<GStore *> &StoresToMerge) {
  // Try to merge all the stores in the vector, splitting into separate segments
  // as necessary.
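  // For example, given seven adjacent s8 stores on a target where s32 and s16
  // stores are legal, the first pass of the loop below merges four of them
  // into one s32 store and the next pass merges two more into an s16 store,
  // leaving the last s8 store untouched. (Illustrative only; the actual widths
  // depend on the legalizer rules queried below.)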
  assert(StoresToMerge.size() > 1 && "Expected multiple stores to merge");
  LLT OrigTy = MRI->getType(StoresToMerge[0]->getValueReg());
  LLT PtrTy = MRI->getType(StoresToMerge[0]->getPointerReg());
  unsigned AS = PtrTy.getAddressSpace();
  // Ensure the legal store info is computed for this address space.
  initializeStoreMergeTargetInfo(AS);
  const auto &LegalSizes = LegalStoreSizes[AS];

#ifndef NDEBUG
  for (auto *StoreMI : StoresToMerge)
    assert(MRI->getType(StoreMI->getValueReg()) == OrigTy);
#endif

  bool AnyMerged = false;
  do {
    unsigned NumPow2 = llvm::bit_floor(StoresToMerge.size());
    unsigned MaxSizeBits = NumPow2 * OrigTy.getSizeInBits().getFixedValue();
    // Compute the biggest store we can generate to handle the number of stores.
    unsigned MergeSizeBits;
    for (MergeSizeBits = MaxSizeBits; MergeSizeBits > 1; MergeSizeBits /= 2) {
      LLT StoreTy = LLT::scalar(MergeSizeBits);
      EVT StoreEVT =
          getApproximateEVTForLLT(StoreTy, MF->getFunction().getContext());
      if (LegalSizes.size() > MergeSizeBits && LegalSizes[MergeSizeBits] &&
          TLI->canMergeStoresTo(AS, StoreEVT, *MF) &&
          (TLI->isTypeLegal(StoreEVT)))
        break; // We can generate a MergeSize bits store.
    }
    if (MergeSizeBits <= OrigTy.getSizeInBits())
      return AnyMerged; // No greater merge.

    unsigned NumStoresToMerge = MergeSizeBits / OrigTy.getSizeInBits();
    // Perform the actual merging.
    SmallVector<GStore *, 8> SingleMergeStores(
        StoresToMerge.begin(), StoresToMerge.begin() + NumStoresToMerge);
    AnyMerged |= doSingleStoreMerge(SingleMergeStores);
    StoresToMerge.erase(StoresToMerge.begin(),
                        StoresToMerge.begin() + NumStoresToMerge);
  } while (StoresToMerge.size() > 1);
  return AnyMerged;
}

bool LoadStoreOpt::isLegalOrBeforeLegalizer(const LegalityQuery &Query,
                                            MachineFunction &MF) const {
  auto Action = LI->getAction(Query).Action;
  // If the instruction is unsupported, it can't be legalized at all.
  if (Action == LegalizeActions::Unsupported)
    return false;
  return IsPreLegalizer || Action == LegalizeAction::Legal;
}

bool LoadStoreOpt::doSingleStoreMerge(SmallVectorImpl<GStore *> &Stores) {
  assert(Stores.size() > 1);
  // We know that all the stores are consecutive and there are no aliasing
  // operations in the range. However, the values that are being stored may be
  // generated anywhere before each store. To ensure we have the values
  // available, we materialize the wide value and new store at the place of the
  // final store in the merge sequence.
  GStore *FirstStore = Stores[0];
  const unsigned NumStores = Stores.size();
  LLT SmallTy = MRI->getType(FirstStore->getValueReg());
  LLT WideValueTy =
      LLT::scalar(NumStores * SmallTy.getSizeInBits().getFixedValue());

  // For each store, compute pairwise merged debug locs.
  DebugLoc MergedLoc = Stores.front()->getDebugLoc();
  for (auto *Store : drop_begin(Stores))
    MergedLoc = DebugLoc::getMergedLocation(MergedLoc, Store->getDebugLoc());

  Builder.setInstr(*Stores.back());
  Builder.setDebugLoc(MergedLoc);

  // If all of the store values are constants, then create a wide constant
  // directly. Otherwise, we need to generate some instructions to merge the
  // existing values together into a wider type.
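  // For example (illustrative MIR on a little-endian target), two constant s8
  // stores to adjacent addresses:
  //   G_STORE %c0(s8), %p     ; %c0 = G_CONSTANT i8 0x11, lowest address
  //   G_STORE %c1(s8), %p1    ; %c1 = G_CONSTANT i8 0x22, %p1 = %p + 1
  // are rebuilt at the position of the final store as:
  //   %wide:_(s16) = G_CONSTANT i16 0x2211
  //   G_STORE %wide(s16), %p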
  SmallVector<APInt, 8> ConstantVals;
  for (auto *Store : Stores) {
    auto MaybeCst =
        getIConstantVRegValWithLookThrough(Store->getValueReg(), *MRI);
    if (!MaybeCst) {
      ConstantVals.clear();
      break;
    }
    ConstantVals.emplace_back(MaybeCst->Value);
  }

  Register WideReg;
  auto *WideMMO =
      MF->getMachineMemOperand(&FirstStore->getMMO(), 0, WideValueTy);
  if (ConstantVals.empty()) {
    // Mimic the SDAG behaviour here and don't try to do anything for unknown
    // values. In future, we should also support the cases of loads and
    // extracted vector elements.
    return false;
  }

  assert(ConstantVals.size() == NumStores);
  // Check if our wide constant is legal.
  if (!isLegalOrBeforeLegalizer({TargetOpcode::G_CONSTANT, {WideValueTy}},
                                *MF))
    return false;
  APInt WideConst(WideValueTy.getSizeInBits(), 0);
  for (unsigned Idx = 0; Idx < ConstantVals.size(); ++Idx) {
    // Insert the smaller constant into the corresponding position in the
    // wider one.
    WideConst.insertBits(ConstantVals[Idx], Idx * SmallTy.getSizeInBits());
  }
  WideReg = Builder.buildConstant(WideValueTy, WideConst).getReg(0);
  auto NewStore =
      Builder.buildStore(WideReg, FirstStore->getPointerReg(), *WideMMO);
  (void)NewStore;
  LLVM_DEBUG(dbgs() << "Merged " << Stores.size()
                    << " stores into merged store: " << *NewStore);
  LLVM_DEBUG(for (auto *MI : Stores) dbgs() << "  " << *MI;);
  NumStoresMerged += Stores.size();

  MachineOptimizationRemarkEmitter MORE(*MF, nullptr);
  MORE.emit([&]() {
    MachineOptimizationRemark R(DEBUG_TYPE, "MergedStore",
                                FirstStore->getDebugLoc(),
                                FirstStore->getParent());
    R << "Merged " << NV("NumMerged", Stores.size()) << " stores of "
      << NV("OrigWidth", SmallTy.getSizeInBytes())
      << " bytes into a single store of "
      << NV("NewWidth", WideValueTy.getSizeInBytes()) << " bytes";
    return R;
  });

  InstsToErase.insert_range(Stores);
  return true;
}

bool LoadStoreOpt::processMergeCandidate(StoreMergeCandidate &C) {
  if (C.Stores.size() < 2) {
    C.reset();
    return false;
  }

  LLVM_DEBUG(dbgs() << "Checking store merge candidate with " << C.Stores.size()
                    << " stores, starting with " << *C.Stores[0]);
  // We know that the stores in the candidate are adjacent.
  // Now we need to check if any potential aliasing instructions recorded
  // during the search alias with load/stores added to the candidate after.
  // For example, if we have the candidate:
  //   C.Stores = [ST1, ST2, ST3, ST4]
  // and after seeing ST2 we saw a load LD1, which did not alias with ST1 or
  // ST2, then we would have recorded it into the PotentialAliases structure
  // with the associated index value of "1". Then we see ST3 and ST4 and add
  // them to the candidate group. We know that LD1 does not alias with ST1 or
  // ST2, since we already did that check. However we don't yet know if it
  // may alias ST3 and ST4, so we perform those checks now.
  SmallVector<GStore *> StoresToMerge;

  auto DoesStoreAliasWithPotential = [&](unsigned Idx, GStore &CheckStore) {
    for (auto AliasInfo : reverse(C.PotentialAliases)) {
      MachineInstr *PotentialAliasOp = AliasInfo.first;
      unsigned PreCheckedIdx = AliasInfo.second;
      if (Idx < PreCheckedIdx) {
        // Once our store index is lower than the index associated with the
        // potential alias, we know that we've already checked for this alias
        // and all of the earlier potential aliases too.
        return false;
      }
      // Need to check this alias.
      if (GISelAddressing::instMayAlias(CheckStore, *PotentialAliasOp, *MRI,
                                        AA)) {
        LLVM_DEBUG(dbgs() << "Potential alias " << *PotentialAliasOp
                          << " detected\n");
        return true;
      }
    }
    return false;
  };
  // Start from the last store in the group, and check if it aliases with any
  // of the potential aliasing operations in the list.
  for (int StoreIdx = C.Stores.size() - 1; StoreIdx >= 0; --StoreIdx) {
    auto *CheckStore = C.Stores[StoreIdx];
    if (DoesStoreAliasWithPotential(StoreIdx, *CheckStore))
      continue;
    StoresToMerge.emplace_back(CheckStore);
  }

  LLVM_DEBUG(dbgs() << StoresToMerge.size()
                    << " stores remaining after alias checks. Merging...\n");

  // Now we've checked for aliasing hazards, merge any stores left.
  C.reset();
  if (StoresToMerge.size() < 2)
    return false;
  return mergeStores(StoresToMerge);
}

bool LoadStoreOpt::operationAliasesWithCandidate(MachineInstr &MI,
                                                 StoreMergeCandidate &C) {
  if (C.Stores.empty())
    return false;
  return llvm::any_of(C.Stores, [&](MachineInstr *OtherMI) {
    return instMayAlias(MI, *OtherMI, *MRI, AA);
  });
}

void LoadStoreOpt::StoreMergeCandidate::addPotentialAlias(MachineInstr &MI) {
  PotentialAliases.emplace_back(std::make_pair(&MI, Stores.size() - 1));
}

bool LoadStoreOpt::addStoreToCandidate(GStore &StoreMI,
                                       StoreMergeCandidate &C) {
  // Check if the given store writes to an adjacent address, and other
  // requirements.
  LLT ValueTy = MRI->getType(StoreMI.getValueReg());
  LLT PtrTy = MRI->getType(StoreMI.getPointerReg());

  // Only handle scalars.
  if (!ValueTy.isScalar())
    return false;

  // Don't allow truncating stores for now.
  if (StoreMI.getMemSizeInBits() != ValueTy.getSizeInBits())
    return false;

  // Avoid adding volatile or ordered stores to the candidate. We already have
  // a check for this in instMayAlias(), but that only gets called later,
  // between potential aliasing hazards.
  if (!StoreMI.isSimple())
    return false;

  Register StoreAddr = StoreMI.getPointerReg();
  auto BIO = getPointerInfo(StoreAddr, *MRI);
  Register StoreBase = BIO.getBase();
  if (C.Stores.empty()) {
    C.BasePtr = StoreBase;
    if (!BIO.hasValidOffset()) {
      C.CurrentLowestOffset = 0;
    } else {
      C.CurrentLowestOffset = BIO.getOffset();
    }
    // This is the first store of the candidate.
    // If the offset can't possibly allow for a lower addressed store with the
    // same base, don't bother adding it.
    if (BIO.hasValidOffset() &&
        BIO.getOffset() < static_cast<int64_t>(ValueTy.getSizeInBytes()))
      return false;
    C.Stores.emplace_back(&StoreMI);
    LLVM_DEBUG(dbgs() << "Starting a new merge candidate group with: "
                      << StoreMI);
    return true;
  }

  // Check the store is the same size as the existing ones in the candidate.
  if (MRI->getType(C.Stores[0]->getValueReg()).getSizeInBits() !=
      ValueTy.getSizeInBits())
    return false;

  if (MRI->getType(C.Stores[0]->getPointerReg()).getAddressSpace() !=
      PtrTy.getAddressSpace())
    return false;

  // There are other stores in the candidate. Check that the store address
  // writes to the next lowest adjacent address.
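  // For example, with 4-byte stores and CurrentLowestOffset == 8, only a store
  // to the same base at offset 4 extends the candidate; any other offset (or
  // an unknown offset) is rejected below.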
  if (C.BasePtr != StoreBase)
    return false;
  // If we don't have a valid offset, we can't guarantee that the store is
  // adjacent to the others.
  if (!BIO.hasValidOffset())
    return false;
  if ((C.CurrentLowestOffset -
       static_cast<int64_t>(ValueTy.getSizeInBytes())) != BIO.getOffset())
    return false;

  // This writes to an adjacent address. Allow it.
  C.Stores.emplace_back(&StoreMI);
  C.CurrentLowestOffset = C.CurrentLowestOffset - ValueTy.getSizeInBytes();
  LLVM_DEBUG(dbgs() << "Candidate added store: " << StoreMI);
  return true;
}

bool LoadStoreOpt::mergeBlockStores(MachineBasicBlock &MBB) {
  bool Changed = false;
  // Walk through the block bottom-up, looking for merging candidates.
  StoreMergeCandidate Candidate;
  for (MachineInstr &MI : llvm::reverse(MBB)) {
    if (InstsToErase.contains(&MI))
      continue;

    if (auto *StoreMI = dyn_cast<GStore>(&MI)) {
      // We have a G_STORE. Add it to the candidate if it writes to an adjacent
      // address.
      if (!addStoreToCandidate(*StoreMI, Candidate)) {
        // Store wasn't eligible to be added. May need to record it as a
        // potential alias.
        if (operationAliasesWithCandidate(*StoreMI, Candidate)) {
          Changed |= processMergeCandidate(Candidate);
          continue;
        }
        Candidate.addPotentialAlias(*StoreMI);
      }
      continue;
    }

    // If we don't have any stores yet, this instruction can't pose a problem.
    if (Candidate.Stores.empty())
      continue;

    // We're dealing with some other kind of instruction.
    if (isInstHardMergeHazard(MI)) {
      Changed |= processMergeCandidate(Candidate);
      Candidate.Stores.clear();
      continue;
    }

    if (!MI.mayLoadOrStore())
      continue;

    if (operationAliasesWithCandidate(MI, Candidate)) {
      // We have a potential alias, so process the current candidate if we can
      // and then continue looking for a new candidate.
      Changed |= processMergeCandidate(Candidate);
      continue;
    }

    // Record this instruction as a potential alias for future stores that are
    // added to the candidate.
    Candidate.addPotentialAlias(MI);
  }

  // Process any candidate left after finishing searching the entire block.
  Changed |= processMergeCandidate(Candidate);

  // Erase instructions now that we're no longer iterating over the block.
  for (auto *MI : InstsToErase)
    MI->eraseFromParent();
  InstsToErase.clear();
  return Changed;
}

/// Check if the store \p Store is a truncstore that can be merged. That is,
/// it's a store of a shifted value of \p SrcVal. If \p SrcVal is an empty
/// Register then it does not need to match, and SrcVal is set to the source
/// value that was found.
/// On a match, returns the offset (in units of the narrow memory type) at
/// which the stored piece lives within \p SrcVal.
static std::optional<int64_t>
getTruncStoreByteOffset(GStore &Store, Register &SrcVal,
                        MachineRegisterInfo &MRI) {
  Register TruncVal;
  if (!mi_match(Store.getValueReg(), MRI, m_GTrunc(m_Reg(TruncVal))))
    return std::nullopt;

  // The shift amount must be a constant multiple of the narrow type's width;
  // it gives the offset of this piece within the wide source value "y":
  //
  // x = G_LSHR y, ShiftAmtC
  // s8 z = G_TRUNC x
  // store z, ...
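  //
  // For example, assuming an s32 "y" stored as four s8 pieces, ShiftAmtC == 16
  // corresponds to the piece at byte offset 2 of "y".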
  Register FoundSrcVal;
  int64_t ShiftAmt;
  if (!mi_match(TruncVal, MRI,
                m_any_of(m_GLShr(m_Reg(FoundSrcVal), m_ICst(ShiftAmt)),
                         m_GAShr(m_Reg(FoundSrcVal), m_ICst(ShiftAmt))))) {
    if (!SrcVal.isValid() || TruncVal == SrcVal) {
      if (!SrcVal.isValid())
        SrcVal = TruncVal;
      return 0; // If it's the lowest index store.
    }
    return std::nullopt;
  }

  unsigned NarrowBits = Store.getMMO().getMemoryType().getScalarSizeInBits();
  if (ShiftAmt % NarrowBits != 0)
    return std::nullopt;
  const unsigned Offset = ShiftAmt / NarrowBits;

  if (SrcVal.isValid() && FoundSrcVal != SrcVal)
    return std::nullopt;

  if (!SrcVal.isValid())
    SrcVal = FoundSrcVal;
  else if (MRI.getType(SrcVal) != MRI.getType(FoundSrcVal))
    return std::nullopt;
  return Offset;
}

/// Match a pattern where a wide type scalar value is stored by several narrow
/// stores. Fold it into a single store or a BSWAP and a store if the target
/// supports it.
///
/// Assuming little endian target:
/// i8 *p = ...
/// i32 val = ...
/// p[0] = (val >> 0) & 0xFF;
/// p[1] = (val >> 8) & 0xFF;
/// p[2] = (val >> 16) & 0xFF;
/// p[3] = (val >> 24) & 0xFF;
/// =>
/// *((i32)p) = val;
///
/// i8 *p = ...
/// i32 val = ...
/// p[0] = (val >> 24) & 0xFF;
/// p[1] = (val >> 16) & 0xFF;
/// p[2] = (val >> 8) & 0xFF;
/// p[3] = (val >> 0) & 0xFF;
/// =>
/// *((i32)p) = BSWAP(val);
bool LoadStoreOpt::mergeTruncStore(GStore &StoreMI,
                                   SmallPtrSetImpl<GStore *> &DeletedStores) {
  LLT MemTy = StoreMI.getMMO().getMemoryType();

  // We only handle merging simple stores of 1-4 bytes.
  if (!MemTy.isScalar())
    return false;
  switch (MemTy.getSizeInBits()) {
  case 8:
  case 16:
  case 32:
    break;
  default:
    return false;
  }
  if (!StoreMI.isSimple())
    return false;

  // We do a simple search for mergeable stores prior to this one.
  // Any potential alias hazard along the way terminates the search.
  SmallVector<GStore *> FoundStores;

  // We're looking for:
  // 1) a (store(trunc(...)))
  // 2) of an LSHR/ASHR of a single wide value, by the appropriate shift to get
  //    the partial value stored.
  // 3) where the offsets form either a little or big-endian sequence.

  auto &LastStore = StoreMI;

  // The single base pointer that all stores must use.
  Register BaseReg;
  int64_t LastOffset;
  if (!mi_match(LastStore.getPointerReg(), *MRI,
                m_GPtrAdd(m_Reg(BaseReg), m_ICst(LastOffset)))) {
    BaseReg = LastStore.getPointerReg();
    LastOffset = 0;
  }

  GStore *LowestIdxStore = &LastStore;
  int64_t LowestIdxOffset = LastOffset;

  Register WideSrcVal;
  auto LowestShiftAmt = getTruncStoreByteOffset(LastStore, WideSrcVal, *MRI);
  if (!LowestShiftAmt)
    return false; // Didn't match a trunc.
  assert(WideSrcVal.isValid());

  LLT WideStoreTy = MRI->getType(WideSrcVal);
  // The wide type might not be a multiple of the memory type, e.g. s48 and s32.
  if (WideStoreTy.getSizeInBits() % MemTy.getSizeInBits() != 0)
    return false;
  const unsigned NumStoresRequired =
      WideStoreTy.getSizeInBits() / MemTy.getSizeInBits();

  SmallVector<int64_t, 8> OffsetMap(NumStoresRequired, INT64_MAX);
  OffsetMap[*LowestShiftAmt] = LastOffset;
  FoundStores.emplace_back(&LastStore);

  const int MaxInstsToCheck = 10;
  int NumInstsChecked = 0;
  for (auto II = ++LastStore.getReverseIterator();
       II != LastStore.getParent()->rend() && NumInstsChecked < MaxInstsToCheck;
       ++II) {
    NumInstsChecked++;
    GStore *NewStore;
    if ((NewStore = dyn_cast<GStore>(&*II))) {
      if (NewStore->getMMO().getMemoryType() != MemTy || !NewStore->isSimple())
        break;
    } else if (II->isLoadFoldBarrier() || II->mayLoad()) {
      break;
    } else {
      continue; // This is a safe instruction we can look past.
    }

    Register NewBaseReg;
    int64_t MemOffset;
    // Check we're storing to the same base + some offset.
    if (!mi_match(NewStore->getPointerReg(), *MRI,
                  m_GPtrAdd(m_Reg(NewBaseReg), m_ICst(MemOffset)))) {
      NewBaseReg = NewStore->getPointerReg();
      MemOffset = 0;
    }
    if (BaseReg != NewBaseReg)
      break;

    auto ShiftByteOffset = getTruncStoreByteOffset(*NewStore, WideSrcVal, *MRI);
    if (!ShiftByteOffset)
      break;
    if (MemOffset < LowestIdxOffset) {
      LowestIdxOffset = MemOffset;
      LowestIdxStore = NewStore;
    }

    // Map this store's memory offset to the position of its piece within the
    // wide value, and stop the search if that position has already been seen.
    if (*ShiftByteOffset < 0 || *ShiftByteOffset >= NumStoresRequired ||
        OffsetMap[*ShiftByteOffset] != INT64_MAX)
      break;
    OffsetMap[*ShiftByteOffset] = MemOffset;

    FoundStores.emplace_back(NewStore);
    // Reset counter since we've found a matching inst.
    NumInstsChecked = 0;
    if (FoundStores.size() == NumStoresRequired)
      break;
  }

  if (FoundStores.size() != NumStoresRequired) {
    if (FoundStores.size() == 1)
      return false;
    // We didn't find enough stores to merge into the size of the original
    // source value, but we may be able to generate a smaller store if we
    // truncate the source value.
    WideStoreTy =
        LLT::scalar(FoundStores.size() * MemTy.getScalarSizeInBits());
  }

  unsigned NumStoresFound = FoundStores.size();

  const auto &DL = LastStore.getMF()->getDataLayout();
  auto &C = LastStore.getMF()->getFunction().getContext();
  // Check that a store of the wide type is both allowed and fast on the target.
  unsigned Fast = 0;
  bool Allowed = TLI->allowsMemoryAccess(
      C, DL, WideStoreTy, LowestIdxStore->getMMO(), &Fast);
  if (!Allowed || !Fast)
    return false;

  // Check if the pieces of the value are going to the expected places in
  // memory to merge the stores.
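  // For example, four s8 pieces of an s32 value with lowest offset L merge
  // directly when OffsetMap reads {L, L+1, L+2, L+3} on a little-endian
  // layout; the exact reverse order instead requires a G_BSWAP of the source.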
  unsigned NarrowBits = MemTy.getScalarSizeInBits();
  auto checkOffsets = [&](bool MatchLittleEndian) {
    if (MatchLittleEndian) {
      for (unsigned i = 0; i != NumStoresFound; ++i)
        if (OffsetMap[i] != i * (NarrowBits / 8) + LowestIdxOffset)
          return false;
    } else { // MatchBigEndian by reversing loop counter.
      for (unsigned i = 0, j = NumStoresFound - 1; i != NumStoresFound;
           ++i, --j)
        if (OffsetMap[j] != i * (NarrowBits / 8) + LowestIdxOffset)
          return false;
    }
    return true;
  };

  // Check if the offsets line up for the native data layout of this target.
  bool NeedBswap = false;
  bool NeedRotate = false;
  if (!checkOffsets(DL.isLittleEndian())) {
    // Special-case: check if byte offsets line up for the opposite endian.
    if (NarrowBits == 8 && checkOffsets(DL.isBigEndian()))
      NeedBswap = true;
    else if (NumStoresFound == 2 && checkOffsets(DL.isBigEndian()))
      NeedRotate = true;
    else
      return false;
  }

  if (NeedBswap &&
      !isLegalOrBeforeLegalizer({TargetOpcode::G_BSWAP, {WideStoreTy}}, *MF))
    return false;
  if (NeedRotate &&
      !isLegalOrBeforeLegalizer(
          {TargetOpcode::G_ROTR, {WideStoreTy, WideStoreTy}}, *MF))
    return false;

  Builder.setInstrAndDebugLoc(StoreMI);

  if (WideStoreTy != MRI->getType(WideSrcVal))
    WideSrcVal = Builder.buildTrunc(WideStoreTy, WideSrcVal).getReg(0);

  if (NeedBswap) {
    WideSrcVal = Builder.buildBSwap(WideStoreTy, WideSrcVal).getReg(0);
  } else if (NeedRotate) {
    assert(WideStoreTy.getSizeInBits() % 2 == 0 &&
           "Unexpected type for rotate");
    auto RotAmt =
        Builder.buildConstant(WideStoreTy, WideStoreTy.getSizeInBits() / 2);
    WideSrcVal =
        Builder.buildRotateRight(WideStoreTy, WideSrcVal, RotAmt).getReg(0);
  }

  Builder.buildStore(WideSrcVal, LowestIdxStore->getPointerReg(),
                     LowestIdxStore->getMMO().getPointerInfo(),
                     LowestIdxStore->getMMO().getAlign());

  // Erase the old stores.
  for (auto *ST : FoundStores) {
    ST->eraseFromParent();
    DeletedStores.insert(ST);
  }
  return true;
}

bool LoadStoreOpt::mergeTruncStoresBlock(MachineBasicBlock &BB) {
  bool Changed = false;
  SmallVector<GStore *, 16> Stores;
  SmallPtrSet<GStore *, 8> DeletedStores;
  // Walk up the block so we can see the most eligible stores.
  for (MachineInstr &MI : llvm::reverse(BB))
    if (auto *StoreMI = dyn_cast<GStore>(&MI))
      Stores.emplace_back(StoreMI);

  for (auto *StoreMI : Stores) {
    if (DeletedStores.count(StoreMI))
      continue;
    if (mergeTruncStore(*StoreMI, DeletedStores))
      Changed = true;
  }
  return Changed;
}

bool LoadStoreOpt::mergeFunctionStores(MachineFunction &MF) {
  bool Changed = false;
  for (auto &BB : MF) {
    Changed |= mergeBlockStores(BB);
    Changed |= mergeTruncStoresBlock(BB);
  }

  // Erase all dead instructions left over by the merging.
  if (Changed) {
    for (auto &BB : MF) {
      for (auto &I : make_early_inc_range(reverse(BB))) {
        if (isTriviallyDead(I, *MRI))
          I.eraseFromParent();
      }
    }
  }

  return Changed;
}

void LoadStoreOpt::initializeStoreMergeTargetInfo(unsigned AddrSpace) {
  // Query the legalizer info to record what store types are legal.
  // We record this because we don't want to bother trying to merge stores into
  // illegal ones, which would just result in being split again.
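  //
  // For example, on a hypothetical target where only s8, s16, s32 and s64
  // stores are legal in address space 0, LegalStoreSizes[0] ends up with
  // exactly bits 8, 16, 32 and 64 set.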

  if (LegalStoreSizes.count(AddrSpace)) {
    assert(LegalStoreSizes[AddrSpace].any());
    return; // Already cached sizes for this address space.
  }

  // Need to reserve at least MaxStoreSizeToForm + 1 bits.
  BitVector LegalSizes(MaxStoreSizeToForm * 2);
  const auto &LI = *MF->getSubtarget().getLegalizerInfo();
  const auto &DL = MF->getFunction().getDataLayout();
  Type *IRPtrTy = PointerType::get(MF->getFunction().getContext(), AddrSpace);
  LLT PtrTy = getLLTForType(*IRPtrTy, DL);
  // We assume that we're not going to be generating any stores wider than
  // MaxStoreSizeToForm bits for now.
  for (unsigned Size = 2; Size <= MaxStoreSizeToForm; Size *= 2) {
    LLT Ty = LLT::scalar(Size);
    SmallVector<LegalityQuery::MemDesc, 2> MemDescrs(
        {{Ty, Ty.getSizeInBits(), AtomicOrdering::NotAtomic}});
    SmallVector<LLT> StoreTys({Ty, PtrTy});
    LegalityQuery Q(TargetOpcode::G_STORE, StoreTys, MemDescrs);
    LegalizeActionStep ActionStep = LI.getAction(Q);
    if (ActionStep.Action == LegalizeActions::Legal)
      LegalSizes.set(Size);
  }
  assert(LegalSizes.any() && "Expected some store sizes to be legal!");
  LegalStoreSizes[AddrSpace] = LegalSizes;
}

bool LoadStoreOpt::runOnMachineFunction(MachineFunction &MF) {
  // If the ISel pipeline failed, do not bother running this pass.
  if (MF.getProperties().hasFailedISel())
    return false;

  LLVM_DEBUG(dbgs() << "Begin memory optimizations for: " << MF.getName()
                    << '\n');

  init(MF);
  bool Changed = false;
  Changed |= mergeFunctionStores(MF);

  LegalStoreSizes.clear();
  return Changed;
}