1//===- X86AvoidStoreForwardingBlocks.cpp - Avoid HW Store Forward Block ---===//
2//
3// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4// See https://llvm.org/LICENSE.txt for license information.
5// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6//
7//===----------------------------------------------------------------------===//
8//
9// If a load follows a store and reloads data that the store has written to
10// memory, Intel microarchitectures can in many cases forward the data directly
11// from the store to the load, This "store forwarding" saves cycles by enabling
12// the load to directly obtain the data instead of accessing the data from
13// cache or memory.
14// A "store forward block" occurs in cases that a store cannot be forwarded to
15// the load. The most typical case of store forward block on Intel Core
16// microarchitecture that a small store cannot be forwarded to a large load.
17// The estimated penalty for a store forward block is ~13 cycles.
18//
19// This pass tries to recognize and handle cases where "store forward block"
20// is created by the compiler when lowering memcpy calls to a sequence
21// of a load and a store.
22//
23// The pass currently only handles cases where memcpy is lowered to
24// XMM/YMM registers, it tries to break the memcpy into smaller copies.
25// breaking the memcpy should be possible since there is no atomicity
26// guarantee for loads and stores to XMM/YMM.
27//
28// It could be better for performance to solve the problem by loading
29// to XMM/YMM then inserting the partial store before storing back from XMM/YMM
30// to memory, but this will result in a more conservative optimization since it
31// requires we prove that all memory accesses between the blocking store and the
32// load must alias/don't alias before we can move the store, whereas the
33// transformation done here is correct regardless to other memory accesses.
34//===----------------------------------------------------------------------===//
35
36#include "X86.h"
37#include "X86InstrInfo.h"
38#include "X86Subtarget.h"
39#include "llvm/Analysis/AliasAnalysis.h"
40#include "llvm/CodeGen/MachineBasicBlock.h"
41#include "llvm/CodeGen/MachineFunction.h"
42#include "llvm/CodeGen/MachineFunctionPass.h"
43#include "llvm/CodeGen/MachineInstr.h"
44#include "llvm/CodeGen/MachineInstrBuilder.h"
45#include "llvm/CodeGen/MachineOperand.h"
46#include "llvm/CodeGen/MachineRegisterInfo.h"
47#include "llvm/IR/DebugLoc.h"
48#include "llvm/IR/Function.h"
49#include "llvm/InitializePasses.h"
50#include "llvm/MC/MCInstrDesc.h"
51
52using namespace llvm;
53
54#define DEBUG_TYPE "x86-avoid-sfb"
55
56static cl::opt<bool> DisableX86AvoidStoreForwardBlocks(
57 "x86-disable-avoid-SFB", cl::Hidden,
58 cl::desc("X86: Disable Store Forwarding Blocks fixup."), cl::init(Val: false));
59
60static cl::opt<unsigned> X86AvoidSFBInspectionLimit(
61 "x86-sfb-inspection-limit",
62 cl::desc("X86: Number of instructions backward to "
63 "inspect for store forwarding blocks."),
64 cl::init(Val: 20), cl::Hidden);
65
66namespace {
67
68using DisplacementSizeMap = std::map<int64_t, unsigned>;
69
70class X86AvoidSFBImpl {
71public:
72 X86AvoidSFBImpl(AliasAnalysis *AA) : AA(AA) {};
73 bool runOnMachineFunction(MachineFunction &MF);
74
75private:
76 MachineRegisterInfo *MRI = nullptr;
77 const X86InstrInfo *TII = nullptr;
78 const X86RegisterInfo *TRI = nullptr;
79 SmallVector<std::pair<MachineInstr *, MachineInstr *>, 2>
80 BlockedLoadsStoresPairs;
81 SmallVector<MachineInstr *, 2> ForRemoval;
82 AliasAnalysis *AA = nullptr;
83
84 /// Returns couples of Load then Store to memory which look
85 /// like a memcpy.
86 void findPotentiallylBlockedCopies(MachineFunction &MF);
87 /// Break the memcpy's load and store into smaller copies
88 /// such that each memory load that was blocked by a smaller store
89 /// would now be copied separately.
90 void breakBlockedCopies(MachineInstr *LoadInst, MachineInstr *StoreInst,
91 const DisplacementSizeMap &BlockingStoresDispSizeMap);
92 /// Break a copy of size Size to smaller copies.
93 void buildCopies(int Size, MachineInstr *LoadInst, int64_t LdDispImm,
94 MachineInstr *StoreInst, int64_t StDispImm, int64_t Offset);
95
96 void buildCopy(MachineInstr *LoadInst, unsigned NLoadOpcode, int64_t LoadDisp,
97 MachineInstr *StoreInst, unsigned NStoreOpcode,
98 int64_t StoreDisp, unsigned Size, int64_t Offset);
99
100 bool alias(const MachineMemOperand &Op1, const MachineMemOperand &Op2) const;
101
102 unsigned getRegSizeInBytes(MachineInstr *Inst);
103};
104
105class X86AvoidSFBLegacy : public MachineFunctionPass {
106public:
107 static char ID;
108 X86AvoidSFBLegacy() : MachineFunctionPass(ID) {}
109
110 StringRef getPassName() const override {
111 return "X86 Avoid Store Forwarding Blocks";
112 }
113
114 bool runOnMachineFunction(MachineFunction &MF) override;
115
116 void getAnalysisUsage(AnalysisUsage &AU) const override {
117 MachineFunctionPass::getAnalysisUsage(AU);
118 AU.addRequired<AAResultsWrapperPass>();
119 }
120};
121
122} // end anonymous namespace
123
124char X86AvoidSFBLegacy::ID = 0;
125
126INITIALIZE_PASS_BEGIN(X86AvoidSFBLegacy, DEBUG_TYPE, "Machine code sinking",
127 false, false)
128INITIALIZE_PASS_DEPENDENCY(AAResultsWrapperPass)
129INITIALIZE_PASS_END(X86AvoidSFBLegacy, DEBUG_TYPE, "Machine code sinking",
130 false, false)
131
132FunctionPass *llvm::createX86AvoidStoreForwardingBlocksLegacyPass() {
133 return new X86AvoidSFBLegacy();
134}
135
136static bool isXMMLoadOpcode(unsigned Opcode) {
137 return Opcode == X86::MOVUPSrm || Opcode == X86::MOVAPSrm ||
138 Opcode == X86::VMOVUPSrm || Opcode == X86::VMOVAPSrm ||
139 Opcode == X86::VMOVUPDrm || Opcode == X86::VMOVAPDrm ||
140 Opcode == X86::VMOVDQUrm || Opcode == X86::VMOVDQArm ||
141 Opcode == X86::VMOVUPSZ128rm || Opcode == X86::VMOVAPSZ128rm ||
142 Opcode == X86::VMOVUPDZ128rm || Opcode == X86::VMOVAPDZ128rm ||
143 Opcode == X86::VMOVDQU64Z128rm || Opcode == X86::VMOVDQA64Z128rm ||
144 Opcode == X86::VMOVDQU32Z128rm || Opcode == X86::VMOVDQA32Z128rm;
145}
146static bool isYMMLoadOpcode(unsigned Opcode) {
147 return Opcode == X86::VMOVUPSYrm || Opcode == X86::VMOVAPSYrm ||
148 Opcode == X86::VMOVUPDYrm || Opcode == X86::VMOVAPDYrm ||
149 Opcode == X86::VMOVDQUYrm || Opcode == X86::VMOVDQAYrm ||
150 Opcode == X86::VMOVUPSZ256rm || Opcode == X86::VMOVAPSZ256rm ||
151 Opcode == X86::VMOVUPDZ256rm || Opcode == X86::VMOVAPDZ256rm ||
152 Opcode == X86::VMOVDQU64Z256rm || Opcode == X86::VMOVDQA64Z256rm ||
153 Opcode == X86::VMOVDQU32Z256rm || Opcode == X86::VMOVDQA32Z256rm;
154}
155
156static bool isPotentialBlockedMemCpyLd(unsigned Opcode) {
157 return isXMMLoadOpcode(Opcode) || isYMMLoadOpcode(Opcode);
158}
159
160static bool isPotentialBlockedMemCpyPair(unsigned LdOpcode, unsigned StOpcode) {
161 switch (LdOpcode) {
162 case X86::MOVUPSrm:
163 case X86::MOVAPSrm:
164 return StOpcode == X86::MOVUPSmr || StOpcode == X86::MOVAPSmr;
165 case X86::VMOVUPSrm:
166 case X86::VMOVAPSrm:
167 return StOpcode == X86::VMOVUPSmr || StOpcode == X86::VMOVAPSmr;
168 case X86::VMOVUPDrm:
169 case X86::VMOVAPDrm:
170 return StOpcode == X86::VMOVUPDmr || StOpcode == X86::VMOVAPDmr;
171 case X86::VMOVDQUrm:
172 case X86::VMOVDQArm:
173 return StOpcode == X86::VMOVDQUmr || StOpcode == X86::VMOVDQAmr;
174 case X86::VMOVUPSZ128rm:
175 case X86::VMOVAPSZ128rm:
176 return StOpcode == X86::VMOVUPSZ128mr || StOpcode == X86::VMOVAPSZ128mr;
177 case X86::VMOVUPDZ128rm:
178 case X86::VMOVAPDZ128rm:
179 return StOpcode == X86::VMOVUPDZ128mr || StOpcode == X86::VMOVAPDZ128mr;
180 case X86::VMOVUPSYrm:
181 case X86::VMOVAPSYrm:
182 return StOpcode == X86::VMOVUPSYmr || StOpcode == X86::VMOVAPSYmr;
183 case X86::VMOVUPDYrm:
184 case X86::VMOVAPDYrm:
185 return StOpcode == X86::VMOVUPDYmr || StOpcode == X86::VMOVAPDYmr;
186 case X86::VMOVDQUYrm:
187 case X86::VMOVDQAYrm:
188 return StOpcode == X86::VMOVDQUYmr || StOpcode == X86::VMOVDQAYmr;
189 case X86::VMOVUPSZ256rm:
190 case X86::VMOVAPSZ256rm:
191 return StOpcode == X86::VMOVUPSZ256mr || StOpcode == X86::VMOVAPSZ256mr;
192 case X86::VMOVUPDZ256rm:
193 case X86::VMOVAPDZ256rm:
194 return StOpcode == X86::VMOVUPDZ256mr || StOpcode == X86::VMOVAPDZ256mr;
195 case X86::VMOVDQU64Z128rm:
196 case X86::VMOVDQA64Z128rm:
197 return StOpcode == X86::VMOVDQU64Z128mr || StOpcode == X86::VMOVDQA64Z128mr;
198 case X86::VMOVDQU32Z128rm:
199 case X86::VMOVDQA32Z128rm:
200 return StOpcode == X86::VMOVDQU32Z128mr || StOpcode == X86::VMOVDQA32Z128mr;
201 case X86::VMOVDQU64Z256rm:
202 case X86::VMOVDQA64Z256rm:
203 return StOpcode == X86::VMOVDQU64Z256mr || StOpcode == X86::VMOVDQA64Z256mr;
204 case X86::VMOVDQU32Z256rm:
205 case X86::VMOVDQA32Z256rm:
206 return StOpcode == X86::VMOVDQU32Z256mr || StOpcode == X86::VMOVDQA32Z256mr;
207 default:
208 return false;
209 }
210}
211
212static bool isPotentialBlockingStoreInst(unsigned Opcode, unsigned LoadOpcode) {
213 bool PBlock = false;
214 PBlock |= Opcode == X86::MOV64mr || Opcode == X86::MOV64mi32 ||
215 Opcode == X86::MOV32mr || Opcode == X86::MOV32mi ||
216 Opcode == X86::MOV16mr || Opcode == X86::MOV16mi ||
217 Opcode == X86::MOV8mr || Opcode == X86::MOV8mi;
218 if (isYMMLoadOpcode(Opcode: LoadOpcode))
219 PBlock |= Opcode == X86::VMOVUPSmr || Opcode == X86::VMOVAPSmr ||
220 Opcode == X86::VMOVUPDmr || Opcode == X86::VMOVAPDmr ||
221 Opcode == X86::VMOVDQUmr || Opcode == X86::VMOVDQAmr ||
222 Opcode == X86::VMOVUPSZ128mr || Opcode == X86::VMOVAPSZ128mr ||
223 Opcode == X86::VMOVUPDZ128mr || Opcode == X86::VMOVAPDZ128mr ||
224 Opcode == X86::VMOVDQU64Z128mr ||
225 Opcode == X86::VMOVDQA64Z128mr ||
226 Opcode == X86::VMOVDQU32Z128mr || Opcode == X86::VMOVDQA32Z128mr;
227 return PBlock;
228}
229
230static const int MOV128SZ = 16;
231static const int MOV64SZ = 8;
232static const int MOV32SZ = 4;
233static const int MOV16SZ = 2;
234static const int MOV8SZ = 1;
235
236static unsigned getYMMtoXMMLoadOpcode(unsigned LoadOpcode) {
237 switch (LoadOpcode) {
238 case X86::VMOVUPSYrm:
239 case X86::VMOVAPSYrm:
240 return X86::VMOVUPSrm;
241 case X86::VMOVUPDYrm:
242 case X86::VMOVAPDYrm:
243 return X86::VMOVUPDrm;
244 case X86::VMOVDQUYrm:
245 case X86::VMOVDQAYrm:
246 return X86::VMOVDQUrm;
247 case X86::VMOVUPSZ256rm:
248 case X86::VMOVAPSZ256rm:
249 return X86::VMOVUPSZ128rm;
250 case X86::VMOVUPDZ256rm:
251 case X86::VMOVAPDZ256rm:
252 return X86::VMOVUPDZ128rm;
253 case X86::VMOVDQU64Z256rm:
254 case X86::VMOVDQA64Z256rm:
255 return X86::VMOVDQU64Z128rm;
256 case X86::VMOVDQU32Z256rm:
257 case X86::VMOVDQA32Z256rm:
258 return X86::VMOVDQU32Z128rm;
259 default:
260 llvm_unreachable("Unexpected Load Instruction Opcode");
261 }
262 return 0;
263}
264
265static unsigned getYMMtoXMMStoreOpcode(unsigned StoreOpcode) {
266 switch (StoreOpcode) {
267 case X86::VMOVUPSYmr:
268 case X86::VMOVAPSYmr:
269 return X86::VMOVUPSmr;
270 case X86::VMOVUPDYmr:
271 case X86::VMOVAPDYmr:
272 return X86::VMOVUPDmr;
273 case X86::VMOVDQUYmr:
274 case X86::VMOVDQAYmr:
275 return X86::VMOVDQUmr;
276 case X86::VMOVUPSZ256mr:
277 case X86::VMOVAPSZ256mr:
278 return X86::VMOVUPSZ128mr;
279 case X86::VMOVUPDZ256mr:
280 case X86::VMOVAPDZ256mr:
281 return X86::VMOVUPDZ128mr;
282 case X86::VMOVDQU64Z256mr:
283 case X86::VMOVDQA64Z256mr:
284 return X86::VMOVDQU64Z128mr;
285 case X86::VMOVDQU32Z256mr:
286 case X86::VMOVDQA32Z256mr:
287 return X86::VMOVDQU32Z128mr;
288 default:
289 llvm_unreachable("Unexpected Load Instruction Opcode");
290 }
291 return 0;
292}
293
294static int getAddrOffset(const MachineInstr *MI) {
295 const MCInstrDesc &Descl = MI->getDesc();
296 int AddrOffset = X86II::getMemoryOperandNo(TSFlags: Descl.TSFlags);
297 assert(AddrOffset != -1 && "Expected Memory Operand");
298 AddrOffset += X86II::getOperandBias(Desc: Descl);
299 return AddrOffset;
300}
301
302static MachineOperand &getBaseOperand(MachineInstr *MI) {
303 int AddrOffset = getAddrOffset(MI);
304 return MI->getOperand(i: AddrOffset + X86::AddrBaseReg);
305}
306
307static MachineOperand &getDispOperand(MachineInstr *MI) {
308 int AddrOffset = getAddrOffset(MI);
309 return MI->getOperand(i: AddrOffset + X86::AddrDisp);
310}
311
312// Relevant addressing modes contain only base register and immediate
313// displacement or frameindex and immediate displacement.
314// TODO: Consider expanding to other addressing modes in the future
315static bool isRelevantAddressingMode(MachineInstr *MI) {
316 int AddrOffset = getAddrOffset(MI);
317 const MachineOperand &Base = getBaseOperand(MI);
318 const MachineOperand &Disp = getDispOperand(MI);
319 const MachineOperand &Scale = MI->getOperand(i: AddrOffset + X86::AddrScaleAmt);
320 const MachineOperand &Index = MI->getOperand(i: AddrOffset + X86::AddrIndexReg);
321 const MachineOperand &Segment = MI->getOperand(i: AddrOffset + X86::AddrSegmentReg);
322
323 if (!((Base.isReg() && Base.getReg() != X86::NoRegister) || Base.isFI()))
324 return false;
325 if (!Disp.isImm())
326 return false;
327 if (Scale.getImm() != 1)
328 return false;
329 if (!(Index.isReg() && Index.getReg() == X86::NoRegister))
330 return false;
331 if (!(Segment.isReg() && Segment.getReg() == X86::NoRegister))
332 return false;
333 return true;
334}
335
336// Collect potentially blocking stores.
337// Limit the number of instructions backwards we want to inspect
338// since the effect of store block won't be visible if the store
339// and load instructions have enough instructions in between to
340// keep the core busy.
341static SmallVector<MachineInstr *, 2>
342findPotentialBlockers(MachineInstr *LoadInst) {
343 SmallVector<MachineInstr *, 2> PotentialBlockers;
344 unsigned BlockCount = 0;
345 const unsigned InspectionLimit = X86AvoidSFBInspectionLimit;
346 for (auto PBInst = std::next(x: MachineBasicBlock::reverse_iterator(LoadInst)),
347 E = LoadInst->getParent()->rend();
348 PBInst != E; ++PBInst) {
349 if (PBInst->isMetaInstruction())
350 continue;
351 BlockCount++;
352 if (BlockCount >= InspectionLimit)
353 break;
354 MachineInstr &MI = *PBInst;
355 if (MI.getDesc().isCall())
356 return PotentialBlockers;
357 PotentialBlockers.push_back(Elt: &MI);
358 }
359 // If we didn't get to the instructions limit try predecessing blocks.
360 // Ideally we should traverse the predecessor blocks in depth with some
361 // coloring algorithm, but for now let's just look at the first order
362 // predecessors.
363 if (BlockCount < InspectionLimit) {
364 MachineBasicBlock *MBB = LoadInst->getParent();
365 int LimitLeft = InspectionLimit - BlockCount;
366 for (MachineBasicBlock *PMBB : MBB->predecessors()) {
367 int PredCount = 0;
368 for (MachineInstr &PBInst : llvm::reverse(C&: *PMBB)) {
369 if (PBInst.isMetaInstruction())
370 continue;
371 PredCount++;
372 if (PredCount >= LimitLeft)
373 break;
374 if (PBInst.getDesc().isCall())
375 break;
376 PotentialBlockers.push_back(Elt: &PBInst);
377 }
378 }
379 }
380 return PotentialBlockers;
381}
382
383void X86AvoidSFBImpl::buildCopy(MachineInstr *LoadInst, unsigned NLoadOpcode,
384 int64_t LoadDisp, MachineInstr *StoreInst,
385 unsigned NStoreOpcode, int64_t StoreDisp,
386 unsigned Size, int64_t Offset) {
387 MachineOperand &LoadBase = getBaseOperand(MI: LoadInst);
388 MachineOperand &StoreBase = getBaseOperand(MI: StoreInst);
389 MachineBasicBlock *MBB = LoadInst->getParent();
390 MachineMemOperand *LMMO = *LoadInst->memoperands_begin();
391 MachineMemOperand *SMMO = *StoreInst->memoperands_begin();
392
393 Register Reg1 =
394 MRI->createVirtualRegister(RegClass: TII->getRegClass(MCID: TII->get(Opcode: NLoadOpcode), OpNum: 0));
395 MachineInstr *NewLoad =
396 BuildMI(BB&: *MBB, I: LoadInst, MIMD: LoadInst->getDebugLoc(), MCID: TII->get(Opcode: NLoadOpcode),
397 DestReg: Reg1)
398 .add(MO: LoadBase)
399 .addImm(Val: 1)
400 .addReg(RegNo: X86::NoRegister)
401 .addImm(Val: LoadDisp)
402 .addReg(RegNo: X86::NoRegister)
403 .addMemOperand(
404 MMO: MBB->getParent()->getMachineMemOperand(MMO: LMMO, Offset, Size));
405 if (LoadBase.isReg())
406 getBaseOperand(MI: NewLoad).setIsKill(false);
407 LLVM_DEBUG(NewLoad->dump());
408 // If the load and store are consecutive, use the loadInst location to
409 // reduce register pressure.
410 MachineInstr *StInst = StoreInst;
411 auto PrevInstrIt = prev_nodbg(It: MachineBasicBlock::instr_iterator(StoreInst),
412 Begin: MBB->instr_begin());
413 if (PrevInstrIt.getNodePtr() == LoadInst)
414 StInst = LoadInst;
415 MachineInstr *NewStore =
416 BuildMI(BB&: *MBB, I: StInst, MIMD: StInst->getDebugLoc(), MCID: TII->get(Opcode: NStoreOpcode))
417 .add(MO: StoreBase)
418 .addImm(Val: 1)
419 .addReg(RegNo: X86::NoRegister)
420 .addImm(Val: StoreDisp)
421 .addReg(RegNo: X86::NoRegister)
422 .addReg(RegNo: Reg1)
423 .addMemOperand(
424 MMO: MBB->getParent()->getMachineMemOperand(MMO: SMMO, Offset, Size));
425 if (StoreBase.isReg())
426 getBaseOperand(MI: NewStore).setIsKill(false);
427 MachineOperand &StoreSrcVReg = StoreInst->getOperand(i: X86::AddrNumOperands);
428 assert(StoreSrcVReg.isReg() && "Expected virtual register");
429 NewStore->getOperand(i: X86::AddrNumOperands).setIsKill(StoreSrcVReg.isKill());
430 LLVM_DEBUG(NewStore->dump());
431}
432
433void X86AvoidSFBImpl::buildCopies(int Size, MachineInstr *LoadInst,
434 int64_t LdDispImm, MachineInstr *StoreInst,
435 int64_t StDispImm, int64_t Offset) {
436 int LdDisp = LdDispImm;
437 int StDisp = StDispImm;
438 while (Size > 0) {
439 if ((Size - MOV128SZ >= 0) && isYMMLoadOpcode(Opcode: LoadInst->getOpcode())) {
440 Size = Size - MOV128SZ;
441 buildCopy(LoadInst, NLoadOpcode: getYMMtoXMMLoadOpcode(LoadOpcode: LoadInst->getOpcode()), LoadDisp: LdDisp,
442 StoreInst, NStoreOpcode: getYMMtoXMMStoreOpcode(StoreOpcode: StoreInst->getOpcode()),
443 StoreDisp: StDisp, Size: MOV128SZ, Offset);
444 LdDisp += MOV128SZ;
445 StDisp += MOV128SZ;
446 Offset += MOV128SZ;
447 continue;
448 }
449 if (Size - MOV64SZ >= 0) {
450 Size = Size - MOV64SZ;
451 buildCopy(LoadInst, NLoadOpcode: X86::MOV64rm, LoadDisp: LdDisp, StoreInst, NStoreOpcode: X86::MOV64mr, StoreDisp: StDisp,
452 Size: MOV64SZ, Offset);
453 LdDisp += MOV64SZ;
454 StDisp += MOV64SZ;
455 Offset += MOV64SZ;
456 continue;
457 }
458 if (Size - MOV32SZ >= 0) {
459 Size = Size - MOV32SZ;
460 buildCopy(LoadInst, NLoadOpcode: X86::MOV32rm, LoadDisp: LdDisp, StoreInst, NStoreOpcode: X86::MOV32mr, StoreDisp: StDisp,
461 Size: MOV32SZ, Offset);
462 LdDisp += MOV32SZ;
463 StDisp += MOV32SZ;
464 Offset += MOV32SZ;
465 continue;
466 }
467 if (Size - MOV16SZ >= 0) {
468 Size = Size - MOV16SZ;
469 buildCopy(LoadInst, NLoadOpcode: X86::MOV16rm, LoadDisp: LdDisp, StoreInst, NStoreOpcode: X86::MOV16mr, StoreDisp: StDisp,
470 Size: MOV16SZ, Offset);
471 LdDisp += MOV16SZ;
472 StDisp += MOV16SZ;
473 Offset += MOV16SZ;
474 continue;
475 }
476 if (Size - MOV8SZ >= 0) {
477 Size = Size - MOV8SZ;
478 buildCopy(LoadInst, NLoadOpcode: X86::MOV8rm, LoadDisp: LdDisp, StoreInst, NStoreOpcode: X86::MOV8mr, StoreDisp: StDisp,
479 Size: MOV8SZ, Offset);
480 LdDisp += MOV8SZ;
481 StDisp += MOV8SZ;
482 Offset += MOV8SZ;
483 continue;
484 }
485 }
486 assert(Size == 0 && "Wrong size division");
487}
488
489static void updateKillStatus(MachineInstr *LoadInst, MachineInstr *StoreInst) {
490 MachineOperand &LoadBase = getBaseOperand(MI: LoadInst);
491 MachineOperand &StoreBase = getBaseOperand(MI: StoreInst);
492 auto *StorePrevNonDbgInstr =
493 prev_nodbg(It: MachineBasicBlock::instr_iterator(StoreInst),
494 Begin: LoadInst->getParent()->instr_begin())
495 .getNodePtr();
496 if (LoadBase.isReg()) {
497 MachineInstr *LastLoad = LoadInst->getPrevNode();
498 // If the original load and store to xmm/ymm were consecutive
499 // then the partial copies were also created in
500 // a consecutive order to reduce register pressure,
501 // and the location of the last load is before the last store.
502 if (StorePrevNonDbgInstr == LoadInst)
503 LastLoad = LoadInst->getPrevNode()->getPrevNode();
504 getBaseOperand(MI: LastLoad).setIsKill(LoadBase.isKill());
505 }
506 if (StoreBase.isReg()) {
507 MachineInstr *StInst = StoreInst;
508 if (StorePrevNonDbgInstr == LoadInst)
509 StInst = LoadInst;
510 getBaseOperand(MI: StInst->getPrevNode()).setIsKill(StoreBase.isKill());
511 }
512}
513
514bool X86AvoidSFBImpl::alias(const MachineMemOperand &Op1,
515 const MachineMemOperand &Op2) const {
516 if (!Op1.getValue() || !Op2.getValue())
517 return true;
518
519 int64_t MinOffset = std::min(a: Op1.getOffset(), b: Op2.getOffset());
520 int64_t Overlapa = Op1.getSize().getValue() + Op1.getOffset() - MinOffset;
521 int64_t Overlapb = Op2.getSize().getValue() + Op2.getOffset() - MinOffset;
522
523 return !AA->isNoAlias(
524 LocA: MemoryLocation(Op1.getValue(), Overlapa, Op1.getAAInfo()),
525 LocB: MemoryLocation(Op2.getValue(), Overlapb, Op2.getAAInfo()));
526}
527
528void X86AvoidSFBImpl::findPotentiallylBlockedCopies(MachineFunction &MF) {
529 for (auto &MBB : MF)
530 for (auto &MI : MBB) {
531 if (!isPotentialBlockedMemCpyLd(Opcode: MI.getOpcode()))
532 continue;
533 Register DefVR = MI.getOperand(i: 0).getReg();
534 if (!MRI->hasOneNonDBGUse(RegNo: DefVR))
535 continue;
536 for (MachineOperand &StoreMO :
537 llvm::make_early_inc_range(Range: MRI->use_nodbg_operands(Reg: DefVR))) {
538 MachineInstr &StoreMI = *StoreMO.getParent();
539 // Skip cases where the memcpy may overlap.
540 if (StoreMI.getParent() == MI.getParent() &&
541 isPotentialBlockedMemCpyPair(LdOpcode: MI.getOpcode(), StOpcode: StoreMI.getOpcode()) &&
542 isRelevantAddressingMode(MI: &MI) &&
543 isRelevantAddressingMode(MI: &StoreMI) &&
544 MI.hasOneMemOperand() && StoreMI.hasOneMemOperand()) {
545 // Don't split volatile or atomic accesses.
546 const MachineMemOperand *LMMO = *MI.memoperands_begin();
547 const MachineMemOperand *SMMO = *StoreMI.memoperands_begin();
548 if (LMMO->isVolatile() || LMMO->isAtomic() || SMMO->isVolatile() ||
549 SMMO->isAtomic())
550 continue;
551 if (!alias(Op1: *LMMO, Op2: *SMMO))
552 BlockedLoadsStoresPairs.push_back(Elt: std::make_pair(x: &MI, y: &StoreMI));
553 }
554 }
555 }
556}
557
558unsigned X86AvoidSFBImpl::getRegSizeInBytes(MachineInstr *LoadInst) {
559 const auto *TRC = TII->getRegClass(MCID: TII->get(Opcode: LoadInst->getOpcode()), OpNum: 0);
560 return TRI->getRegSizeInBits(RC: *TRC) / 8;
561}
562
563void X86AvoidSFBImpl::breakBlockedCopies(
564 MachineInstr *LoadInst, MachineInstr *StoreInst,
565 const DisplacementSizeMap &BlockingStoresDispSizeMap) {
566 int64_t LdDispImm = getDispOperand(MI: LoadInst).getImm();
567 int64_t StDispImm = getDispOperand(MI: StoreInst).getImm();
568 int64_t Offset = 0;
569
570 int64_t LdDisp1 = LdDispImm;
571 int64_t LdDisp2 = 0;
572 int64_t StDisp1 = StDispImm;
573 int64_t StDisp2 = 0;
574 unsigned Size1 = 0;
575 unsigned Size2 = 0;
576 int64_t LdStDelta = StDispImm - LdDispImm;
577
578 for (auto DispSizePair : BlockingStoresDispSizeMap) {
579 LdDisp2 = DispSizePair.first;
580 StDisp2 = DispSizePair.first + LdStDelta;
581 Size2 = DispSizePair.second;
582 // Avoid copying overlapping areas.
583 if (LdDisp2 < LdDisp1) {
584 int OverlapDelta = LdDisp1 - LdDisp2;
585 LdDisp2 += OverlapDelta;
586 StDisp2 += OverlapDelta;
587 Size2 -= OverlapDelta;
588 }
589 Size1 = LdDisp2 - LdDisp1;
590
591 // Build a copy for the point until the current blocking store's
592 // displacement.
593 buildCopies(Size: Size1, LoadInst, LdDispImm: LdDisp1, StoreInst, StDispImm: StDisp1, Offset);
594 // Build a copy for the current blocking store.
595 buildCopies(Size: Size2, LoadInst, LdDispImm: LdDisp2, StoreInst, StDispImm: StDisp2, Offset: Offset + Size1);
596 LdDisp1 = LdDisp2 + Size2;
597 StDisp1 = StDisp2 + Size2;
598 Offset += Size1 + Size2;
599 }
600 unsigned Size3 = (LdDispImm + getRegSizeInBytes(LoadInst)) - LdDisp1;
601 buildCopies(Size: Size3, LoadInst, LdDispImm: LdDisp1, StoreInst, StDispImm: StDisp1, Offset);
602}
603
604static bool hasSameBaseOpValue(MachineInstr *LoadInst,
605 MachineInstr *StoreInst) {
606 const MachineOperand &LoadBase = getBaseOperand(MI: LoadInst);
607 const MachineOperand &StoreBase = getBaseOperand(MI: StoreInst);
608 if (LoadBase.isReg() != StoreBase.isReg())
609 return false;
610 if (LoadBase.isReg())
611 return LoadBase.getReg() == StoreBase.getReg();
612 return LoadBase.getIndex() == StoreBase.getIndex();
613}
614
615static bool isBlockingStore(int64_t LoadDispImm, unsigned LoadSize,
616 int64_t StoreDispImm, unsigned StoreSize) {
617 return ((StoreDispImm >= LoadDispImm) &&
618 (StoreDispImm <= LoadDispImm + (LoadSize - StoreSize)));
619}
620
621// Keep track of all stores blocking a load
622static void
623updateBlockingStoresDispSizeMap(DisplacementSizeMap &BlockingStoresDispSizeMap,
624 int64_t DispImm, unsigned Size) {
625 auto [It, Inserted] = BlockingStoresDispSizeMap.try_emplace(k: DispImm, args&: Size);
626 // Choose the smallest blocking store starting at this displacement.
627 if (!Inserted && It->second > Size)
628 It->second = Size;
629}
630
631// Remove blocking stores contained in each other.
632static void
633removeRedundantBlockingStores(DisplacementSizeMap &BlockingStoresDispSizeMap) {
634 if (BlockingStoresDispSizeMap.size() <= 1)
635 return;
636
637 SmallVector<std::pair<int64_t, unsigned>, 0> DispSizeStack;
638 for (auto DispSizePair : BlockingStoresDispSizeMap) {
639 int64_t CurrDisp = DispSizePair.first;
640 unsigned CurrSize = DispSizePair.second;
641 while (DispSizeStack.size()) {
642 int64_t PrevDisp = DispSizeStack.back().first;
643 unsigned PrevSize = DispSizeStack.back().second;
644 if (CurrDisp + CurrSize > PrevDisp + PrevSize)
645 break;
646 DispSizeStack.pop_back();
647 }
648 DispSizeStack.push_back(Elt: DispSizePair);
649 }
650 BlockingStoresDispSizeMap.clear();
651 for (auto Disp : DispSizeStack)
652 BlockingStoresDispSizeMap.insert(x&: Disp);
653}
654
655bool X86AvoidSFBImpl::runOnMachineFunction(MachineFunction &MF) {
656 bool Changed = false;
657
658 if (DisableX86AvoidStoreForwardBlocks ||
659 !MF.getSubtarget<X86Subtarget>().is64Bit())
660 return false;
661
662 MRI = &MF.getRegInfo();
663 assert(MRI->isSSA() && "Expected MIR to be in SSA form");
664 TII = MF.getSubtarget<X86Subtarget>().getInstrInfo();
665 TRI = MF.getSubtarget<X86Subtarget>().getRegisterInfo();
666 LLVM_DEBUG(dbgs() << "Start X86AvoidStoreForwardBlocks\n";);
667 // Look for a load then a store to XMM/YMM which look like a memcpy
668 findPotentiallylBlockedCopies(MF);
669
670 for (auto LoadStoreInstPair : BlockedLoadsStoresPairs) {
671 MachineInstr *LoadInst = LoadStoreInstPair.first;
672 int64_t LdDispImm = getDispOperand(MI: LoadInst).getImm();
673 DisplacementSizeMap BlockingStoresDispSizeMap;
674
675 SmallVector<MachineInstr *, 2> PotentialBlockers =
676 findPotentialBlockers(LoadInst);
677 for (auto *PBInst : PotentialBlockers) {
678 if (!isPotentialBlockingStoreInst(Opcode: PBInst->getOpcode(),
679 LoadOpcode: LoadInst->getOpcode()) ||
680 !isRelevantAddressingMode(MI: PBInst) || !PBInst->hasOneMemOperand())
681 continue;
682 int64_t PBstDispImm = getDispOperand(MI: PBInst).getImm();
683 unsigned PBstSize = (*PBInst->memoperands_begin())->getSize().getValue();
684 // This check doesn't cover all cases, but it will suffice for now.
685 // TODO: take branch probability into consideration, if the blocking
686 // store is in an unreached block, breaking the memcopy could lose
687 // performance.
688 if (hasSameBaseOpValue(LoadInst, StoreInst: PBInst) &&
689 isBlockingStore(LoadDispImm: LdDispImm, LoadSize: getRegSizeInBytes(LoadInst), StoreDispImm: PBstDispImm,
690 StoreSize: PBstSize))
691 updateBlockingStoresDispSizeMap(BlockingStoresDispSizeMap, DispImm: PBstDispImm,
692 Size: PBstSize);
693 }
694
695 if (BlockingStoresDispSizeMap.empty())
696 continue;
697
698 // We found a store forward block, break the memcpy's load and store
699 // into smaller copies such that each smaller store that was causing
700 // a store block would now be copied separately.
701 MachineInstr *StoreInst = LoadStoreInstPair.second;
702 LLVM_DEBUG(dbgs() << "Blocked load and store instructions: \n");
703 LLVM_DEBUG(LoadInst->dump());
704 LLVM_DEBUG(StoreInst->dump());
705 LLVM_DEBUG(dbgs() << "Replaced with:\n");
706 removeRedundantBlockingStores(BlockingStoresDispSizeMap);
707 breakBlockedCopies(LoadInst, StoreInst, BlockingStoresDispSizeMap);
708 updateKillStatus(LoadInst, StoreInst);
709 ForRemoval.push_back(Elt: LoadInst);
710 ForRemoval.push_back(Elt: StoreInst);
711 }
712 for (auto *RemovedInst : ForRemoval) {
713 RemovedInst->eraseFromParent();
714 }
715 ForRemoval.clear();
716 BlockedLoadsStoresPairs.clear();
717 LLVM_DEBUG(dbgs() << "End X86AvoidStoreForwardBlocks\n";);
718
719 return Changed;
720}
721
722bool X86AvoidSFBLegacy::runOnMachineFunction(MachineFunction &MF) {
723 if (skipFunction(F: MF.getFunction()))
724 return false;
725 AliasAnalysis *AA = &getAnalysis<AAResultsWrapperPass>().getAAResults();
726 X86AvoidSFBImpl Impl(AA);
727 return Impl.runOnMachineFunction(MF);
728}
729
730PreservedAnalyses
731X86AvoidStoreForwardingBlocksPass::run(MachineFunction &MF,
732 MachineFunctionAnalysisManager &MFAM) {
733 AliasAnalysis *AA =
734 &MFAM.getResult<FunctionAnalysisManagerMachineFunctionProxy>(IR&: MF)
735 .getManager()
736 .getResult<AAManager>(IR&: MF.getFunction());
737 X86AvoidSFBImpl Impl(AA);
738 bool Changed = Impl.runOnMachineFunction(MF);
739 return Changed ? getMachineFunctionPassPreservedAnalyses()
740 : PreservedAnalyses::all();
741}
742