1//===-- PGOMemOPSizeOpt.cpp - Optimizations based on value profiling ===//
2//
3// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4// See https://llvm.org/LICENSE.txt for license information.
5// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6//
7//===----------------------------------------------------------------------===//
8//
9// This file implements the transformation that optimizes memory intrinsics
10// such as memcpy using the size value profile. When memory intrinsic size
11// value profile metadata is available, a single memory intrinsic is expanded
12// to a sequence of guarded specialized versions that are called with the
13// hottest size(s), for later expansion into more optimal inline sequences.
14//
15//===----------------------------------------------------------------------===//
16
17#include "llvm/ADT/ArrayRef.h"
18#include "llvm/ADT/Statistic.h"
19#include "llvm/ADT/StringRef.h"
20#include "llvm/ADT/Twine.h"
21#include "llvm/Analysis/BlockFrequencyInfo.h"
22#include "llvm/Analysis/DomTreeUpdater.h"
23#include "llvm/Analysis/OptimizationRemarkEmitter.h"
24#include "llvm/Analysis/TargetLibraryInfo.h"
25#include "llvm/IR/BasicBlock.h"
26#include "llvm/IR/DerivedTypes.h"
27#include "llvm/IR/Dominators.h"
28#include "llvm/IR/Function.h"
29#include "llvm/IR/IRBuilder.h"
30#include "llvm/IR/InstVisitor.h"
31#include "llvm/IR/Instruction.h"
32#include "llvm/IR/Instructions.h"
33#include "llvm/IR/LLVMContext.h"
34#include "llvm/IR/PassManager.h"
35#include "llvm/IR/Type.h"
36#include "llvm/ProfileData/InstrProf.h"
37#define INSTR_PROF_VALUE_PROF_MEMOP_API
38#include "llvm/ProfileData/InstrProfData.inc"
39#include "llvm/Support/Casting.h"
40#include "llvm/Support/CommandLine.h"
41#include "llvm/Support/Debug.h"
42#include "llvm/Support/ErrorHandling.h"
43#include "llvm/Support/MathExtras.h"
44#include "llvm/Transforms/Instrumentation/PGOInstrumentation.h"
45#include "llvm/Transforms/Utils/BasicBlockUtils.h"
46#include <cassert>
47#include <cstdint>
48#include <vector>
49
50using namespace llvm;
51
52#define DEBUG_TYPE "pgo-memop-opt"
53
54STATISTIC(NumOfPGOMemOPOpt, "Number of memop intrinsics optimized.");
55STATISTIC(NumOfPGOMemOPAnnotate, "Number of memop intrinsics annotated.");
56
57// The minimum call count to optimize memory intrinsic calls.
58static cl::opt<unsigned>
59 MemOPCountThreshold("pgo-memop-count-threshold", cl::Hidden, cl::init(Val: 1000),
60 cl::desc("The minimum count to optimize memory "
61 "intrinsic calls"));
62
63// Command line option to disable memory intrinsic optimization. The default is
64// false. This is for debug purpose.
65static cl::opt<bool> DisableMemOPOPT("disable-memop-opt", cl::init(Val: false),
66 cl::Hidden, cl::desc("Disable optimize"));
67
68// The percent threshold to optimize memory intrinsic calls.
69static cl::opt<unsigned>
70 MemOPPercentThreshold("pgo-memop-percent-threshold", cl::init(Val: 40),
71 cl::Hidden,
72 cl::desc("The percentage threshold for the "
73 "memory intrinsic calls optimization"));
74
75// Maximum number of versions for optimizing memory intrinsic call.
76static cl::opt<unsigned>
77 MemOPMaxVersion("pgo-memop-max-version", cl::init(Val: 3), cl::Hidden,
78 cl::desc("The max version for the optimized memory "
79 " intrinsic calls"));
80
81// Scale the counts from the annotation using the BB count value.
82static cl::opt<bool>
83 MemOPScaleCount("pgo-memop-scale-count", cl::init(Val: true), cl::Hidden,
84 cl::desc("Scale the memop size counts using the basic "
85 " block count value"));
86
87cl::opt<bool>
88 MemOPOptMemcmpBcmp("pgo-memop-optimize-memcmp-bcmp", cl::init(Val: true),
89 cl::Hidden,
90 cl::desc("Size-specialize memcmp and bcmp calls"));
91
92static cl::opt<unsigned>
93 MemOpMaxOptSize("memop-value-prof-max-opt-size", cl::Hidden, cl::init(Val: 128),
94 cl::desc("Optimize the memop size <= this value"));
95
96namespace {
97
98static const char *getMIName(const MemIntrinsic *MI) {
99 switch (MI->getIntrinsicID()) {
100 case Intrinsic::memcpy:
101 return "memcpy";
102 case Intrinsic::memmove:
103 return "memmove";
104 case Intrinsic::memset:
105 return "memset";
106 default:
107 return "unknown";
108 }
109}
110
111// A class that abstracts a memop (memcpy, memmove, memset, memcmp and bcmp).
112struct MemOp {
113 Instruction *I;
114 MemOp(MemIntrinsic *MI) : I(MI) {}
115 MemOp(CallInst *CI) : I(CI) {}
116 MemIntrinsic *asMI() { return dyn_cast<MemIntrinsic>(Val: I); }
117 CallInst *asCI() { return cast<CallInst>(Val: I); }
118 MemOp clone() {
119 if (auto MI = asMI())
120 return MemOp(cast<MemIntrinsic>(Val: MI->clone()));
121 return MemOp(cast<CallInst>(Val: asCI()->clone()));
122 }
123 Value *getLength() {
124 if (auto MI = asMI())
125 return MI->getLength();
126 return asCI()->getArgOperand(i: 2);
127 }
128 void setLength(Value *Length) {
129 if (auto MI = asMI())
130 return MI->setLength(Length);
131 asCI()->setArgOperand(i: 2, v: Length);
132 }
133 StringRef getFuncName() {
134 if (auto MI = asMI())
135 return MI->getCalledFunction()->getName();
136 return asCI()->getCalledFunction()->getName();
137 }
138 bool isMemmove() {
139 if (auto MI = asMI())
140 if (MI->getIntrinsicID() == Intrinsic::memmove)
141 return true;
142 return false;
143 }
144 bool isMemcmp(TargetLibraryInfo &TLI) {
145 LibFunc Func;
146 if (asMI() == nullptr && TLI.getLibFunc(CB: *asCI(), F&: Func) &&
147 Func == LibFunc_memcmp) {
148 return true;
149 }
150 return false;
151 }
152 bool isBcmp(TargetLibraryInfo &TLI) {
153 LibFunc Func;
154 if (asMI() == nullptr && TLI.getLibFunc(CB: *asCI(), F&: Func) &&
155 Func == LibFunc_bcmp) {
156 return true;
157 }
158 return false;
159 }
160 const char *getName(TargetLibraryInfo &TLI) {
161 if (auto MI = asMI())
162 return getMIName(MI);
163 LibFunc Func;
164 if (TLI.getLibFunc(CB: *asCI(), F&: Func)) {
165 if (Func == LibFunc_memcmp)
166 return "memcmp";
167 if (Func == LibFunc_bcmp)
168 return "bcmp";
169 }
170 llvm_unreachable("Must be MemIntrinsic or memcmp/bcmp CallInst");
171 return nullptr;
172 }
173};
174
175class MemOPSizeOpt : public InstVisitor<MemOPSizeOpt> {
176public:
177 MemOPSizeOpt(Function &Func, BlockFrequencyInfo &BFI,
178 OptimizationRemarkEmitter &ORE, DominatorTree *DT,
179 TargetLibraryInfo &TLI)
180 : Func(Func), BFI(BFI), ORE(ORE), DT(DT), TLI(TLI), Changed(false) {}
181 bool isChanged() const { return Changed; }
182 void perform() {
183 WorkList.clear();
184 visit(F&: Func);
185
186 for (auto &MO : WorkList) {
187 ++NumOfPGOMemOPAnnotate;
188 if (perform(MO)) {
189 Changed = true;
190 ++NumOfPGOMemOPOpt;
191 LLVM_DEBUG(dbgs() << "MemOP call: " << MO.getFuncName()
192 << "is Transformed.\n");
193 }
194 }
195 }
196
197 void visitMemIntrinsic(MemIntrinsic &MI) {
198 Value *Length = MI.getLength();
199 // Not perform on constant length calls.
200 if (isa<ConstantInt>(Val: Length))
201 return;
202 WorkList.push_back(x: MemOp(&MI));
203 }
204
205 void visitCallInst(CallInst &CI) {
206 LibFunc Func;
207 if (TLI.getLibFunc(CB: CI, F&: Func) &&
208 (Func == LibFunc_memcmp || Func == LibFunc_bcmp) &&
209 !isa<ConstantInt>(Val: CI.getArgOperand(i: 2))) {
210 WorkList.push_back(x: MemOp(&CI));
211 }
212 }
213
214private:
215 Function &Func;
216 BlockFrequencyInfo &BFI;
217 OptimizationRemarkEmitter &ORE;
218 DominatorTree *DT;
219 TargetLibraryInfo &TLI;
220 bool Changed;
221 std::vector<MemOp> WorkList;
222 bool perform(MemOp MO);
223};
224
225static bool isProfitable(uint64_t Count, uint64_t TotalCount) {
226 assert(Count <= TotalCount);
227 if (Count < MemOPCountThreshold)
228 return false;
229 if (Count < TotalCount * MemOPPercentThreshold / 100)
230 return false;
231 return true;
232}
233
234static inline uint64_t getScaledCount(uint64_t Count, uint64_t Num,
235 uint64_t Denom) {
236 if (!MemOPScaleCount)
237 return Count;
238 bool Overflowed;
239 uint64_t ScaleCount = SaturatingMultiply(X: Count, Y: Num, ResultOverflowed: &Overflowed);
240 return ScaleCount / Denom;
241}
242
243bool MemOPSizeOpt::perform(MemOp MO) {
244 assert(MO.I);
245 if (MO.isMemmove())
246 return false;
247 if (!MemOPOptMemcmpBcmp && (MO.isMemcmp(TLI) || MO.isBcmp(TLI)))
248 return false;
249
250 uint32_t MaxNumVals = INSTR_PROF_NUM_BUCKETS;
251 uint64_t TotalCount;
252 auto VDs =
253 getValueProfDataFromInst(Inst: *MO.I, ValueKind: IPVK_MemOPSize, MaxNumValueData: MaxNumVals, TotalC&: TotalCount);
254 if (VDs.empty())
255 return false;
256
257 uint64_t ActualCount = TotalCount;
258 uint64_t SavedTotalCount = TotalCount;
259 if (MemOPScaleCount) {
260 auto BBEdgeCount = BFI.getBlockProfileCount(BB: MO.I->getParent());
261 if (!BBEdgeCount)
262 return false;
263 ActualCount = *BBEdgeCount;
264 }
265
266 LLVM_DEBUG(dbgs() << "Read one memory intrinsic profile with count "
267 << ActualCount << "\n");
268 LLVM_DEBUG(
269 for (auto &VD
270 : VDs) { dbgs() << " (" << VD.Value << "," << VD.Count << ")\n"; });
271
272 if (ActualCount < MemOPCountThreshold)
273 return false;
274 // Skip if the total value profiled count is 0, in which case we can't
275 // scale up the counts properly (and there is no profitable transformation).
276 if (TotalCount == 0)
277 return false;
278
279 TotalCount = ActualCount;
280 if (MemOPScaleCount)
281 LLVM_DEBUG(dbgs() << "Scale counts: numerator = " << ActualCount
282 << " denominator = " << SavedTotalCount << "\n");
283
284 // Keeping track of the count of the default case:
285 uint64_t RemainCount = TotalCount;
286 uint64_t SavedRemainCount = SavedTotalCount;
287 SmallVector<uint64_t, 16> SizeIds;
288 SmallVector<uint64_t, 16> CaseCounts;
289 SmallDenseSet<uint64_t, 16> SeenSizeId;
290 uint64_t MaxCount = 0;
291 unsigned Version = 0;
292 // Default case is in the front -- save the slot here.
293 CaseCounts.push_back(Elt: 0);
294 SmallVector<InstrProfValueData, 24> RemainingVDs;
295 for (auto I = VDs.begin(), E = VDs.end(); I != E; ++I) {
296 auto &VD = *I;
297 int64_t V = VD.Value;
298 uint64_t C = VD.Count;
299 if (MemOPScaleCount)
300 C = getScaledCount(Count: C, Num: ActualCount, Denom: SavedTotalCount);
301
302 if (!InstrProfIsSingleValRange(Value: V) || V > MemOpMaxOptSize) {
303 RemainingVDs.push_back(Elt: VD);
304 continue;
305 }
306
307 // ValueCounts are sorted on the count. Break at the first un-profitable
308 // value.
309 if (!isProfitable(Count: C, TotalCount: RemainCount)) {
310 RemainingVDs.insert(I: RemainingVDs.end(), From: I, To: E);
311 break;
312 }
313
314 if (!SeenSizeId.insert(V).second) {
315 errs() << "warning: Invalid Profile Data in Function " << Func.getName()
316 << ": Two identical values in MemOp value counts.\n";
317 return false;
318 }
319
320 SizeIds.push_back(Elt: V);
321 CaseCounts.push_back(Elt: C);
322 if (C > MaxCount)
323 MaxCount = C;
324
325 assert(RemainCount >= C);
326 RemainCount -= C;
327 assert(SavedRemainCount >= VD.Count);
328 SavedRemainCount -= VD.Count;
329
330 if (++Version >= MemOPMaxVersion && MemOPMaxVersion != 0) {
331 RemainingVDs.insert(I: RemainingVDs.end(), From: I + 1, To: E);
332 break;
333 }
334 }
335
336 if (Version == 0)
337 return false;
338
339 CaseCounts[0] = RemainCount;
340 if (RemainCount > MaxCount)
341 MaxCount = RemainCount;
342
343 uint64_t SumForOpt = TotalCount - RemainCount;
344
345 LLVM_DEBUG(dbgs() << "Optimize one memory intrinsic call to " << Version
346 << " Versions (covering " << SumForOpt << " out of "
347 << TotalCount << ")\n");
348
349 // mem_op(..., size)
350 // ==>
351 // switch (size) {
352 // case s1:
353 // mem_op(..., s1);
354 // goto merge_bb;
355 // case s2:
356 // mem_op(..., s2);
357 // goto merge_bb;
358 // ...
359 // default:
360 // mem_op(..., size);
361 // goto merge_bb;
362 // }
363 // merge_bb:
364
365 BasicBlock *BB = MO.I->getParent();
366 LLVM_DEBUG(dbgs() << "\n\n== Basic Block Before ==\n");
367 LLVM_DEBUG(dbgs() << *BB << "\n");
368 auto OrigBBFreq = BFI.getBlockFreq(BB);
369
370 BasicBlock *DefaultBB = SplitBlock(Old: BB, SplitPt: MO.I, DT);
371 BasicBlock::iterator It(*MO.I);
372 ++It;
373 assert(It != DefaultBB->end());
374 BasicBlock *MergeBB = SplitBlock(Old: DefaultBB, SplitPt: &(*It), DT);
375 MergeBB->setName("MemOP.Merge");
376 BFI.setBlockFreq(BB: MergeBB, Freq: OrigBBFreq);
377 DefaultBB->setName("MemOP.Default");
378
379 DomTreeUpdater DTU(DT, DomTreeUpdater::UpdateStrategy::Eager);
380 auto &Ctx = Func.getContext();
381 IRBuilder<> IRB(BB);
382 BB->getTerminator()->eraseFromParent();
383 Value *SizeVar = MO.getLength();
384 SwitchInst *SI = IRB.CreateSwitch(V: SizeVar, Dest: DefaultBB, NumCases: SizeIds.size());
385 Type *MemOpTy = MO.I->getType();
386 PHINode *PHI = nullptr;
387 if (!MemOpTy->isVoidTy()) {
388 // Insert a phi for the return values at the merge block.
389 IRBuilder<> IRBM(MergeBB->getFirstNonPHI());
390 PHI = IRBM.CreatePHI(Ty: MemOpTy, NumReservedValues: SizeIds.size() + 1, Name: "MemOP.RVMerge");
391 MO.I->replaceAllUsesWith(V: PHI);
392 PHI->addIncoming(V: MO.I, BB: DefaultBB);
393 }
394
395 // Clear the value profile data.
396 MO.I->setMetadata(KindID: LLVMContext::MD_prof, Node: nullptr);
397 // If all promoted, we don't need the MD.prof metadata.
398 if (SavedRemainCount > 0 || Version != VDs.size()) {
399 // Otherwise we need update with the un-promoted records back.
400 annotateValueSite(M&: *Func.getParent(), Inst&: *MO.I, VDs: RemainingVDs, Sum: SavedRemainCount,
401 ValueKind: IPVK_MemOPSize, MaxMDCount: VDs.size());
402 }
403
404 LLVM_DEBUG(dbgs() << "\n\n== Basic Block After==\n");
405
406 std::vector<DominatorTree::UpdateType> Updates;
407 if (DT)
408 Updates.reserve(n: 2 * SizeIds.size());
409
410 for (uint64_t SizeId : SizeIds) {
411 BasicBlock *CaseBB = BasicBlock::Create(
412 Context&: Ctx, Name: Twine("MemOP.Case.") + Twine(SizeId), Parent: &Func, InsertBefore: DefaultBB);
413 MemOp NewMO = MO.clone();
414 // Fix the argument.
415 auto *SizeType = dyn_cast<IntegerType>(Val: NewMO.getLength()->getType());
416 assert(SizeType && "Expected integer type size argument.");
417 ConstantInt *CaseSizeId = ConstantInt::get(Ty: SizeType, V: SizeId);
418 NewMO.setLength(CaseSizeId);
419 NewMO.I->insertInto(ParentBB: CaseBB, It: CaseBB->end());
420 IRBuilder<> IRBCase(CaseBB);
421 IRBCase.CreateBr(Dest: MergeBB);
422 SI->addCase(OnVal: CaseSizeId, Dest: CaseBB);
423 if (!MemOpTy->isVoidTy())
424 PHI->addIncoming(V: NewMO.I, BB: CaseBB);
425 if (DT) {
426 Updates.push_back(x: {DominatorTree::Insert, CaseBB, MergeBB});
427 Updates.push_back(x: {DominatorTree::Insert, BB, CaseBB});
428 }
429 LLVM_DEBUG(dbgs() << *CaseBB << "\n");
430 }
431 DTU.applyUpdates(Updates);
432 Updates.clear();
433
434 if (MaxCount)
435 setProfMetadata(M: Func.getParent(), TI: SI, EdgeCounts: CaseCounts, MaxCount);
436
437 LLVM_DEBUG(dbgs() << *BB << "\n");
438 LLVM_DEBUG(dbgs() << *DefaultBB << "\n");
439 LLVM_DEBUG(dbgs() << *MergeBB << "\n");
440
441 ORE.emit(RemarkBuilder: [&]() {
442 using namespace ore;
443 return OptimizationRemark(DEBUG_TYPE, "memopt-opt", MO.I)
444 << "optimized " << NV("Memop", MO.getName(TLI)) << " with count "
445 << NV("Count", SumForOpt) << " out of " << NV("Total", TotalCount)
446 << " for " << NV("Versions", Version) << " versions";
447 });
448
449 return true;
450}
451} // namespace
452
453static bool PGOMemOPSizeOptImpl(Function &F, BlockFrequencyInfo &BFI,
454 OptimizationRemarkEmitter &ORE,
455 DominatorTree *DT, TargetLibraryInfo &TLI) {
456 if (DisableMemOPOPT)
457 return false;
458
459 if (F.hasFnAttribute(Kind: Attribute::OptimizeForSize))
460 return false;
461 MemOPSizeOpt MemOPSizeOpt(F, BFI, ORE, DT, TLI);
462 MemOPSizeOpt.perform();
463 return MemOPSizeOpt.isChanged();
464}
465
466PreservedAnalyses PGOMemOPSizeOpt::run(Function &F,
467 FunctionAnalysisManager &FAM) {
468 auto &BFI = FAM.getResult<BlockFrequencyAnalysis>(IR&: F);
469 auto &ORE = FAM.getResult<OptimizationRemarkEmitterAnalysis>(IR&: F);
470 auto *DT = FAM.getCachedResult<DominatorTreeAnalysis>(IR&: F);
471 auto &TLI = FAM.getResult<TargetLibraryAnalysis>(IR&: F);
472 bool Changed = PGOMemOPSizeOptImpl(F, BFI, ORE, DT, TLI);
473 if (!Changed)
474 return PreservedAnalyses::all();
475 auto PA = PreservedAnalyses();
476 PA.preserve<DominatorTreeAnalysis>();
477 return PA;
478}
479