1//===- IRSimilarityIdentifier.cpp - Find similarity in a module -----------===//
2//
3// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4// See https://llvm.org/LICENSE.txt for license information.
5// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6//
7//===----------------------------------------------------------------------===//
8//
9// \file
10// Implementation file for the IRSimilarityIdentifier for identifying
11// similarities in IR including the IRInstructionMapper.
12//
13//===----------------------------------------------------------------------===//
14
15#include "llvm/Analysis/IRSimilarityIdentifier.h"
16#include "llvm/ADT/DenseMap.h"
17#include "llvm/ADT/SetOperations.h"
18#include "llvm/IR/Intrinsics.h"
19#include "llvm/IR/Operator.h"
20#include "llvm/IR/User.h"
21#include "llvm/InitializePasses.h"
22#include "llvm/Support/SuffixTree.h"
23
24using namespace llvm;
25using namespace IRSimilarity;
26
27namespace llvm {
28cl::opt<bool>
29 DisableBranches("no-ir-sim-branch-matching", cl::init(Val: false),
30 cl::ReallyHidden,
31 cl::desc("disable similarity matching, and outlining, "
32 "across branches for debugging purposes."));
33
34cl::opt<bool>
35 DisableIndirectCalls("no-ir-sim-indirect-calls", cl::init(Val: false),
36 cl::ReallyHidden,
37 cl::desc("disable outlining indirect calls."));
38
39static cl::opt<bool>
40 MatchCallsByName("ir-sim-calls-by-name", cl::init(Val: false), cl::ReallyHidden,
41 cl::desc("only allow matching call instructions if the "
42 "name and type signature match."));
43
44cl::opt<bool>
45 DisableIntrinsics("no-ir-sim-intrinsics", cl::init(Val: false), cl::ReallyHidden,
46 cl::desc("Don't match or outline intrinsics"));
47} // namespace llvm
48
49IRInstructionData::IRInstructionData(Instruction &I, bool Legality,
50 IRInstructionDataList &IDList)
51 : Inst(&I), Legal(Legality), IDL(&IDList) {
52 initializeInstruction();
53}
54
55void IRInstructionData::initializeInstruction() {
56 // We check for whether we have a comparison instruction. If it is, we
57 // find the "less than" version of the predicate for consistency for
58 // comparison instructions throught the program.
59 if (CmpInst *C = dyn_cast<CmpInst>(Val: Inst)) {
60 CmpInst::Predicate Predicate = predicateForConsistency(CI: C);
61 if (Predicate != C->getPredicate())
62 RevisedPredicate = Predicate;
63 }
64
65 // Here we collect the operands and their types for determining whether
66 // the structure of the operand use matches between two different candidates.
67 for (Use &OI : Inst->operands()) {
68 if (isa<CmpInst>(Val: Inst) && RevisedPredicate) {
69 // If we have a CmpInst where the predicate is reversed, it means the
70 // operands must be reversed as well.
71 OperVals.insert(I: OperVals.begin(), Elt: OI.get());
72 continue;
73 }
74
75 OperVals.push_back(Elt: OI.get());
76 }
77
78 // We capture the incoming BasicBlocks as values as well as the incoming
79 // Values in order to check for structural similarity.
80 if (PHINode *PN = dyn_cast<PHINode>(Val: Inst))
81 llvm::append_range(C&: OperVals, R: PN->blocks());
82}
83
84IRInstructionData::IRInstructionData(IRInstructionDataList &IDList)
85 : IDL(&IDList) {}
86
87void IRInstructionData::setBranchSuccessors(
88 DenseMap<BasicBlock *, unsigned> &BasicBlockToInteger) {
89 assert((isa<UncondBrInst, CondBrInst>(Inst)) && "Instruction must be branch");
90
91 DenseMap<BasicBlock *, unsigned>::iterator BBNumIt;
92
93 BBNumIt = BasicBlockToInteger.find(Val: Inst->getParent());
94 assert(BBNumIt != BasicBlockToInteger.end() &&
95 "Could not find location for BasicBlock!");
96
97 int CurrentBlockNumber = static_cast<int>(BBNumIt->second);
98
99 for (Value *V : getBlockOperVals()) {
100 BasicBlock *Successor = cast<BasicBlock>(Val: V);
101 BBNumIt = BasicBlockToInteger.find(Val: Successor);
102 assert(BBNumIt != BasicBlockToInteger.end() &&
103 "Could not find number for BasicBlock!");
104 int OtherBlockNumber = static_cast<int>(BBNumIt->second);
105
106 int Relative = OtherBlockNumber - CurrentBlockNumber;
107 RelativeBlockLocations.push_back(Elt: Relative);
108 }
109}
110
111ArrayRef<Value *> IRInstructionData::getBlockOperVals() {
112 if (isa<UncondBrInst>(Val: Inst))
113 return OperVals;
114 if (isa<CondBrInst>(Val: Inst))
115 return ArrayRef<Value *>(OperVals).drop_front(N: 1);
116
117 if (PHINode *PN = dyn_cast<PHINode>(Val: Inst))
118 return ArrayRef<Value *>(
119 std::next(x: OperVals.begin(), n: PN->getNumIncomingValues()),
120 OperVals.end()
121 );
122
123 llvm_unreachable("Instruction must be branch or PHINode");
124}
125
126void IRInstructionData::setCalleeName(bool MatchByName) {
127 CallInst *CI = dyn_cast<CallInst>(Val: Inst);
128 assert(CI && "Instruction must be call");
129
130 CalleeName = "";
131 if (IntrinsicInst *II = dyn_cast<IntrinsicInst>(Val: Inst)) {
132 // To hash intrinsics, we use the opcode, and types like the other
133 // instructions, but also, the Intrinsic ID, and the Name of the
134 // intrinsic.
135 Intrinsic::ID IntrinsicID = II->getIntrinsicID();
136 FunctionType *FT = II->getFunctionType();
137 // If there is an overloaded name, we have to use the complex version
138 // of getName to get the entire string.
139 if (Intrinsic::isOverloaded(id: IntrinsicID))
140 CalleeName =
141 Intrinsic::getName(Id: IntrinsicID, Tys: FT->params(), M: II->getModule(), FT);
142 // If there is not an overloaded name, we only need to use this version.
143 else
144 CalleeName = Intrinsic::getName(id: IntrinsicID).str();
145
146 return;
147 }
148
149 if (!CI->isIndirectCall() && MatchByName)
150 CalleeName = CI->getCalledFunction()->getName().str();
151}
152
153void IRInstructionData::setPHIPredecessors(
154 DenseMap<BasicBlock *, unsigned> &BasicBlockToInteger) {
155 assert(isa<PHINode>(Inst) && "Instruction must be phi node");
156
157 PHINode *PN = cast<PHINode>(Val: Inst);
158 DenseMap<BasicBlock *, unsigned>::iterator BBNumIt;
159
160 BBNumIt = BasicBlockToInteger.find(Val: PN->getParent());
161 assert(BBNumIt != BasicBlockToInteger.end() &&
162 "Could not find location for BasicBlock!");
163
164 int CurrentBlockNumber = static_cast<int>(BBNumIt->second);
165
166 // Convert the incoming blocks of the PHINode to an integer value, based on
167 // the relative distances between the current block and the incoming block.
168 for (unsigned Idx = 0; Idx < PN->getNumIncomingValues(); Idx++) {
169 BasicBlock *Incoming = PN->getIncomingBlock(i: Idx);
170 BBNumIt = BasicBlockToInteger.find(Val: Incoming);
171 assert(BBNumIt != BasicBlockToInteger.end() &&
172 "Could not find number for BasicBlock!");
173 int OtherBlockNumber = static_cast<int>(BBNumIt->second);
174
175 int Relative = OtherBlockNumber - CurrentBlockNumber;
176 RelativeBlockLocations.push_back(Elt: Relative);
177 }
178}
179
180CmpInst::Predicate IRInstructionData::predicateForConsistency(CmpInst *CI) {
181 switch (CI->getPredicate()) {
182 case CmpInst::FCMP_OGT:
183 case CmpInst::FCMP_UGT:
184 case CmpInst::FCMP_OGE:
185 case CmpInst::FCMP_UGE:
186 case CmpInst::ICMP_SGT:
187 case CmpInst::ICMP_UGT:
188 case CmpInst::ICMP_SGE:
189 case CmpInst::ICMP_UGE:
190 return CI->getSwappedPredicate();
191 default:
192 return CI->getPredicate();
193 }
194}
195
196CmpInst::Predicate IRInstructionData::getPredicate() const {
197 assert(isa<CmpInst>(Inst) &&
198 "Can only get a predicate from a compare instruction");
199
200 if (RevisedPredicate)
201 return *RevisedPredicate;
202
203 return cast<CmpInst>(Val: Inst)->getPredicate();
204}
205
206StringRef IRInstructionData::getCalleeName() const {
207 assert(isa<CallInst>(Inst) &&
208 "Can only get a name from a call instruction");
209
210 assert(CalleeName && "CalleeName has not been set");
211
212 return *CalleeName;
213}
214
215bool IRSimilarity::isClose(const IRInstructionData &A,
216 const IRInstructionData &B) {
217
218 if (!A.Legal || !B.Legal)
219 return false;
220
221 // Check if we are performing the same sort of operation on the same types
222 // but not on the same values.
223 if (!A.Inst->isSameOperationAs(I: B.Inst)) {
224 // If there is a predicate, this means that either there is a swapped
225 // predicate, or that the types are different, we want to make sure that
226 // the predicates are equivalent via swapping.
227 if (isa<CmpInst>(Val: A.Inst) && isa<CmpInst>(Val: B.Inst)) {
228
229 if (A.getPredicate() != B.getPredicate())
230 return false;
231
232 // If the predicates are the same via swap, make sure that the types are
233 // still the same.
234 auto ZippedTypes = zip(t: A.OperVals, u: B.OperVals);
235
236 return all_of(
237 Range&: ZippedTypes, P: [](std::tuple<llvm::Value *, llvm::Value *> R) {
238 return std::get<0>(t&: R)->getType() == std::get<1>(t&: R)->getType();
239 });
240 }
241
242 return false;
243 }
244
245 // Since any GEP Instruction operands after the first operand cannot be
246 // defined by a register, we must make sure that the operands after the first
247 // are the same in the two instructions
248 if (auto *GEP = dyn_cast<GetElementPtrInst>(Val: A.Inst)) {
249 auto *OtherGEP = cast<GetElementPtrInst>(Val: B.Inst);
250
251 // If the instructions do not have the same inbounds restrictions, we do
252 // not consider them the same.
253 if (GEP->isInBounds() != OtherGEP->isInBounds())
254 return false;
255
256 auto ZippedOperands = zip(t: GEP->indices(), u: OtherGEP->indices());
257
258 // We increment here since we do not care about the first instruction,
259 // we only care about the following operands since they must be the
260 // exact same to be considered similar.
261 return all_of(Range: drop_begin(RangeOrContainer&: ZippedOperands),
262 P: [](std::tuple<llvm::Use &, llvm::Use &> R) {
263 return std::get<0>(t&: R) == std::get<1>(t&: R);
264 });
265 }
266
267 // If the instructions are functions calls, we make sure that the function
268 // name is the same. We already know that the types are since is
269 // isSameOperationAs is true.
270 if (isa<CallInst>(Val: A.Inst) && isa<CallInst>(Val: B.Inst)) {
271 if (A.getCalleeName() != B.getCalleeName())
272 return false;
273 }
274
275 if (isa<UncondBrInst, CondBrInst>(Val: A.Inst) &&
276 isa<UncondBrInst, CondBrInst>(Val: B.Inst) &&
277 A.RelativeBlockLocations.size() != B.RelativeBlockLocations.size())
278 return false;
279
280 return true;
281}
282
283// TODO: This is the same as the MachineOutliner, and should be consolidated
284// into the same interface.
285void IRInstructionMapper::convertToUnsignedVec(
286 BasicBlock &BB, std::vector<IRInstructionData *> &InstrList,
287 std::vector<unsigned> &IntegerMapping) {
288 BasicBlock::iterator It = BB.begin();
289
290 std::vector<unsigned> IntegerMappingForBB;
291 std::vector<IRInstructionData *> InstrListForBB;
292
293 for (BasicBlock::iterator Et = BB.end(); It != Et; ++It) {
294 switch (InstClassifier.visit(I&: *It)) {
295 case InstrType::Legal:
296 mapToLegalUnsigned(It, IntegerMappingForBB, InstrListForBB);
297 break;
298 case InstrType::Illegal:
299 mapToIllegalUnsigned(It, IntegerMappingForBB, InstrListForBB);
300 break;
301 case InstrType::Invisible:
302 AddedIllegalLastTime = false;
303 break;
304 }
305 }
306
307 if (AddedIllegalLastTime)
308 mapToIllegalUnsigned(It, IntegerMappingForBB, InstrListForBB, End: true);
309 for (IRInstructionData *ID : InstrListForBB)
310 this->IDL->push_back(Node&: *ID);
311 llvm::append_range(C&: InstrList, R&: InstrListForBB);
312 llvm::append_range(C&: IntegerMapping, R&: IntegerMappingForBB);
313}
314
315// TODO: This is the same as the MachineOutliner, and should be consolidated
316// into the same interface.
317unsigned IRInstructionMapper::mapToLegalUnsigned(
318 BasicBlock::iterator &It, std::vector<unsigned> &IntegerMappingForBB,
319 std::vector<IRInstructionData *> &InstrListForBB) {
320 // We added something legal, so we should unset the AddedLegalLastTime
321 // flag.
322 AddedIllegalLastTime = false;
323
324 // If we have at least two adjacent legal instructions (which may have
325 // invisible instructions in between), remember that.
326 if (CanCombineWithPrevInstr)
327 HaveLegalRange = true;
328 CanCombineWithPrevInstr = true;
329
330 // Get the integer for this instruction or give it the current
331 // LegalInstrNumber.
332 IRInstructionData *ID = allocateIRInstructionData(I&: *It, Legality: true, IDL&: *IDL);
333 InstrListForBB.push_back(x: ID);
334
335 if (isa<UncondBrInst, CondBrInst>(Val: *It))
336 ID->setBranchSuccessors(BasicBlockToInteger);
337
338 if (isa<CallInst>(Val: *It))
339 ID->setCalleeName(EnableMatchCallsByName);
340
341 if (isa<PHINode>(Val: *It))
342 ID->setPHIPredecessors(BasicBlockToInteger);
343
344 // Add to the instruction list
345 bool WasInserted;
346 DenseMap<IRInstructionData *, unsigned, IRInstructionDataTraits>::iterator
347 ResultIt;
348 std::tie(args&: ResultIt, args&: WasInserted) =
349 InstructionIntegerMap.insert(KV: std::make_pair(x&: ID, y&: LegalInstrNumber));
350 unsigned INumber = ResultIt->second;
351
352 // There was an insertion.
353 if (WasInserted)
354 LegalInstrNumber++;
355
356 IntegerMappingForBB.push_back(x: INumber);
357
358 // Make sure we don't overflow or use any integers reserved by the DenseMap.
359 assert(LegalInstrNumber < IllegalInstrNumber &&
360 "Instruction mapping overflow!");
361
362 assert(LegalInstrNumber != DenseMapInfo<unsigned>::getEmptyKey() &&
363 "Tried to assign DenseMap tombstone or empty key to instruction.");
364 assert(LegalInstrNumber != DenseMapInfo<unsigned>::getTombstoneKey() &&
365 "Tried to assign DenseMap tombstone or empty key to instruction.");
366
367 return INumber;
368}
369
370IRInstructionData *
371IRInstructionMapper::allocateIRInstructionData(Instruction &I, bool Legality,
372 IRInstructionDataList &IDL) {
373 return new (InstDataAllocator->Allocate()) IRInstructionData(I, Legality, IDL);
374}
375
376IRInstructionData *
377IRInstructionMapper::allocateIRInstructionData(IRInstructionDataList &IDL) {
378 return new (InstDataAllocator->Allocate()) IRInstructionData(IDL);
379}
380
381IRInstructionDataList *
382IRInstructionMapper::allocateIRInstructionDataList() {
383 return new (IDLAllocator->Allocate()) IRInstructionDataList();
384}
385
386// TODO: This is the same as the MachineOutliner, and should be consolidated
387// into the same interface.
388unsigned IRInstructionMapper::mapToIllegalUnsigned(
389 BasicBlock::iterator &It, std::vector<unsigned> &IntegerMappingForBB,
390 std::vector<IRInstructionData *> &InstrListForBB, bool End) {
391 // Can't combine an illegal instruction. Set the flag.
392 CanCombineWithPrevInstr = false;
393
394 // Only add one illegal number per range of legal numbers.
395 if (AddedIllegalLastTime)
396 return IllegalInstrNumber;
397
398 IRInstructionData *ID = nullptr;
399 if (!End)
400 ID = allocateIRInstructionData(I&: *It, Legality: false, IDL&: *IDL);
401 else
402 ID = allocateIRInstructionData(IDL&: *IDL);
403 InstrListForBB.push_back(x: ID);
404
405 // Remember that we added an illegal number last time.
406 AddedIllegalLastTime = true;
407 unsigned INumber = IllegalInstrNumber;
408 IntegerMappingForBB.push_back(x: IllegalInstrNumber--);
409
410 assert(LegalInstrNumber < IllegalInstrNumber &&
411 "Instruction mapping overflow!");
412
413 assert(IllegalInstrNumber != DenseMapInfo<unsigned>::getEmptyKey() &&
414 "IllegalInstrNumber cannot be DenseMap tombstone or empty key!");
415
416 assert(IllegalInstrNumber != DenseMapInfo<unsigned>::getTombstoneKey() &&
417 "IllegalInstrNumber cannot be DenseMap tombstone or empty key!");
418
419 return INumber;
420}
421
422IRSimilarityCandidate::IRSimilarityCandidate(unsigned StartIdx, unsigned Len,
423 IRInstructionData *FirstInstIt,
424 IRInstructionData *LastInstIt)
425 : StartIdx(StartIdx), Len(Len) {
426
427 assert(FirstInstIt != nullptr && "Instruction is nullptr!");
428 assert(LastInstIt != nullptr && "Instruction is nullptr!");
429 assert(StartIdx + Len > StartIdx &&
430 "Overflow for IRSimilarityCandidate range?");
431 assert(Len - 1 == static_cast<unsigned>(std::distance(
432 iterator(FirstInstIt), iterator(LastInstIt))) &&
433 "Length of the first and last IRInstructionData do not match the "
434 "given length");
435
436 // We iterate over the given instructions, and map each unique value
437 // to a unique number in the IRSimilarityCandidate ValueToNumber and
438 // NumberToValue maps. A constant get its own value globally, the individual
439 // uses of the constants are not considered to be unique.
440 //
441 // IR: Mapping Added:
442 // %add1 = add i32 %a, c1 %add1 -> 3, %a -> 1, c1 -> 2
443 // %add2 = add i32 %a, %1 %add2 -> 4
444 // %add3 = add i32 c2, c1 %add3 -> 6, c2 -> 5
445 //
446 // when replace with global values, starting from 1, would be
447 //
448 // 3 = add i32 1, 2
449 // 4 = add i32 1, 3
450 // 6 = add i32 5, 2
451 unsigned LocalValNumber = 1;
452 IRInstructionDataList::iterator ID = iterator(*FirstInstIt);
453 for (unsigned Loc = StartIdx; Loc < StartIdx + Len; Loc++, ID++) {
454 // Map the operand values to an unsigned integer if it does not already
455 // have an unsigned integer assigned to it.
456 for (Value *Arg : ID->OperVals)
457 if (ValueToNumber.try_emplace(Key: Arg, Args&: LocalValNumber).second) {
458 NumberToValue.try_emplace(Key: LocalValNumber, Args&: Arg);
459 LocalValNumber++;
460 }
461
462 // Mapping the instructions to an unsigned integer if it is not already
463 // exist in the mapping.
464 if (ValueToNumber.try_emplace(Key: ID->Inst, Args&: LocalValNumber).second) {
465 NumberToValue.try_emplace(Key: LocalValNumber, Args&: ID->Inst);
466 LocalValNumber++;
467 }
468 }
469
470 // Setting the first and last instruction data pointers for the candidate. If
471 // we got through the entire for loop without hitting an assert, we know
472 // that both of these instructions are not nullptrs.
473 FirstInst = FirstInstIt;
474 LastInst = LastInstIt;
475
476 // Add the basic blocks contained in the set into the global value numbering.
477 DenseSet<BasicBlock *> BBSet;
478 getBasicBlocks(BBSet);
479 for (BasicBlock *BB : BBSet) {
480 if (ValueToNumber.try_emplace(Key: BB, Args&: LocalValNumber).second) {
481 NumberToValue.try_emplace(Key: LocalValNumber, Args&: BB);
482 LocalValNumber++;
483 }
484 }
485}
486
487bool IRSimilarityCandidate::isSimilar(const IRSimilarityCandidate &A,
488 const IRSimilarityCandidate &B) {
489 if (A.getLength() != B.getLength())
490 return false;
491
492 auto InstrDataForBoth =
493 zip(t: make_range(x: A.begin(), y: A.end()), u: make_range(x: B.begin(), y: B.end()));
494
495 return all_of(Range&: InstrDataForBoth,
496 P: [](std::tuple<IRInstructionData &, IRInstructionData &> R) {
497 IRInstructionData &A = std::get<0>(t&: R);
498 IRInstructionData &B = std::get<1>(t&: R);
499 if (!A.Legal || !B.Legal)
500 return false;
501 return isClose(A, B);
502 });
503}
504
505/// Determine if one or more of the assigned global value numbers for the
506/// operands in \p TargetValueNumbers is in the current mapping set for operand
507/// numbers in \p SourceOperands. The set of possible corresponding global
508/// value numbers are replaced with the most recent version of compatible
509/// values.
510///
511/// \param [in] SourceValueToNumberMapping - The mapping of a Value to global
512/// value number for the source IRInstructionCandidate.
513/// \param [in, out] CurrentSrcTgtNumberMapping - The current mapping of source
514/// IRSimilarityCandidate global value numbers to a set of possible numbers in
515/// the target.
516/// \param [in] SourceOperands - The operands in the original
517/// IRSimilarityCandidate in the current instruction.
518/// \param [in] TargetValueNumbers - The global value numbers of the operands in
519/// the corresponding Instruction in the other IRSimilarityCandidate.
520/// \returns true if there exists a possible mapping between the source
521/// Instruction operands and the target Instruction operands, and false if not.
522static bool checkNumberingAndReplaceCommutative(
523 const DenseMap<Value *, unsigned> &SourceValueToNumberMapping,
524 DenseMap<unsigned, DenseSet<unsigned>> &CurrentSrcTgtNumberMapping,
525 ArrayRef<Value *> &SourceOperands,
526 DenseSet<unsigned> &TargetValueNumbers){
527
528 DenseMap<unsigned, DenseSet<unsigned>>::iterator ValueMappingIt;
529
530 unsigned ArgVal;
531 bool WasInserted;
532
533 // Iterate over the operands in the source IRSimilarityCandidate to determine
534 // whether there exists an operand in the other IRSimilarityCandidate that
535 // creates a valid mapping of Value to Value between the
536 // IRSimilarityCaniddates.
537 for (Value *V : SourceOperands) {
538 ArgVal = SourceValueToNumberMapping.find(Val: V)->second;
539
540 // Instead of finding a current mapping, we attempt to insert a set.
541 std::tie(args&: ValueMappingIt, args&: WasInserted) = CurrentSrcTgtNumberMapping.insert(
542 KV: std::make_pair(x&: ArgVal, y&: TargetValueNumbers));
543
544 // We need to iterate over the items in other IRSimilarityCandidate's
545 // Instruction to determine whether there is a valid mapping of
546 // Value to Value.
547 DenseSet<unsigned> NewSet;
548 for (unsigned &Curr : ValueMappingIt->second)
549 // If we can find the value in the mapping, we add it to the new set.
550 if (TargetValueNumbers.contains(V: Curr))
551 NewSet.insert(V: Curr);
552
553 // If we could not find a Value, return 0.
554 if (NewSet.empty())
555 return false;
556
557 // Otherwise replace the old mapping with the newly constructed one.
558 if (NewSet.size() != ValueMappingIt->second.size())
559 ValueMappingIt->second.swap(RHS&: NewSet);
560
561 // We have reached no conclusions about the mapping, and cannot remove
562 // any items from the other operands, so we move to check the next operand.
563 if (ValueMappingIt->second.size() != 1)
564 continue;
565
566 unsigned ValToRemove = *ValueMappingIt->second.begin();
567 // When there is only one item left in the mapping for and operand, remove
568 // the value from the other operands. If it results in there being no
569 // mapping, return false, it means the mapping is wrong
570 for (Value *InnerV : SourceOperands) {
571 if (V == InnerV)
572 continue;
573
574 unsigned InnerVal = SourceValueToNumberMapping.find(Val: InnerV)->second;
575 ValueMappingIt = CurrentSrcTgtNumberMapping.find(Val: InnerVal);
576 if (ValueMappingIt == CurrentSrcTgtNumberMapping.end())
577 continue;
578
579 ValueMappingIt->second.erase(V: ValToRemove);
580 if (ValueMappingIt->second.empty())
581 return false;
582 }
583 }
584
585 return true;
586}
587
588/// Determine if operand number \p TargetArgVal is in the current mapping set
589/// for operand number \p SourceArgVal.
590///
591/// \param [in, out] CurrentSrcTgtNumberMapping current mapping of global
592/// value numbers from source IRSimilarityCandidate to target
593/// IRSimilarityCandidate.
594/// \param [in] SourceArgVal The global value number for an operand in the
595/// in the original candidate.
596/// \param [in] TargetArgVal The global value number for the corresponding
597/// operand in the other candidate.
598/// \returns True if there exists a mapping and false if not.
599bool checkNumberingAndReplace(
600 DenseMap<unsigned, DenseSet<unsigned>> &CurrentSrcTgtNumberMapping,
601 unsigned SourceArgVal, unsigned TargetArgVal) {
602 // We are given two unsigned integers representing the global values of
603 // the operands in different IRSimilarityCandidates and a current mapping
604 // between the two.
605 //
606 // Source Operand GVN: 1
607 // Target Operand GVN: 2
608 // CurrentMapping: {1: {1, 2}}
609 //
610 // Since we have mapping, and the target operand is contained in the set, we
611 // update it to:
612 // CurrentMapping: {1: {2}}
613 // and can return true. But, if the mapping was
614 // CurrentMapping: {1: {3}}
615 // we would return false.
616
617 bool WasInserted;
618 DenseMap<unsigned, DenseSet<unsigned>>::iterator Val;
619
620 std::tie(args&: Val, args&: WasInserted) = CurrentSrcTgtNumberMapping.insert(
621 KV: std::make_pair(x&: SourceArgVal, y: DenseSet<unsigned>({TargetArgVal})));
622
623 // If we created a new mapping, then we are done.
624 if (WasInserted)
625 return true;
626
627 // If there is more than one option in the mapping set, and the target value
628 // is included in the mapping set replace that set with one that only includes
629 // the target value, as it is the only valid mapping via the non commutative
630 // instruction.
631
632 DenseSet<unsigned> &TargetSet = Val->second;
633 if (TargetSet.size() > 1 && TargetSet.contains(V: TargetArgVal)) {
634 TargetSet.clear();
635 TargetSet.insert(V: TargetArgVal);
636 return true;
637 }
638
639 // Return true if we can find the value in the set.
640 return TargetSet.contains(V: TargetArgVal);
641}
642
643bool IRSimilarityCandidate::compareNonCommutativeOperandMapping(
644 OperandMapping A, OperandMapping B) {
645 // Iterators to keep track of where we are in the operands for each
646 // Instruction.
647 ArrayRef<Value *>::iterator VItA = A.OperVals.begin();
648 ArrayRef<Value *>::iterator VItB = B.OperVals.begin();
649 unsigned OperandLength = A.OperVals.size();
650
651 // For each operand, get the value numbering and ensure it is consistent.
652 for (unsigned Idx = 0; Idx < OperandLength; Idx++, VItA++, VItB++) {
653 unsigned OperValA = A.IRSC.ValueToNumber.find(Val: *VItA)->second;
654 unsigned OperValB = B.IRSC.ValueToNumber.find(Val: *VItB)->second;
655
656 // Attempt to add a set with only the target value. If there is no mapping
657 // we can create it here.
658 //
659 // For an instruction like a subtraction:
660 // IRSimilarityCandidateA: IRSimilarityCandidateB:
661 // %resultA = sub %a, %b %resultB = sub %d, %e
662 //
663 // We map %a -> %d and %b -> %e.
664 //
665 // And check to see whether their mapping is consistent in
666 // checkNumberingAndReplace.
667
668 if (!checkNumberingAndReplace(CurrentSrcTgtNumberMapping&: A.ValueNumberMapping, SourceArgVal: OperValA, TargetArgVal: OperValB))
669 return false;
670
671 if (!checkNumberingAndReplace(CurrentSrcTgtNumberMapping&: B.ValueNumberMapping, SourceArgVal: OperValB, TargetArgVal: OperValA))
672 return false;
673 }
674 return true;
675}
676
677bool IRSimilarityCandidate::compareCommutativeOperandMapping(
678 OperandMapping A, OperandMapping B) {
679 DenseSet<unsigned> ValueNumbersA;
680 DenseSet<unsigned> ValueNumbersB;
681
682 ArrayRef<Value *>::iterator VItA = A.OperVals.begin();
683 ArrayRef<Value *>::iterator VItB = B.OperVals.begin();
684 unsigned OperandLength = A.OperVals.size();
685
686 // Find the value number sets for the operands.
687 for (unsigned Idx = 0; Idx < OperandLength;
688 Idx++, VItA++, VItB++) {
689 ValueNumbersA.insert(V: A.IRSC.ValueToNumber.find(Val: *VItA)->second);
690 ValueNumbersB.insert(V: B.IRSC.ValueToNumber.find(Val: *VItB)->second);
691 }
692
693 // Iterate over the operands in the first IRSimilarityCandidate and make sure
694 // there exists a possible mapping with the operands in the second
695 // IRSimilarityCandidate.
696 if (!checkNumberingAndReplaceCommutative(SourceValueToNumberMapping: A.IRSC.ValueToNumber,
697 CurrentSrcTgtNumberMapping&: A.ValueNumberMapping, SourceOperands&: A.OperVals,
698 TargetValueNumbers&: ValueNumbersB))
699 return false;
700
701 // Iterate over the operands in the second IRSimilarityCandidate and make sure
702 // there exists a possible mapping with the operands in the first
703 // IRSimilarityCandidate.
704 if (!checkNumberingAndReplaceCommutative(SourceValueToNumberMapping: B.IRSC.ValueToNumber,
705 CurrentSrcTgtNumberMapping&: B.ValueNumberMapping, SourceOperands&: B.OperVals,
706 TargetValueNumbers&: ValueNumbersA))
707 return false;
708
709 return true;
710}
711
712bool IRSimilarityCandidate::compareAssignmentMapping(
713 const unsigned InstValA, const unsigned &InstValB,
714 DenseMap<unsigned, DenseSet<unsigned>> &ValueNumberMappingA,
715 DenseMap<unsigned, DenseSet<unsigned>> &ValueNumberMappingB) {
716 DenseMap<unsigned, DenseSet<unsigned>>::iterator ValueMappingIt;
717 bool WasInserted;
718 std::tie(args&: ValueMappingIt, args&: WasInserted) = ValueNumberMappingA.insert(
719 KV: std::make_pair(x: InstValA, y: DenseSet<unsigned>({InstValB})));
720 if (!WasInserted && !ValueMappingIt->second.contains(V: InstValB))
721 return false;
722 else if (ValueMappingIt->second.size() != 1) {
723 for (unsigned OtherVal : ValueMappingIt->second) {
724 if (OtherVal == InstValB)
725 continue;
726 auto OtherValIt = ValueNumberMappingA.find(Val: OtherVal);
727 if (OtherValIt == ValueNumberMappingA.end())
728 continue;
729 OtherValIt->second.erase(V: InstValA);
730 }
731 ValueNumberMappingA.erase(I: ValueMappingIt);
732 std::tie(args&: ValueMappingIt, args&: WasInserted) = ValueNumberMappingA.insert(
733 KV: std::make_pair(x: InstValA, y: DenseSet<unsigned>({InstValB})));
734 }
735
736 return true;
737}
738
739bool IRSimilarityCandidate::checkRelativeLocations(RelativeLocMapping A,
740 RelativeLocMapping B) {
741 // Get the basic blocks the label refers to.
742 BasicBlock *ABB = cast<BasicBlock>(Val: A.OperVal);
743 BasicBlock *BBB = cast<BasicBlock>(Val: B.OperVal);
744
745 // Get the basic blocks contained in each region.
746 DenseSet<BasicBlock *> BasicBlockA;
747 DenseSet<BasicBlock *> BasicBlockB;
748 A.IRSC.getBasicBlocks(BBSet&: BasicBlockA);
749 B.IRSC.getBasicBlocks(BBSet&: BasicBlockB);
750
751 // Determine if the block is contained in the region.
752 bool AContained = BasicBlockA.contains(V: ABB);
753 bool BContained = BasicBlockB.contains(V: BBB);
754
755 // Both blocks need to be contained in the region, or both need to be outside
756 // the region.
757 if (AContained != BContained)
758 return false;
759
760 // If both are contained, then we need to make sure that the relative
761 // distance to the target blocks are the same.
762 if (AContained)
763 return A.RelativeLocation == B.RelativeLocation;
764 return true;
765}
766
767bool IRSimilarityCandidate::compareStructure(const IRSimilarityCandidate &A,
768 const IRSimilarityCandidate &B) {
769 DenseMap<unsigned, DenseSet<unsigned>> MappingA;
770 DenseMap<unsigned, DenseSet<unsigned>> MappingB;
771 return IRSimilarityCandidate::compareStructure(A, B, ValueNumberMappingA&: MappingA, ValueNumberMappingB&: MappingB);
772}
773
774typedef detail::zippy<detail::zip_shortest, SmallVector<int, 4> &,
775 SmallVector<int, 4> &, ArrayRef<Value *> &,
776 ArrayRef<Value *> &>
777 ZippedRelativeLocationsT;
778
779bool IRSimilarityCandidate::compareStructure(
780 const IRSimilarityCandidate &A, const IRSimilarityCandidate &B,
781 DenseMap<unsigned, DenseSet<unsigned>> &ValueNumberMappingA,
782 DenseMap<unsigned, DenseSet<unsigned>> &ValueNumberMappingB) {
783 if (A.getLength() != B.getLength())
784 return false;
785
786 if (A.ValueToNumber.size() != B.ValueToNumber.size())
787 return false;
788
789 iterator ItA = A.begin();
790 iterator ItB = B.begin();
791
792 // These ValueNumber Mapping sets create a create a mapping between the values
793 // in one candidate to values in the other candidate. If we create a set with
794 // one element, and that same element maps to the original element in the
795 // candidate we have a good mapping.
796
797 // Iterate over the instructions contained in each candidate
798 unsigned SectionLength = A.getStartIdx() + A.getLength();
799 for (unsigned Loc = A.getStartIdx(); Loc < SectionLength;
800 ItA++, ItB++, Loc++) {
801 // Make sure the instructions are similar to one another.
802 if (!isClose(A: *ItA, B: *ItB))
803 return false;
804
805 Instruction *IA = ItA->Inst;
806 Instruction *IB = ItB->Inst;
807
808 if (!ItA->Legal || !ItB->Legal)
809 return false;
810
811 // Get the operand sets for the instructions.
812 ArrayRef<Value *> OperValsA = ItA->OperVals;
813 ArrayRef<Value *> OperValsB = ItB->OperVals;
814
815 unsigned InstValA = A.ValueToNumber.find(Val: IA)->second;
816 unsigned InstValB = B.ValueToNumber.find(Val: IB)->second;
817
818 // Ensure that the mappings for the instructions exists.
819 if (!compareAssignmentMapping(InstValA, InstValB, ValueNumberMappingA,
820 ValueNumberMappingB))
821 return false;
822
823 if (!compareAssignmentMapping(InstValA: InstValB, InstValB: InstValA, ValueNumberMappingA&: ValueNumberMappingB,
824 ValueNumberMappingB&: ValueNumberMappingA))
825 return false;
826
827 // We have different paths for commutative instructions and non-commutative
828 // instructions since commutative instructions could allow multiple mappings
829 // to certain values.
830 if (IA->isCommutative() && !isa<FPMathOperator>(Val: IA) &&
831 !isa<IntrinsicInst>(Val: IA)) {
832 if (!compareCommutativeOperandMapping(
833 A: {.IRSC: A, .OperVals: OperValsA, .ValueNumberMapping: ValueNumberMappingA},
834 B: {.IRSC: B, .OperVals: OperValsB, .ValueNumberMapping: ValueNumberMappingB}))
835 return false;
836 continue;
837 }
838
839 // Handle the non-commutative cases.
840 if (!compareNonCommutativeOperandMapping(
841 A: {.IRSC: A, .OperVals: OperValsA, .ValueNumberMapping: ValueNumberMappingA},
842 B: {.IRSC: B, .OperVals: OperValsB, .ValueNumberMapping: ValueNumberMappingB}))
843 return false;
844
845 // Here we check that between two corresponding instructions,
846 // when referring to a basic block in the same region, the
847 // relative locations are the same. And, that the instructions refer to
848 // basic blocks outside the region in the same corresponding locations.
849
850 // We are able to make the assumption about blocks outside of the region
851 // since the target block labels are considered values and will follow the
852 // same number matching that we defined for the other instructions in the
853 // region. So, at this point, in each location we target a specific block
854 // outside the region, we are targeting a corresponding block in each
855 // analagous location in the region we are comparing to.
856 if (!isa<UncondBrInst, CondBrInst, PHINode>(Val: IA) ||
857 IA->getOpcode() != IB->getOpcode())
858 continue;
859
860 SmallVector<int, 4> &RelBlockLocsA = ItA->RelativeBlockLocations;
861 SmallVector<int, 4> &RelBlockLocsB = ItB->RelativeBlockLocations;
862 ArrayRef<Value *> ABL = ItA->getBlockOperVals();
863 ArrayRef<Value *> BBL = ItB->getBlockOperVals();
864
865 // Check to make sure that the number of operands, and branching locations
866 // between BranchInsts is the same.
867 if (RelBlockLocsA.size() != RelBlockLocsB.size() &&
868 ABL.size() != BBL.size())
869 return false;
870
871 assert(RelBlockLocsA.size() == ABL.size() &&
872 "Block information vectors not the same size.");
873 assert(RelBlockLocsB.size() == BBL.size() &&
874 "Block information vectors not the same size.");
875
876 ZippedRelativeLocationsT ZippedRelativeLocations =
877 zip(t&: RelBlockLocsA, u&: RelBlockLocsB, args&: ABL, args&: BBL);
878 if (any_of(Range&: ZippedRelativeLocations,
879 P: [&A, &B](std::tuple<int, int, Value *, Value *> R) {
880 return !checkRelativeLocations(
881 A: {.IRSC: A, .RelativeLocation: std::get<0>(t&: R), .OperVal: std::get<2>(t&: R)},
882 B: {.IRSC: B, .RelativeLocation: std::get<1>(t&: R), .OperVal: std::get<3>(t&: R)});
883 }))
884 return false;
885 }
886 return true;
887}
888
889bool IRSimilarityCandidate::overlap(const IRSimilarityCandidate &A,
890 const IRSimilarityCandidate &B) {
891 auto DoesOverlap = [](const IRSimilarityCandidate &X,
892 const IRSimilarityCandidate &Y) {
893 // Check:
894 // XXXXXX X starts before Y ends
895 // YYYYYYY Y starts after X starts
896 return X.StartIdx <= Y.getEndIdx() && Y.StartIdx >= X.StartIdx;
897 };
898
899 return DoesOverlap(A, B) || DoesOverlap(B, A);
900}
901
902void IRSimilarityIdentifier::populateMapper(
903 Module &M, std::vector<IRInstructionData *> &InstrList,
904 std::vector<unsigned> &IntegerMapping) {
905
906 std::vector<IRInstructionData *> InstrListForModule;
907 std::vector<unsigned> IntegerMappingForModule;
908 // Iterate over the functions in the module to map each Instruction in each
909 // BasicBlock to an unsigned integer.
910 Mapper.initializeForBBs(M);
911
912 for (Function &F : M) {
913
914 if (F.empty())
915 continue;
916
917 for (BasicBlock &BB : F) {
918
919 // BB has potential to have similarity since it has a size greater than 2
920 // and can therefore match other regions greater than 2. Map it to a list
921 // of unsigned integers.
922 Mapper.convertToUnsignedVec(BB, InstrList&: InstrListForModule,
923 IntegerMapping&: IntegerMappingForModule);
924 }
925
926 BasicBlock::iterator It = F.begin()->end();
927 Mapper.mapToIllegalUnsigned(It, IntegerMappingForBB&: IntegerMappingForModule, InstrListForBB&: InstrListForModule,
928 End: true);
929 if (InstrListForModule.size() > 0)
930 Mapper.IDL->push_back(Node&: *InstrListForModule.back());
931 }
932
933 // Insert the InstrListForModule at the end of the overall InstrList so that
934 // we can have a long InstrList for the entire set of Modules being analyzed.
935 llvm::append_range(C&: InstrList, R&: InstrListForModule);
936 // Do the same as above, but for IntegerMapping.
937 llvm::append_range(C&: IntegerMapping, R&: IntegerMappingForModule);
938}
939
940void IRSimilarityIdentifier::populateMapper(
941 ArrayRef<std::unique_ptr<Module>> &Modules,
942 std::vector<IRInstructionData *> &InstrList,
943 std::vector<unsigned> &IntegerMapping) {
944
945 // Iterate over, and map the instructions in each module.
946 for (const std::unique_ptr<Module> &M : Modules)
947 populateMapper(M&: *M, InstrList, IntegerMapping);
948}
949
950/// From a repeated subsequence, find all the different instances of the
951/// subsequence from the \p InstrList, and create an IRSimilarityCandidate from
952/// the IRInstructionData in subsequence.
953///
954/// \param [in] Mapper - The instruction mapper for basic correctness checks.
955/// \param [in] InstrList - The vector that holds the instruction data.
956/// \param [in] IntegerMapping - The vector that holds the mapped integers.
957/// \param [out] CandsForRepSubstring - The vector to store the generated
958/// IRSimilarityCandidates.
959static void createCandidatesFromSuffixTree(
960 const IRInstructionMapper& Mapper, std::vector<IRInstructionData *> &InstrList,
961 std::vector<unsigned> &IntegerMapping, SuffixTree::RepeatedSubstring &RS,
962 std::vector<IRSimilarityCandidate> &CandsForRepSubstring) {
963
964 unsigned StringLen = RS.Length;
965 if (StringLen < 2)
966 return;
967
968 // Create an IRSimilarityCandidate for instance of this subsequence \p RS.
969 for (const unsigned &StartIdx : RS.StartIndices) {
970 unsigned EndIdx = StartIdx + StringLen - 1;
971
972 // Check that this subsequence does not contain an illegal instruction.
973 bool ContainsIllegal = false;
974 for (unsigned CurrIdx = StartIdx; CurrIdx <= EndIdx; CurrIdx++) {
975 unsigned Key = IntegerMapping[CurrIdx];
976 if (Key > Mapper.IllegalInstrNumber) {
977 ContainsIllegal = true;
978 break;
979 }
980 }
981
982 // If we have an illegal instruction, we should not create an
983 // IRSimilarityCandidate for this region.
984 if (ContainsIllegal)
985 continue;
986
987 // We are getting iterators to the instructions in this region of code
988 // by advancing the start and end indices from the start of the
989 // InstrList.
990 std::vector<IRInstructionData *>::iterator StartIt = InstrList.begin();
991 std::advance(i&: StartIt, n: StartIdx);
992 std::vector<IRInstructionData *>::iterator EndIt = InstrList.begin();
993 std::advance(i&: EndIt, n: EndIdx);
994
995 CandsForRepSubstring.emplace_back(args: StartIdx, args&: StringLen, args&: *StartIt, args&: *EndIt);
996 }
997}
998
999void IRSimilarityCandidate::createCanonicalRelationFrom(
1000 IRSimilarityCandidate &SourceCand,
1001 DenseMap<unsigned, DenseSet<unsigned>> &ToSourceMapping,
1002 DenseMap<unsigned, DenseSet<unsigned>> &FromSourceMapping) {
1003 assert(SourceCand.CanonNumToNumber.size() != 0 &&
1004 "Base canonical relationship is empty!");
1005 assert(SourceCand.NumberToCanonNum.size() != 0 &&
1006 "Base canonical relationship is empty!");
1007
1008 assert(CanonNumToNumber.size() == 0 && "Canonical Relationship is non-empty");
1009 assert(NumberToCanonNum.size() == 0 && "Canonical Relationship is non-empty");
1010
1011 DenseSet<unsigned> UsedGVNs;
1012 // Iterate over the mappings provided from this candidate to SourceCand. We
1013 // are then able to map the GVN in this candidate to the same canonical number
1014 // given to the corresponding GVN in SourceCand.
1015 for (std::pair<unsigned, DenseSet<unsigned>> &GVNMapping : ToSourceMapping) {
1016 unsigned SourceGVN = GVNMapping.first;
1017
1018 assert(GVNMapping.second.size() != 0 && "Possible GVNs is 0!");
1019
1020 unsigned ResultGVN;
1021 // We need special handling if we have more than one potential value. This
1022 // means that there are at least two GVNs that could correspond to this GVN.
1023 // This could lead to potential swapping later on, so we make a decision
1024 // here to ensure a one-to-one mapping.
1025 if (GVNMapping.second.size() > 1) {
1026 bool Found = false;
1027 for (unsigned Val : GVNMapping.second) {
1028 // We make sure the target value number hasn't already been reserved.
1029 if (UsedGVNs.contains(V: Val))
1030 continue;
1031
1032 // We make sure that the opposite mapping is still consistent.
1033 DenseMap<unsigned, DenseSet<unsigned>>::iterator It =
1034 FromSourceMapping.find(Val);
1035
1036 if (!It->second.contains(V: SourceGVN))
1037 continue;
1038
1039 // We pick the first item that satisfies these conditions.
1040 Found = true;
1041 ResultGVN = Val;
1042 break;
1043 }
1044
1045 assert(Found && "Could not find matching value for source GVN");
1046 (void)Found;
1047
1048 } else
1049 ResultGVN = *GVNMapping.second.begin();
1050
1051 // Whatever GVN is found, we mark it as used.
1052 UsedGVNs.insert(V: ResultGVN);
1053
1054 unsigned CanonNum = *SourceCand.getCanonicalNum(N: ResultGVN);
1055 CanonNumToNumber.insert(KV: std::make_pair(x&: CanonNum, y&: SourceGVN));
1056 NumberToCanonNum.insert(KV: std::make_pair(x&: SourceGVN, y&: CanonNum));
1057 }
1058
1059 DenseSet<BasicBlock *> BBSet;
1060 getBasicBlocks(BBSet);
1061 // Find canonical numbers for the BasicBlocks in the current candidate.
1062 // This is done by finding the corresponding value for the first instruction
1063 // in the block in the current candidate, finding the matching value in the
1064 // source candidate. Then by finding the parent of this value, use the
1065 // canonical number of the block in the source candidate for the canonical
1066 // number in the current candidate.
1067 for (BasicBlock *BB : BBSet) {
1068 unsigned BBGVNForCurrCand = ValueToNumber.find(Val: BB)->second;
1069
1070 // We can skip the BasicBlock if the canonical numbering has already been
1071 // found in a separate instruction.
1072 if (NumberToCanonNum.contains(Val: BBGVNForCurrCand))
1073 continue;
1074
1075 // If the basic block is the starting block, then the shared instruction may
1076 // not be the first instruction in the block, it will be the first
1077 // instruction in the similarity region.
1078 Value *FirstOutlineInst = BB == getStartBB()
1079 ? frontInstruction()
1080 : &*BB->instructionsWithoutDebug().begin();
1081
1082 unsigned FirstInstGVN = *getGVN(V: FirstOutlineInst);
1083 unsigned FirstInstCanonNum = *getCanonicalNum(N: FirstInstGVN);
1084 unsigned SourceGVN = *SourceCand.fromCanonicalNum(N: FirstInstCanonNum);
1085 Value *SourceV = *SourceCand.fromGVN(Num: SourceGVN);
1086 BasicBlock *SourceBB = cast<Instruction>(Val: SourceV)->getParent();
1087 unsigned SourceBBGVN = *SourceCand.getGVN(V: SourceBB);
1088 unsigned SourceCanonBBGVN = *SourceCand.getCanonicalNum(N: SourceBBGVN);
1089 CanonNumToNumber.insert(KV: std::make_pair(x&: SourceCanonBBGVN, y&: BBGVNForCurrCand));
1090 NumberToCanonNum.insert(KV: std::make_pair(x&: BBGVNForCurrCand, y&: SourceCanonBBGVN));
1091 }
1092}
1093
1094void IRSimilarityCandidate::createCanonicalRelationFrom(
1095 IRSimilarityCandidate &SourceCand, IRSimilarityCandidate &SourceCandLarge,
1096 IRSimilarityCandidate &TargetCandLarge) {
1097 assert(!SourceCand.CanonNumToNumber.empty() &&
1098 "Canonical Relationship is non-empty");
1099 assert(!SourceCand.NumberToCanonNum.empty() &&
1100 "Canonical Relationship is non-empty");
1101
1102 assert(!SourceCandLarge.CanonNumToNumber.empty() &&
1103 "Canonical Relationship is non-empty");
1104 assert(!SourceCandLarge.NumberToCanonNum.empty() &&
1105 "Canonical Relationship is non-empty");
1106
1107 assert(!TargetCandLarge.CanonNumToNumber.empty() &&
1108 "Canonical Relationship is non-empty");
1109 assert(!TargetCandLarge.NumberToCanonNum.empty() &&
1110 "Canonical Relationship is non-empty");
1111
1112 assert(CanonNumToNumber.empty() && "Canonical Relationship is non-empty");
1113 assert(NumberToCanonNum.empty() && "Canonical Relationship is non-empty");
1114
1115 // We're going to use the larger candidates as a "bridge" to create the
1116 // canonical number for the target candidate since we have idetified two
1117 // candidates as subsequences of larger sequences, and therefore must be
1118 // structurally similar.
1119 for (std::pair<Value *, unsigned> &ValueNumPair : ValueToNumber) {
1120 Value *CurrVal = ValueNumPair.first;
1121 unsigned TargetCandGVN = ValueNumPair.second;
1122
1123 // Find the numbering in the large candidate that surrounds the
1124 // current candidate.
1125 std::optional<unsigned> OLargeTargetGVN = TargetCandLarge.getGVN(V: CurrVal);
1126 assert(OLargeTargetGVN.has_value() && "GVN not found for Value");
1127
1128 // Get the canonical numbering in the large target candidate.
1129 std::optional<unsigned> OTargetCandCanon =
1130 TargetCandLarge.getCanonicalNum(N: OLargeTargetGVN.value());
1131 assert(OTargetCandCanon.has_value() &&
1132 "Canononical Number not found for GVN");
1133
1134 // Get the GVN in the large source candidate from the canonical numbering.
1135 std::optional<unsigned> OLargeSourceGVN =
1136 SourceCandLarge.fromCanonicalNum(N: OTargetCandCanon.value());
1137 assert(OLargeSourceGVN.has_value() &&
1138 "GVN Number not found for Canonical Number");
1139
1140 // Get the Value from the GVN in the large source candidate.
1141 std::optional<Value *> OLargeSourceV =
1142 SourceCandLarge.fromGVN(Num: OLargeSourceGVN.value());
1143 assert(OLargeSourceV.has_value() && "Value not found for GVN");
1144
1145 // Get the GVN number for the Value in the source candidate.
1146 std::optional<unsigned> OSourceGVN =
1147 SourceCand.getGVN(V: OLargeSourceV.value());
1148 assert(OSourceGVN.has_value() && "GVN Number not found for Value");
1149
1150 // Get the canonical numbering from the GVN/
1151 std::optional<unsigned> OSourceCanon =
1152 SourceCand.getCanonicalNum(N: OSourceGVN.value());
1153 assert(OSourceCanon.has_value() && "Canon Number not found for GVN");
1154
1155 // Insert the canonical numbering and GVN pair into their respective
1156 // mappings.
1157 CanonNumToNumber.insert(
1158 KV: std::make_pair(x&: OSourceCanon.value(), y&: TargetCandGVN));
1159 NumberToCanonNum.insert(
1160 KV: std::make_pair(x&: TargetCandGVN, y&: OSourceCanon.value()));
1161 }
1162}
1163
1164void IRSimilarityCandidate::createCanonicalMappingFor(
1165 IRSimilarityCandidate &CurrCand) {
1166 assert(CurrCand.CanonNumToNumber.size() == 0 &&
1167 "Canonical Relationship is non-empty");
1168 assert(CurrCand.NumberToCanonNum.size() == 0 &&
1169 "Canonical Relationship is non-empty");
1170
1171 unsigned CanonNum = 0;
1172 // Iterate over the value numbers found, the order does not matter in this
1173 // case.
1174 for (std::pair<unsigned, Value *> &NumToVal : CurrCand.NumberToValue) {
1175 CurrCand.NumberToCanonNum.insert(KV: std::make_pair(x&: NumToVal.first, y&: CanonNum));
1176 CurrCand.CanonNumToNumber.insert(KV: std::make_pair(x&: CanonNum, y&: NumToVal.first));
1177 CanonNum++;
1178 }
1179}
1180
1181/// Look for larger IRSimilarityCandidates From the previously matched
1182/// IRSimilarityCandidates that fully contain \p CandA or \p CandB. If there is
1183/// an overlap, return a pair of structurally similar, larger
1184/// IRSimilarityCandidates.
1185///
1186/// \param [in] CandA - The first candidate we are trying to determine the
1187/// structure of.
1188/// \param [in] CandB - The second candidate we are trying to determine the
1189/// structure of.
1190/// \param [in] IndexToIncludedCand - Mapping of index of the an instruction in
1191/// a circuit to the IRSimilarityCandidates that include this instruction.
1192/// \param [in] CandToOverallGroup - Mapping of IRSimilarityCandidate to a
1193/// number representing the structural group assigned to it.
1194static std::optional<
1195 std::pair<IRSimilarityCandidate *, IRSimilarityCandidate *>>
1196CheckLargerCands(
1197 IRSimilarityCandidate &CandA, IRSimilarityCandidate &CandB,
1198 DenseMap<unsigned, DenseSet<IRSimilarityCandidate *>> &IndexToIncludedCand,
1199 DenseMap<IRSimilarityCandidate *, unsigned> &CandToGroup) {
1200 DenseMap<unsigned, IRSimilarityCandidate *> IncludedGroupAndCandA;
1201 DenseMap<unsigned, IRSimilarityCandidate *> IncludedGroupAndCandB;
1202 DenseSet<unsigned> IncludedGroupsA;
1203 DenseSet<unsigned> IncludedGroupsB;
1204
1205 // Find the overall similarity group numbers that fully contain the candidate,
1206 // and record the larger candidate for each group.
1207 auto IdxToCandidateIt = IndexToIncludedCand.find(Val: CandA.getStartIdx());
1208 std::optional<std::pair<IRSimilarityCandidate *, IRSimilarityCandidate *>>
1209 Result;
1210
1211 unsigned CandAStart = CandA.getStartIdx();
1212 unsigned CandAEnd = CandA.getEndIdx();
1213 unsigned CandBStart = CandB.getStartIdx();
1214 unsigned CandBEnd = CandB.getEndIdx();
1215 if (IdxToCandidateIt == IndexToIncludedCand.end())
1216 return Result;
1217 for (IRSimilarityCandidate *MatchedCand : IdxToCandidateIt->second) {
1218 if (MatchedCand->getStartIdx() > CandAStart ||
1219 (MatchedCand->getEndIdx() < CandAEnd))
1220 continue;
1221 unsigned GroupNum = CandToGroup.find(Val: MatchedCand)->second;
1222 IncludedGroupAndCandA.insert(KV: std::make_pair(x&: GroupNum, y&: MatchedCand));
1223 IncludedGroupsA.insert(V: GroupNum);
1224 }
1225
1226 // Find the overall similarity group numbers that fully contain the next
1227 // candidate, and record the larger candidate for each group.
1228 IdxToCandidateIt = IndexToIncludedCand.find(Val: CandBStart);
1229 if (IdxToCandidateIt == IndexToIncludedCand.end())
1230 return Result;
1231 for (IRSimilarityCandidate *MatchedCand : IdxToCandidateIt->second) {
1232 if (MatchedCand->getStartIdx() > CandBStart ||
1233 MatchedCand->getEndIdx() < CandBEnd)
1234 continue;
1235 unsigned GroupNum = CandToGroup.find(Val: MatchedCand)->second;
1236 IncludedGroupAndCandB.insert(KV: std::make_pair(x&: GroupNum, y&: MatchedCand));
1237 IncludedGroupsB.insert(V: GroupNum);
1238 }
1239
1240 // Find the intersection between the two groups, these are the groups where
1241 // the larger candidates exist.
1242 set_intersect(S1&: IncludedGroupsA, S2: IncludedGroupsB);
1243
1244 // If there is no intersection between the sets, then we cannot determine
1245 // whether or not there is a match.
1246 if (IncludedGroupsA.empty())
1247 return Result;
1248
1249 // Create a pair that contains the larger candidates.
1250 auto ItA = IncludedGroupAndCandA.find(Val: *IncludedGroupsA.begin());
1251 auto ItB = IncludedGroupAndCandB.find(Val: *IncludedGroupsA.begin());
1252 Result = std::make_pair(x&: ItA->second, y&: ItB->second);
1253 return Result;
1254}
1255
1256/// From the list of IRSimilarityCandidates, perform a comparison between each
1257/// IRSimilarityCandidate to determine if there are overlapping
1258/// IRInstructionData, or if they do not have the same structure.
1259///
1260/// \param [in] CandsForRepSubstring - The vector containing the
1261/// IRSimilarityCandidates.
1262/// \param [out] StructuralGroups - the mapping of unsigned integers to vector
1263/// of IRSimilarityCandidates where each of the IRSimilarityCandidates in the
1264/// vector are structurally similar to one another.
1265/// \param [in] IndexToIncludedCand - Mapping of index of the an instruction in
1266/// a circuit to the IRSimilarityCandidates that include this instruction.
1267/// \param [in] CandToOverallGroup - Mapping of IRSimilarityCandidate to a
1268/// number representing the structural group assigned to it.
1269static void findCandidateStructures(
1270 std::vector<IRSimilarityCandidate> &CandsForRepSubstring,
1271 DenseMap<unsigned, SimilarityGroup> &StructuralGroups,
1272 DenseMap<unsigned, DenseSet<IRSimilarityCandidate *>> &IndexToIncludedCand,
1273 DenseMap<IRSimilarityCandidate *, unsigned> &CandToOverallGroup
1274 ) {
1275 std::vector<IRSimilarityCandidate>::iterator CandIt, CandEndIt, InnerCandIt,
1276 InnerCandEndIt;
1277
1278 // IRSimilarityCandidates each have a structure for operand use. It is
1279 // possible that two instances of the same subsequences have different
1280 // structure. Each type of structure found is assigned a number. This
1281 // DenseMap maps an IRSimilarityCandidate to which type of similarity
1282 // discovered it fits within.
1283 DenseMap<IRSimilarityCandidate *, unsigned> CandToGroup;
1284
1285 // Find the compatibility from each candidate to the others to determine
1286 // which candidates overlap and which have the same structure by mapping
1287 // each structure to a different group.
1288 bool SameStructure;
1289 bool Inserted;
1290 unsigned CurrentGroupNum = 0;
1291 unsigned OuterGroupNum;
1292 DenseMap<IRSimilarityCandidate *, unsigned>::iterator CandToGroupIt;
1293 DenseMap<IRSimilarityCandidate *, unsigned>::iterator CandToGroupItInner;
1294 DenseMap<unsigned, SimilarityGroup>::iterator CurrentGroupPair;
1295
1296 // Iterate over the candidates to determine its structural and overlapping
1297 // compatibility with other instructions
1298 DenseMap<unsigned, DenseSet<unsigned>> ValueNumberMappingA;
1299 DenseMap<unsigned, DenseSet<unsigned>> ValueNumberMappingB;
1300 for (CandIt = CandsForRepSubstring.begin(),
1301 CandEndIt = CandsForRepSubstring.end();
1302 CandIt != CandEndIt; CandIt++) {
1303
1304 // Determine if it has an assigned structural group already.
1305 // If not, we assign it one, and add it to our mapping.
1306 std::tie(args&: CandToGroupIt, args&: Inserted) =
1307 CandToGroup.try_emplace(Key: &*CandIt, Args&: CurrentGroupNum);
1308 if (Inserted)
1309 ++CurrentGroupNum;
1310
1311 // Get the structural group number from the iterator.
1312 OuterGroupNum = CandToGroupIt->second;
1313
1314 // Check if we already have a list of IRSimilarityCandidates for the current
1315 // structural group. Create one if one does not exist.
1316 CurrentGroupPair = StructuralGroups.find(Val: OuterGroupNum);
1317 if (CurrentGroupPair == StructuralGroups.end()) {
1318 IRSimilarityCandidate::createCanonicalMappingFor(CurrCand&: *CandIt);
1319 std::tie(args&: CurrentGroupPair, args&: Inserted) = StructuralGroups.insert(
1320 KV: std::make_pair(x&: OuterGroupNum, y: SimilarityGroup({*CandIt})));
1321 }
1322
1323 // Iterate over the IRSimilarityCandidates following the current
1324 // IRSimilarityCandidate in the list to determine whether the two
1325 // IRSimilarityCandidates are compatible. This is so we do not repeat pairs
1326 // of IRSimilarityCandidates.
1327 for (InnerCandIt = std::next(x: CandIt),
1328 InnerCandEndIt = CandsForRepSubstring.end();
1329 InnerCandIt != InnerCandEndIt; InnerCandIt++) {
1330
1331 // We check if the inner item has a group already, if it does, we skip it.
1332 CandToGroupItInner = CandToGroup.find(Val: &*InnerCandIt);
1333 if (CandToGroupItInner != CandToGroup.end())
1334 continue;
1335
1336 // Check if we have found structural similarity between two candidates
1337 // that fully contains the first and second candidates.
1338 std::optional<std::pair<IRSimilarityCandidate *, IRSimilarityCandidate *>>
1339 LargerPair = CheckLargerCands(
1340 CandA&: *CandIt, CandB&: *InnerCandIt, IndexToIncludedCand, CandToGroup&: CandToOverallGroup);
1341
1342 // If a pair was found, it means that we can assume that these smaller
1343 // substrings are also structurally similar. Use the larger candidates to
1344 // determine the canonical mapping between the two sections.
1345 if (LargerPair.has_value()) {
1346 SameStructure = true;
1347 InnerCandIt->createCanonicalRelationFrom(
1348 SourceCand&: *CandIt, SourceCandLarge&: *LargerPair.value().first, TargetCandLarge&: *LargerPair.value().second);
1349 CandToGroup.insert(KV: std::make_pair(x: &*InnerCandIt, y&: OuterGroupNum));
1350 CurrentGroupPair->second.push_back(x: *InnerCandIt);
1351 continue;
1352 }
1353
1354 // Otherwise we determine if they have the same structure and add it to
1355 // vector if they match.
1356 ValueNumberMappingA.clear();
1357 ValueNumberMappingB.clear();
1358 SameStructure = IRSimilarityCandidate::compareStructure(
1359 A: *CandIt, B: *InnerCandIt, ValueNumberMappingA, ValueNumberMappingB);
1360 if (!SameStructure)
1361 continue;
1362
1363 InnerCandIt->createCanonicalRelationFrom(SourceCand&: *CandIt, ToSourceMapping&: ValueNumberMappingA,
1364 FromSourceMapping&: ValueNumberMappingB);
1365 CandToGroup.insert(KV: std::make_pair(x: &*InnerCandIt, y&: OuterGroupNum));
1366 CurrentGroupPair->second.push_back(x: *InnerCandIt);
1367 }
1368 }
1369}
1370
1371void IRSimilarityIdentifier::findCandidates(
1372 std::vector<IRInstructionData *> &InstrList,
1373 std::vector<unsigned> &IntegerMapping) {
1374 SuffixTree ST(IntegerMapping);
1375
1376 std::vector<IRSimilarityCandidate> CandsForRepSubstring;
1377 std::vector<SimilarityGroup> NewCandidateGroups;
1378
1379 DenseMap<unsigned, SimilarityGroup> StructuralGroups;
1380 DenseMap<unsigned, DenseSet<IRSimilarityCandidate *>> IndexToIncludedCand;
1381 DenseMap<IRSimilarityCandidate *, unsigned> CandToGroup;
1382
1383 // Iterate over the subsequences found by the Suffix Tree to create
1384 // IRSimilarityCandidates for each repeated subsequence and determine which
1385 // instances are structurally similar to one another.
1386
1387 // Sort the suffix tree from longest substring to shortest.
1388 std::vector<SuffixTree::RepeatedSubstring> RSes;
1389 for (SuffixTree::RepeatedSubstring &RS : ST)
1390 RSes.push_back(x: RS);
1391
1392 llvm::stable_sort(Range&: RSes, C: [](const SuffixTree::RepeatedSubstring &LHS,
1393 const SuffixTree::RepeatedSubstring &RHS) {
1394 return LHS.Length > RHS.Length;
1395 });
1396 for (SuffixTree::RepeatedSubstring &RS : RSes) {
1397 createCandidatesFromSuffixTree(Mapper, InstrList, IntegerMapping, RS,
1398 CandsForRepSubstring);
1399
1400 if (CandsForRepSubstring.size() < 2)
1401 continue;
1402
1403 findCandidateStructures(CandsForRepSubstring, StructuralGroups,
1404 IndexToIncludedCand, CandToOverallGroup&: CandToGroup);
1405 for (std::pair<unsigned, SimilarityGroup> &Group : StructuralGroups) {
1406 // We only add the group if it contains more than one
1407 // IRSimilarityCandidate. If there is only one, that means there is no
1408 // other repeated subsequence with the same structure.
1409 if (Group.second.size() > 1) {
1410 SimilarityCandidates->push_back(x: Group.second);
1411 // Iterate over each candidate in the group, and add an entry for each
1412 // instruction included with a mapping to a set of
1413 // IRSimilarityCandidates that include that instruction.
1414 for (IRSimilarityCandidate &IRCand : SimilarityCandidates->back()) {
1415 for (unsigned Idx = IRCand.getStartIdx(), Edx = IRCand.getEndIdx();
1416 Idx <= Edx; ++Idx)
1417 IndexToIncludedCand[Idx].insert(V: &IRCand);
1418 // Add mapping of candidate to the overall similarity group number.
1419 CandToGroup.insert(
1420 KV: std::make_pair(x: &IRCand, y: SimilarityCandidates->size() - 1));
1421 }
1422 }
1423 }
1424
1425 CandsForRepSubstring.clear();
1426 StructuralGroups.clear();
1427 NewCandidateGroups.clear();
1428 }
1429}
1430
1431SimilarityGroupList &IRSimilarityIdentifier::findSimilarity(
1432 ArrayRef<std::unique_ptr<Module>> Modules) {
1433 resetSimilarityCandidates();
1434
1435 std::vector<IRInstructionData *> InstrList;
1436 std::vector<unsigned> IntegerMapping;
1437 Mapper.InstClassifier.EnableBranches = this->EnableBranches;
1438 Mapper.InstClassifier.EnableIndirectCalls = EnableIndirectCalls;
1439 Mapper.EnableMatchCallsByName = EnableMatchingCallsByName;
1440 Mapper.InstClassifier.EnableIntrinsics = EnableIntrinsics;
1441 Mapper.InstClassifier.EnableMustTailCalls = EnableMustTailCalls;
1442
1443 populateMapper(Modules, InstrList, IntegerMapping);
1444 findCandidates(InstrList, IntegerMapping);
1445
1446 return *SimilarityCandidates;
1447}
1448
1449SimilarityGroupList &IRSimilarityIdentifier::findSimilarity(Module &M) {
1450 resetSimilarityCandidates();
1451 Mapper.InstClassifier.EnableBranches = this->EnableBranches;
1452 Mapper.InstClassifier.EnableIndirectCalls = EnableIndirectCalls;
1453 Mapper.EnableMatchCallsByName = EnableMatchingCallsByName;
1454 Mapper.InstClassifier.EnableIntrinsics = EnableIntrinsics;
1455 Mapper.InstClassifier.EnableMustTailCalls = EnableMustTailCalls;
1456
1457 std::vector<IRInstructionData *> InstrList;
1458 std::vector<unsigned> IntegerMapping;
1459
1460 populateMapper(M, InstrList, IntegerMapping);
1461 findCandidates(InstrList, IntegerMapping);
1462
1463 return *SimilarityCandidates;
1464}
1465
1466INITIALIZE_PASS(IRSimilarityIdentifierWrapperPass, "ir-similarity-identifier",
1467 "ir-similarity-identifier", false, true)
1468
1469IRSimilarityIdentifierWrapperPass::IRSimilarityIdentifierWrapperPass()
1470 : ModulePass(ID) {}
1471
1472bool IRSimilarityIdentifierWrapperPass::doInitialization(Module &M) {
1473 IRSI.reset(p: new IRSimilarityIdentifier(!DisableBranches, !DisableIndirectCalls,
1474 MatchCallsByName, !DisableIntrinsics,
1475 false));
1476 return false;
1477}
1478
1479bool IRSimilarityIdentifierWrapperPass::doFinalization(Module &M) {
1480 IRSI.reset();
1481 return false;
1482}
1483
1484bool IRSimilarityIdentifierWrapperPass::runOnModule(Module &M) {
1485 IRSI->findSimilarity(M);
1486 return false;
1487}
1488
1489AnalysisKey IRSimilarityAnalysis::Key;
1490IRSimilarityIdentifier IRSimilarityAnalysis::run(Module &M,
1491 ModuleAnalysisManager &) {
1492 auto IRSI = IRSimilarityIdentifier(!DisableBranches, !DisableIndirectCalls,
1493 MatchCallsByName, !DisableIntrinsics,
1494 false);
1495 IRSI.findSimilarity(M);
1496 return IRSI;
1497}
1498
1499PreservedAnalyses
1500IRSimilarityAnalysisPrinterPass::run(Module &M, ModuleAnalysisManager &AM) {
1501 IRSimilarityIdentifier &IRSI = AM.getResult<IRSimilarityAnalysis>(IR&: M);
1502 std::optional<SimilarityGroupList> &SimilarityCandidatesOpt =
1503 IRSI.getSimilarity();
1504
1505 for (std::vector<IRSimilarityCandidate> &CandVec : *SimilarityCandidatesOpt) {
1506 OS << CandVec.size() << " candidates of length "
1507 << CandVec.begin()->getLength() << ". Found in: \n";
1508 for (IRSimilarityCandidate &Cand : CandVec) {
1509 OS << " Function: " << Cand.front()->Inst->getFunction()->getName().str()
1510 << ", Basic Block: ";
1511 if (Cand.front()->Inst->getParent()->getName().str() == "")
1512 OS << "(unnamed)";
1513 else
1514 OS << Cand.front()->Inst->getParent()->getName().str();
1515 OS << "\n Start Instruction: ";
1516 Cand.frontInstruction()->print(O&: OS);
1517 OS << "\n End Instruction: ";
1518 Cand.backInstruction()->print(O&: OS);
1519 OS << "\n";
1520 }
1521 }
1522
1523 return PreservedAnalyses::all();
1524}
1525
1526char IRSimilarityIdentifierWrapperPass::ID = 0;
1527