1//===- IRSimilarityIdentifier.cpp - Find similarity in a module -----------===//
2//
3// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4// See https://llvm.org/LICENSE.txt for license information.
5// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6//
7//===----------------------------------------------------------------------===//
8//
9// \file
10// Implementation file for the IRSimilarityIdentifier for identifying
11// similarities in IR including the IRInstructionMapper.
12//
13//===----------------------------------------------------------------------===//
14
15#include "llvm/Analysis/IRSimilarityIdentifier.h"
16#include "llvm/ADT/DenseMap.h"
17#include "llvm/ADT/SetOperations.h"
18#include "llvm/IR/Intrinsics.h"
19#include "llvm/IR/Operator.h"
20#include "llvm/IR/User.h"
21#include "llvm/InitializePasses.h"
22#include "llvm/Support/SuffixTree.h"
23
24using namespace llvm;
25using namespace IRSimilarity;
26
27namespace llvm {
28cl::opt<bool>
29 DisableBranches("no-ir-sim-branch-matching", cl::init(Val: false),
30 cl::ReallyHidden,
31 cl::desc("disable similarity matching, and outlining, "
32 "across branches for debugging purposes."));
33
34cl::opt<bool>
35 DisableIndirectCalls("no-ir-sim-indirect-calls", cl::init(Val: false),
36 cl::ReallyHidden,
37 cl::desc("disable outlining indirect calls."));
38
39cl::opt<bool>
40 MatchCallsByName("ir-sim-calls-by-name", cl::init(Val: false), cl::ReallyHidden,
41 cl::desc("only allow matching call instructions if the "
42 "name and type signature match."));
43
44cl::opt<bool>
45 DisableIntrinsics("no-ir-sim-intrinsics", cl::init(Val: false), cl::ReallyHidden,
46 cl::desc("Don't match or outline intrinsics"));
47} // namespace llvm
48
49IRInstructionData::IRInstructionData(Instruction &I, bool Legality,
50 IRInstructionDataList &IDList)
51 : Inst(&I), Legal(Legality), IDL(&IDList) {
52 initializeInstruction();
53}
54
55void IRInstructionData::initializeInstruction() {
56 // We check for whether we have a comparison instruction. If it is, we
57 // find the "less than" version of the predicate for consistency for
58 // comparison instructions throught the program.
59 if (CmpInst *C = dyn_cast<CmpInst>(Val: Inst)) {
60 CmpInst::Predicate Predicate = predicateForConsistency(CI: C);
61 if (Predicate != C->getPredicate())
62 RevisedPredicate = Predicate;
63 }
64
65 // Here we collect the operands and their types for determining whether
66 // the structure of the operand use matches between two different candidates.
67 for (Use &OI : Inst->operands()) {
68 if (isa<CmpInst>(Val: Inst) && RevisedPredicate) {
69 // If we have a CmpInst where the predicate is reversed, it means the
70 // operands must be reversed as well.
71 OperVals.insert(I: OperVals.begin(), Elt: OI.get());
72 continue;
73 }
74
75 OperVals.push_back(Elt: OI.get());
76 }
77
78 // We capture the incoming BasicBlocks as values as well as the incoming
79 // Values in order to check for structural similarity.
80 if (PHINode *PN = dyn_cast<PHINode>(Val: Inst))
81 for (BasicBlock *BB : PN->blocks())
82 OperVals.push_back(Elt: BB);
83}
84
85IRInstructionData::IRInstructionData(IRInstructionDataList &IDList)
86 : IDL(&IDList) {}
87
88void IRInstructionData::setBranchSuccessors(
89 DenseMap<BasicBlock *, unsigned> &BasicBlockToInteger) {
90 assert(isa<BranchInst>(Inst) && "Instruction must be branch");
91
92 BranchInst *BI = cast<BranchInst>(Val: Inst);
93 DenseMap<BasicBlock *, unsigned>::iterator BBNumIt;
94
95 BBNumIt = BasicBlockToInteger.find(Val: BI->getParent());
96 assert(BBNumIt != BasicBlockToInteger.end() &&
97 "Could not find location for BasicBlock!");
98
99 int CurrentBlockNumber = static_cast<int>(BBNumIt->second);
100
101 for (Value *V : getBlockOperVals()) {
102 BasicBlock *Successor = cast<BasicBlock>(Val: V);
103 BBNumIt = BasicBlockToInteger.find(Val: Successor);
104 assert(BBNumIt != BasicBlockToInteger.end() &&
105 "Could not find number for BasicBlock!");
106 int OtherBlockNumber = static_cast<int>(BBNumIt->second);
107
108 int Relative = OtherBlockNumber - CurrentBlockNumber;
109 RelativeBlockLocations.push_back(Elt: Relative);
110 }
111}
112
113ArrayRef<Value *> IRInstructionData::getBlockOperVals() {
114 assert((isa<BranchInst>(Inst) ||
115 isa<PHINode>(Inst)) && "Instruction must be branch or PHINode");
116
117 if (BranchInst *BI = dyn_cast<BranchInst>(Val: Inst))
118 return ArrayRef<Value *>(
119 std::next(x: OperVals.begin(), n: BI->isConditional() ? 1 : 0),
120 OperVals.end()
121 );
122
123 if (PHINode *PN = dyn_cast<PHINode>(Val: Inst))
124 return ArrayRef<Value *>(
125 std::next(x: OperVals.begin(), n: PN->getNumIncomingValues()),
126 OperVals.end()
127 );
128
129 return ArrayRef<Value *>();
130}
131
132void IRInstructionData::setCalleeName(bool MatchByName) {
133 CallInst *CI = dyn_cast<CallInst>(Val: Inst);
134 assert(CI && "Instruction must be call");
135
136 CalleeName = "";
137 if (IntrinsicInst *II = dyn_cast<IntrinsicInst>(Val: Inst)) {
138 // To hash intrinsics, we use the opcode, and types like the other
139 // instructions, but also, the Intrinsic ID, and the Name of the
140 // intrinsic.
141 Intrinsic::ID IntrinsicID = II->getIntrinsicID();
142 FunctionType *FT = II->getFunctionType();
143 // If there is an overloaded name, we have to use the complex version
144 // of getName to get the entire string.
145 if (Intrinsic::isOverloaded(id: IntrinsicID))
146 CalleeName =
147 Intrinsic::getName(Id: IntrinsicID, Tys: FT->params(), M: II->getModule(), FT);
148 // If there is not an overloaded name, we only need to use this version.
149 else
150 CalleeName = Intrinsic::getName(id: IntrinsicID).str();
151
152 return;
153 }
154
155 if (!CI->isIndirectCall() && MatchByName)
156 CalleeName = CI->getCalledFunction()->getName().str();
157}
158
159void IRInstructionData::setPHIPredecessors(
160 DenseMap<BasicBlock *, unsigned> &BasicBlockToInteger) {
161 assert(isa<PHINode>(Inst) && "Instruction must be phi node");
162
163 PHINode *PN = cast<PHINode>(Val: Inst);
164 DenseMap<BasicBlock *, unsigned>::iterator BBNumIt;
165
166 BBNumIt = BasicBlockToInteger.find(Val: PN->getParent());
167 assert(BBNumIt != BasicBlockToInteger.end() &&
168 "Could not find location for BasicBlock!");
169
170 int CurrentBlockNumber = static_cast<int>(BBNumIt->second);
171
172 // Convert the incoming blocks of the PHINode to an integer value, based on
173 // the relative distances between the current block and the incoming block.
174 for (unsigned Idx = 0; Idx < PN->getNumIncomingValues(); Idx++) {
175 BasicBlock *Incoming = PN->getIncomingBlock(i: Idx);
176 BBNumIt = BasicBlockToInteger.find(Val: Incoming);
177 assert(BBNumIt != BasicBlockToInteger.end() &&
178 "Could not find number for BasicBlock!");
179 int OtherBlockNumber = static_cast<int>(BBNumIt->second);
180
181 int Relative = OtherBlockNumber - CurrentBlockNumber;
182 RelativeBlockLocations.push_back(Elt: Relative);
183 }
184}
185
186CmpInst::Predicate IRInstructionData::predicateForConsistency(CmpInst *CI) {
187 switch (CI->getPredicate()) {
188 case CmpInst::FCMP_OGT:
189 case CmpInst::FCMP_UGT:
190 case CmpInst::FCMP_OGE:
191 case CmpInst::FCMP_UGE:
192 case CmpInst::ICMP_SGT:
193 case CmpInst::ICMP_UGT:
194 case CmpInst::ICMP_SGE:
195 case CmpInst::ICMP_UGE:
196 return CI->getSwappedPredicate();
197 default:
198 return CI->getPredicate();
199 }
200}
201
202CmpInst::Predicate IRInstructionData::getPredicate() const {
203 assert(isa<CmpInst>(Inst) &&
204 "Can only get a predicate from a compare instruction");
205
206 if (RevisedPredicate)
207 return *RevisedPredicate;
208
209 return cast<CmpInst>(Val: Inst)->getPredicate();
210}
211
212StringRef IRInstructionData::getCalleeName() const {
213 assert(isa<CallInst>(Inst) &&
214 "Can only get a name from a call instruction");
215
216 assert(CalleeName && "CalleeName has not been set");
217
218 return *CalleeName;
219}
220
221bool IRSimilarity::isClose(const IRInstructionData &A,
222 const IRInstructionData &B) {
223
224 if (!A.Legal || !B.Legal)
225 return false;
226
227 // Check if we are performing the same sort of operation on the same types
228 // but not on the same values.
229 if (!A.Inst->isSameOperationAs(I: B.Inst)) {
230 // If there is a predicate, this means that either there is a swapped
231 // predicate, or that the types are different, we want to make sure that
232 // the predicates are equivalent via swapping.
233 if (isa<CmpInst>(Val: A.Inst) && isa<CmpInst>(Val: B.Inst)) {
234
235 if (A.getPredicate() != B.getPredicate())
236 return false;
237
238 // If the predicates are the same via swap, make sure that the types are
239 // still the same.
240 auto ZippedTypes = zip(t: A.OperVals, u: B.OperVals);
241
242 return all_of(
243 Range&: ZippedTypes, P: [](std::tuple<llvm::Value *, llvm::Value *> R) {
244 return std::get<0>(t&: R)->getType() == std::get<1>(t&: R)->getType();
245 });
246 }
247
248 return false;
249 }
250
251 // Since any GEP Instruction operands after the first operand cannot be
252 // defined by a register, we must make sure that the operands after the first
253 // are the same in the two instructions
254 if (auto *GEP = dyn_cast<GetElementPtrInst>(Val: A.Inst)) {
255 auto *OtherGEP = cast<GetElementPtrInst>(Val: B.Inst);
256
257 // If the instructions do not have the same inbounds restrictions, we do
258 // not consider them the same.
259 if (GEP->isInBounds() != OtherGEP->isInBounds())
260 return false;
261
262 auto ZippedOperands = zip(t: GEP->indices(), u: OtherGEP->indices());
263
264 // We increment here since we do not care about the first instruction,
265 // we only care about the following operands since they must be the
266 // exact same to be considered similar.
267 return all_of(Range: drop_begin(RangeOrContainer&: ZippedOperands),
268 P: [](std::tuple<llvm::Use &, llvm::Use &> R) {
269 return std::get<0>(t&: R) == std::get<1>(t&: R);
270 });
271 }
272
273 // If the instructions are functions calls, we make sure that the function
274 // name is the same. We already know that the types are since is
275 // isSameOperationAs is true.
276 if (isa<CallInst>(Val: A.Inst) && isa<CallInst>(Val: B.Inst)) {
277 if (A.getCalleeName() != B.getCalleeName())
278 return false;
279 }
280
281 if (isa<BranchInst>(Val: A.Inst) && isa<BranchInst>(Val: B.Inst) &&
282 A.RelativeBlockLocations.size() != B.RelativeBlockLocations.size())
283 return false;
284
285 return true;
286}
287
288// TODO: This is the same as the MachineOutliner, and should be consolidated
289// into the same interface.
290void IRInstructionMapper::convertToUnsignedVec(
291 BasicBlock &BB, std::vector<IRInstructionData *> &InstrList,
292 std::vector<unsigned> &IntegerMapping) {
293 BasicBlock::iterator It = BB.begin();
294
295 std::vector<unsigned> IntegerMappingForBB;
296 std::vector<IRInstructionData *> InstrListForBB;
297
298 for (BasicBlock::iterator Et = BB.end(); It != Et; ++It) {
299 switch (InstClassifier.visit(I&: *It)) {
300 case InstrType::Legal:
301 mapToLegalUnsigned(It, IntegerMappingForBB, InstrListForBB);
302 break;
303 case InstrType::Illegal:
304 mapToIllegalUnsigned(It, IntegerMappingForBB, InstrListForBB);
305 break;
306 case InstrType::Invisible:
307 AddedIllegalLastTime = false;
308 break;
309 }
310 }
311
312 if (AddedIllegalLastTime)
313 mapToIllegalUnsigned(It, IntegerMappingForBB, InstrListForBB, End: true);
314 for (IRInstructionData *ID : InstrListForBB)
315 this->IDL->push_back(Node&: *ID);
316 llvm::append_range(C&: InstrList, R&: InstrListForBB);
317 llvm::append_range(C&: IntegerMapping, R&: IntegerMappingForBB);
318}
319
320// TODO: This is the same as the MachineOutliner, and should be consolidated
321// into the same interface.
322unsigned IRInstructionMapper::mapToLegalUnsigned(
323 BasicBlock::iterator &It, std::vector<unsigned> &IntegerMappingForBB,
324 std::vector<IRInstructionData *> &InstrListForBB) {
325 // We added something legal, so we should unset the AddedLegalLastTime
326 // flag.
327 AddedIllegalLastTime = false;
328
329 // If we have at least two adjacent legal instructions (which may have
330 // invisible instructions in between), remember that.
331 if (CanCombineWithPrevInstr)
332 HaveLegalRange = true;
333 CanCombineWithPrevInstr = true;
334
335 // Get the integer for this instruction or give it the current
336 // LegalInstrNumber.
337 IRInstructionData *ID = allocateIRInstructionData(I&: *It, Legality: true, IDL&: *IDL);
338 InstrListForBB.push_back(x: ID);
339
340 if (isa<BranchInst>(Val: *It))
341 ID->setBranchSuccessors(BasicBlockToInteger);
342
343 if (isa<CallInst>(Val: *It))
344 ID->setCalleeName(EnableMatchCallsByName);
345
346 if (isa<PHINode>(Val: *It))
347 ID->setPHIPredecessors(BasicBlockToInteger);
348
349 // Add to the instruction list
350 bool WasInserted;
351 DenseMap<IRInstructionData *, unsigned, IRInstructionDataTraits>::iterator
352 ResultIt;
353 std::tie(args&: ResultIt, args&: WasInserted) =
354 InstructionIntegerMap.insert(KV: std::make_pair(x&: ID, y&: LegalInstrNumber));
355 unsigned INumber = ResultIt->second;
356
357 // There was an insertion.
358 if (WasInserted)
359 LegalInstrNumber++;
360
361 IntegerMappingForBB.push_back(x: INumber);
362
363 // Make sure we don't overflow or use any integers reserved by the DenseMap.
364 assert(LegalInstrNumber < IllegalInstrNumber &&
365 "Instruction mapping overflow!");
366
367 assert(LegalInstrNumber != DenseMapInfo<unsigned>::getEmptyKey() &&
368 "Tried to assign DenseMap tombstone or empty key to instruction.");
369 assert(LegalInstrNumber != DenseMapInfo<unsigned>::getTombstoneKey() &&
370 "Tried to assign DenseMap tombstone or empty key to instruction.");
371
372 return INumber;
373}
374
375IRInstructionData *
376IRInstructionMapper::allocateIRInstructionData(Instruction &I, bool Legality,
377 IRInstructionDataList &IDL) {
378 return new (InstDataAllocator->Allocate()) IRInstructionData(I, Legality, IDL);
379}
380
381IRInstructionData *
382IRInstructionMapper::allocateIRInstructionData(IRInstructionDataList &IDL) {
383 return new (InstDataAllocator->Allocate()) IRInstructionData(IDL);
384}
385
386IRInstructionDataList *
387IRInstructionMapper::allocateIRInstructionDataList() {
388 return new (IDLAllocator->Allocate()) IRInstructionDataList();
389}
390
391// TODO: This is the same as the MachineOutliner, and should be consolidated
392// into the same interface.
393unsigned IRInstructionMapper::mapToIllegalUnsigned(
394 BasicBlock::iterator &It, std::vector<unsigned> &IntegerMappingForBB,
395 std::vector<IRInstructionData *> &InstrListForBB, bool End) {
396 // Can't combine an illegal instruction. Set the flag.
397 CanCombineWithPrevInstr = false;
398
399 // Only add one illegal number per range of legal numbers.
400 if (AddedIllegalLastTime)
401 return IllegalInstrNumber;
402
403 IRInstructionData *ID = nullptr;
404 if (!End)
405 ID = allocateIRInstructionData(I&: *It, Legality: false, IDL&: *IDL);
406 else
407 ID = allocateIRInstructionData(IDL&: *IDL);
408 InstrListForBB.push_back(x: ID);
409
410 // Remember that we added an illegal number last time.
411 AddedIllegalLastTime = true;
412 unsigned INumber = IllegalInstrNumber;
413 IntegerMappingForBB.push_back(x: IllegalInstrNumber--);
414
415 assert(LegalInstrNumber < IllegalInstrNumber &&
416 "Instruction mapping overflow!");
417
418 assert(IllegalInstrNumber != DenseMapInfo<unsigned>::getEmptyKey() &&
419 "IllegalInstrNumber cannot be DenseMap tombstone or empty key!");
420
421 assert(IllegalInstrNumber != DenseMapInfo<unsigned>::getTombstoneKey() &&
422 "IllegalInstrNumber cannot be DenseMap tombstone or empty key!");
423
424 return INumber;
425}
426
427IRSimilarityCandidate::IRSimilarityCandidate(unsigned StartIdx, unsigned Len,
428 IRInstructionData *FirstInstIt,
429 IRInstructionData *LastInstIt)
430 : StartIdx(StartIdx), Len(Len) {
431
432 assert(FirstInstIt != nullptr && "Instruction is nullptr!");
433 assert(LastInstIt != nullptr && "Instruction is nullptr!");
434 assert(StartIdx + Len > StartIdx &&
435 "Overflow for IRSimilarityCandidate range?");
436 assert(Len - 1 == static_cast<unsigned>(std::distance(
437 iterator(FirstInstIt), iterator(LastInstIt))) &&
438 "Length of the first and last IRInstructionData do not match the "
439 "given length");
440
441 // We iterate over the given instructions, and map each unique value
442 // to a unique number in the IRSimilarityCandidate ValueToNumber and
443 // NumberToValue maps. A constant get its own value globally, the individual
444 // uses of the constants are not considered to be unique.
445 //
446 // IR: Mapping Added:
447 // %add1 = add i32 %a, c1 %add1 -> 3, %a -> 1, c1 -> 2
448 // %add2 = add i32 %a, %1 %add2 -> 4
449 // %add3 = add i32 c2, c1 %add3 -> 6, c2 -> 5
450 //
451 // when replace with global values, starting from 1, would be
452 //
453 // 3 = add i32 1, 2
454 // 4 = add i32 1, 3
455 // 6 = add i32 5, 2
456 unsigned LocalValNumber = 1;
457 IRInstructionDataList::iterator ID = iterator(*FirstInstIt);
458 for (unsigned Loc = StartIdx; Loc < StartIdx + Len; Loc++, ID++) {
459 // Map the operand values to an unsigned integer if it does not already
460 // have an unsigned integer assigned to it.
461 for (Value *Arg : ID->OperVals)
462 if (!ValueToNumber.contains(Val: Arg)) {
463 ValueToNumber.try_emplace(Key: Arg, Args&: LocalValNumber);
464 NumberToValue.try_emplace(Key: LocalValNumber, Args&: Arg);
465 LocalValNumber++;
466 }
467
468 // Mapping the instructions to an unsigned integer if it is not already
469 // exist in the mapping.
470 if (!ValueToNumber.contains(Val: ID->Inst)) {
471 ValueToNumber.try_emplace(Key: ID->Inst, Args&: LocalValNumber);
472 NumberToValue.try_emplace(Key: LocalValNumber, Args&: ID->Inst);
473 LocalValNumber++;
474 }
475 }
476
477 // Setting the first and last instruction data pointers for the candidate. If
478 // we got through the entire for loop without hitting an assert, we know
479 // that both of these instructions are not nullptrs.
480 FirstInst = FirstInstIt;
481 LastInst = LastInstIt;
482
483 // Add the basic blocks contained in the set into the global value numbering.
484 DenseSet<BasicBlock *> BBSet;
485 getBasicBlocks(BBSet);
486 for (BasicBlock *BB : BBSet) {
487 if (ValueToNumber.contains(Val: BB))
488 continue;
489
490 ValueToNumber.try_emplace(Key: BB, Args&: LocalValNumber);
491 NumberToValue.try_emplace(Key: LocalValNumber, Args&: BB);
492 LocalValNumber++;
493 }
494}
495
496bool IRSimilarityCandidate::isSimilar(const IRSimilarityCandidate &A,
497 const IRSimilarityCandidate &B) {
498 if (A.getLength() != B.getLength())
499 return false;
500
501 auto InstrDataForBoth =
502 zip(t: make_range(x: A.begin(), y: A.end()), u: make_range(x: B.begin(), y: B.end()));
503
504 return all_of(Range&: InstrDataForBoth,
505 P: [](std::tuple<IRInstructionData &, IRInstructionData &> R) {
506 IRInstructionData &A = std::get<0>(t&: R);
507 IRInstructionData &B = std::get<1>(t&: R);
508 if (!A.Legal || !B.Legal)
509 return false;
510 return isClose(A, B);
511 });
512}
513
514/// Determine if one or more of the assigned global value numbers for the
515/// operands in \p TargetValueNumbers is in the current mapping set for operand
516/// numbers in \p SourceOperands. The set of possible corresponding global
517/// value numbers are replaced with the most recent version of compatible
518/// values.
519///
520/// \param [in] SourceValueToNumberMapping - The mapping of a Value to global
521/// value number for the source IRInstructionCandidate.
522/// \param [in, out] CurrentSrcTgtNumberMapping - The current mapping of source
523/// IRSimilarityCandidate global value numbers to a set of possible numbers in
524/// the target.
525/// \param [in] SourceOperands - The operands in the original
526/// IRSimilarityCandidate in the current instruction.
527/// \param [in] TargetValueNumbers - The global value numbers of the operands in
528/// the corresponding Instruction in the other IRSimilarityCandidate.
529/// \returns true if there exists a possible mapping between the source
530/// Instruction operands and the target Instruction operands, and false if not.
531static bool checkNumberingAndReplaceCommutative(
532 const DenseMap<Value *, unsigned> &SourceValueToNumberMapping,
533 DenseMap<unsigned, DenseSet<unsigned>> &CurrentSrcTgtNumberMapping,
534 ArrayRef<Value *> &SourceOperands,
535 DenseSet<unsigned> &TargetValueNumbers){
536
537 DenseMap<unsigned, DenseSet<unsigned>>::iterator ValueMappingIt;
538
539 unsigned ArgVal;
540 bool WasInserted;
541
542 // Iterate over the operands in the source IRSimilarityCandidate to determine
543 // whether there exists an operand in the other IRSimilarityCandidate that
544 // creates a valid mapping of Value to Value between the
545 // IRSimilarityCaniddates.
546 for (Value *V : SourceOperands) {
547 ArgVal = SourceValueToNumberMapping.find(Val: V)->second;
548
549 // Instead of finding a current mapping, we attempt to insert a set.
550 std::tie(args&: ValueMappingIt, args&: WasInserted) = CurrentSrcTgtNumberMapping.insert(
551 KV: std::make_pair(x&: ArgVal, y&: TargetValueNumbers));
552
553 // We need to iterate over the items in other IRSimilarityCandidate's
554 // Instruction to determine whether there is a valid mapping of
555 // Value to Value.
556 DenseSet<unsigned> NewSet;
557 for (unsigned &Curr : ValueMappingIt->second)
558 // If we can find the value in the mapping, we add it to the new set.
559 if (TargetValueNumbers.contains(V: Curr))
560 NewSet.insert(V: Curr);
561
562 // If we could not find a Value, return 0.
563 if (NewSet.empty())
564 return false;
565
566 // Otherwise replace the old mapping with the newly constructed one.
567 if (NewSet.size() != ValueMappingIt->second.size())
568 ValueMappingIt->second.swap(RHS&: NewSet);
569
570 // We have reached no conclusions about the mapping, and cannot remove
571 // any items from the other operands, so we move to check the next operand.
572 if (ValueMappingIt->second.size() != 1)
573 continue;
574
575 unsigned ValToRemove = *ValueMappingIt->second.begin();
576 // When there is only one item left in the mapping for and operand, remove
577 // the value from the other operands. If it results in there being no
578 // mapping, return false, it means the mapping is wrong
579 for (Value *InnerV : SourceOperands) {
580 if (V == InnerV)
581 continue;
582
583 unsigned InnerVal = SourceValueToNumberMapping.find(Val: InnerV)->second;
584 ValueMappingIt = CurrentSrcTgtNumberMapping.find(Val: InnerVal);
585 if (ValueMappingIt == CurrentSrcTgtNumberMapping.end())
586 continue;
587
588 ValueMappingIt->second.erase(V: ValToRemove);
589 if (ValueMappingIt->second.empty())
590 return false;
591 }
592 }
593
594 return true;
595}
596
597/// Determine if operand number \p TargetArgVal is in the current mapping set
598/// for operand number \p SourceArgVal.
599///
600/// \param [in, out] CurrentSrcTgtNumberMapping current mapping of global
601/// value numbers from source IRSimilarityCandidate to target
602/// IRSimilarityCandidate.
603/// \param [in] SourceArgVal The global value number for an operand in the
604/// in the original candidate.
605/// \param [in] TargetArgVal The global value number for the corresponding
606/// operand in the other candidate.
607/// \returns True if there exists a mapping and false if not.
608bool checkNumberingAndReplace(
609 DenseMap<unsigned, DenseSet<unsigned>> &CurrentSrcTgtNumberMapping,
610 unsigned SourceArgVal, unsigned TargetArgVal) {
611 // We are given two unsigned integers representing the global values of
612 // the operands in different IRSimilarityCandidates and a current mapping
613 // between the two.
614 //
615 // Source Operand GVN: 1
616 // Target Operand GVN: 2
617 // CurrentMapping: {1: {1, 2}}
618 //
619 // Since we have mapping, and the target operand is contained in the set, we
620 // update it to:
621 // CurrentMapping: {1: {2}}
622 // and can return true. But, if the mapping was
623 // CurrentMapping: {1: {3}}
624 // we would return false.
625
626 bool WasInserted;
627 DenseMap<unsigned, DenseSet<unsigned>>::iterator Val;
628
629 std::tie(args&: Val, args&: WasInserted) = CurrentSrcTgtNumberMapping.insert(
630 KV: std::make_pair(x&: SourceArgVal, y: DenseSet<unsigned>({TargetArgVal})));
631
632 // If we created a new mapping, then we are done.
633 if (WasInserted)
634 return true;
635
636 // If there is more than one option in the mapping set, and the target value
637 // is included in the mapping set replace that set with one that only includes
638 // the target value, as it is the only valid mapping via the non commutative
639 // instruction.
640
641 DenseSet<unsigned> &TargetSet = Val->second;
642 if (TargetSet.size() > 1 && TargetSet.contains(V: TargetArgVal)) {
643 TargetSet.clear();
644 TargetSet.insert(V: TargetArgVal);
645 return true;
646 }
647
648 // Return true if we can find the value in the set.
649 return TargetSet.contains(V: TargetArgVal);
650}
651
652bool IRSimilarityCandidate::compareNonCommutativeOperandMapping(
653 OperandMapping A, OperandMapping B) {
654 // Iterators to keep track of where we are in the operands for each
655 // Instruction.
656 ArrayRef<Value *>::iterator VItA = A.OperVals.begin();
657 ArrayRef<Value *>::iterator VItB = B.OperVals.begin();
658 unsigned OperandLength = A.OperVals.size();
659
660 // For each operand, get the value numbering and ensure it is consistent.
661 for (unsigned Idx = 0; Idx < OperandLength; Idx++, VItA++, VItB++) {
662 unsigned OperValA = A.IRSC.ValueToNumber.find(Val: *VItA)->second;
663 unsigned OperValB = B.IRSC.ValueToNumber.find(Val: *VItB)->second;
664
665 // Attempt to add a set with only the target value. If there is no mapping
666 // we can create it here.
667 //
668 // For an instruction like a subtraction:
669 // IRSimilarityCandidateA: IRSimilarityCandidateB:
670 // %resultA = sub %a, %b %resultB = sub %d, %e
671 //
672 // We map %a -> %d and %b -> %e.
673 //
674 // And check to see whether their mapping is consistent in
675 // checkNumberingAndReplace.
676
677 if (!checkNumberingAndReplace(CurrentSrcTgtNumberMapping&: A.ValueNumberMapping, SourceArgVal: OperValA, TargetArgVal: OperValB))
678 return false;
679
680 if (!checkNumberingAndReplace(CurrentSrcTgtNumberMapping&: B.ValueNumberMapping, SourceArgVal: OperValB, TargetArgVal: OperValA))
681 return false;
682 }
683 return true;
684}
685
686bool IRSimilarityCandidate::compareCommutativeOperandMapping(
687 OperandMapping A, OperandMapping B) {
688 DenseSet<unsigned> ValueNumbersA;
689 DenseSet<unsigned> ValueNumbersB;
690
691 ArrayRef<Value *>::iterator VItA = A.OperVals.begin();
692 ArrayRef<Value *>::iterator VItB = B.OperVals.begin();
693 unsigned OperandLength = A.OperVals.size();
694
695 // Find the value number sets for the operands.
696 for (unsigned Idx = 0; Idx < OperandLength;
697 Idx++, VItA++, VItB++) {
698 ValueNumbersA.insert(V: A.IRSC.ValueToNumber.find(Val: *VItA)->second);
699 ValueNumbersB.insert(V: B.IRSC.ValueToNumber.find(Val: *VItB)->second);
700 }
701
702 // Iterate over the operands in the first IRSimilarityCandidate and make sure
703 // there exists a possible mapping with the operands in the second
704 // IRSimilarityCandidate.
705 if (!checkNumberingAndReplaceCommutative(SourceValueToNumberMapping: A.IRSC.ValueToNumber,
706 CurrentSrcTgtNumberMapping&: A.ValueNumberMapping, SourceOperands&: A.OperVals,
707 TargetValueNumbers&: ValueNumbersB))
708 return false;
709
710 // Iterate over the operands in the second IRSimilarityCandidate and make sure
711 // there exists a possible mapping with the operands in the first
712 // IRSimilarityCandidate.
713 if (!checkNumberingAndReplaceCommutative(SourceValueToNumberMapping: B.IRSC.ValueToNumber,
714 CurrentSrcTgtNumberMapping&: B.ValueNumberMapping, SourceOperands&: B.OperVals,
715 TargetValueNumbers&: ValueNumbersA))
716 return false;
717
718 return true;
719}
720
721bool IRSimilarityCandidate::compareAssignmentMapping(
722 const unsigned InstValA, const unsigned &InstValB,
723 DenseMap<unsigned, DenseSet<unsigned>> &ValueNumberMappingA,
724 DenseMap<unsigned, DenseSet<unsigned>> &ValueNumberMappingB) {
725 DenseMap<unsigned, DenseSet<unsigned>>::iterator ValueMappingIt;
726 bool WasInserted;
727 std::tie(args&: ValueMappingIt, args&: WasInserted) = ValueNumberMappingA.insert(
728 KV: std::make_pair(x: InstValA, y: DenseSet<unsigned>({InstValB})));
729 if (!WasInserted && !ValueMappingIt->second.contains(V: InstValB))
730 return false;
731 else if (ValueMappingIt->second.size() != 1) {
732 for (unsigned OtherVal : ValueMappingIt->second) {
733 if (OtherVal == InstValB)
734 continue;
735 if (!ValueNumberMappingA.contains(Val: OtherVal))
736 continue;
737 if (!ValueNumberMappingA[OtherVal].contains(V: InstValA))
738 continue;
739 ValueNumberMappingA[OtherVal].erase(V: InstValA);
740 }
741 ValueNumberMappingA.erase(I: ValueMappingIt);
742 std::tie(args&: ValueMappingIt, args&: WasInserted) = ValueNumberMappingA.insert(
743 KV: std::make_pair(x: InstValA, y: DenseSet<unsigned>({InstValB})));
744 }
745
746 return true;
747}
748
749bool IRSimilarityCandidate::checkRelativeLocations(RelativeLocMapping A,
750 RelativeLocMapping B) {
751 // Get the basic blocks the label refers to.
752 BasicBlock *ABB = cast<BasicBlock>(Val: A.OperVal);
753 BasicBlock *BBB = cast<BasicBlock>(Val: B.OperVal);
754
755 // Get the basic blocks contained in each region.
756 DenseSet<BasicBlock *> BasicBlockA;
757 DenseSet<BasicBlock *> BasicBlockB;
758 A.IRSC.getBasicBlocks(BBSet&: BasicBlockA);
759 B.IRSC.getBasicBlocks(BBSet&: BasicBlockB);
760
761 // Determine if the block is contained in the region.
762 bool AContained = BasicBlockA.contains(V: ABB);
763 bool BContained = BasicBlockB.contains(V: BBB);
764
765 // Both blocks need to be contained in the region, or both need to be outside
766 // the region.
767 if (AContained != BContained)
768 return false;
769
770 // If both are contained, then we need to make sure that the relative
771 // distance to the target blocks are the same.
772 if (AContained)
773 return A.RelativeLocation == B.RelativeLocation;
774 return true;
775}
776
777bool IRSimilarityCandidate::compareStructure(const IRSimilarityCandidate &A,
778 const IRSimilarityCandidate &B) {
779 DenseMap<unsigned, DenseSet<unsigned>> MappingA;
780 DenseMap<unsigned, DenseSet<unsigned>> MappingB;
781 return IRSimilarityCandidate::compareStructure(A, B, ValueNumberMappingA&: MappingA, ValueNumberMappingB&: MappingB);
782}
783
784typedef detail::zippy<detail::zip_shortest, SmallVector<int, 4> &,
785 SmallVector<int, 4> &, ArrayRef<Value *> &,
786 ArrayRef<Value *> &>
787 ZippedRelativeLocationsT;
788
789bool IRSimilarityCandidate::compareStructure(
790 const IRSimilarityCandidate &A, const IRSimilarityCandidate &B,
791 DenseMap<unsigned, DenseSet<unsigned>> &ValueNumberMappingA,
792 DenseMap<unsigned, DenseSet<unsigned>> &ValueNumberMappingB) {
793 if (A.getLength() != B.getLength())
794 return false;
795
796 if (A.ValueToNumber.size() != B.ValueToNumber.size())
797 return false;
798
799 iterator ItA = A.begin();
800 iterator ItB = B.begin();
801
802 // These ValueNumber Mapping sets create a create a mapping between the values
803 // in one candidate to values in the other candidate. If we create a set with
804 // one element, and that same element maps to the original element in the
805 // candidate we have a good mapping.
806
807 // Iterate over the instructions contained in each candidate
808 unsigned SectionLength = A.getStartIdx() + A.getLength();
809 for (unsigned Loc = A.getStartIdx(); Loc < SectionLength;
810 ItA++, ItB++, Loc++) {
811 // Make sure the instructions are similar to one another.
812 if (!isClose(A: *ItA, B: *ItB))
813 return false;
814
815 Instruction *IA = ItA->Inst;
816 Instruction *IB = ItB->Inst;
817
818 if (!ItA->Legal || !ItB->Legal)
819 return false;
820
821 // Get the operand sets for the instructions.
822 ArrayRef<Value *> OperValsA = ItA->OperVals;
823 ArrayRef<Value *> OperValsB = ItB->OperVals;
824
825 unsigned InstValA = A.ValueToNumber.find(Val: IA)->second;
826 unsigned InstValB = B.ValueToNumber.find(Val: IB)->second;
827
828 // Ensure that the mappings for the instructions exists.
829 if (!compareAssignmentMapping(InstValA, InstValB, ValueNumberMappingA,
830 ValueNumberMappingB))
831 return false;
832
833 if (!compareAssignmentMapping(InstValA: InstValB, InstValB: InstValA, ValueNumberMappingA&: ValueNumberMappingB,
834 ValueNumberMappingB&: ValueNumberMappingA))
835 return false;
836
837 // We have different paths for commutative instructions and non-commutative
838 // instructions since commutative instructions could allow multiple mappings
839 // to certain values.
840 if (IA->isCommutative() && !isa<FPMathOperator>(Val: IA) &&
841 !isa<IntrinsicInst>(Val: IA)) {
842 if (!compareCommutativeOperandMapping(
843 A: {.IRSC: A, .OperVals: OperValsA, .ValueNumberMapping: ValueNumberMappingA},
844 B: {.IRSC: B, .OperVals: OperValsB, .ValueNumberMapping: ValueNumberMappingB}))
845 return false;
846 continue;
847 }
848
849 // Handle the non-commutative cases.
850 if (!compareNonCommutativeOperandMapping(
851 A: {.IRSC: A, .OperVals: OperValsA, .ValueNumberMapping: ValueNumberMappingA},
852 B: {.IRSC: B, .OperVals: OperValsB, .ValueNumberMapping: ValueNumberMappingB}))
853 return false;
854
855 // Here we check that between two corresponding instructions,
856 // when referring to a basic block in the same region, the
857 // relative locations are the same. And, that the instructions refer to
858 // basic blocks outside the region in the same corresponding locations.
859
860 // We are able to make the assumption about blocks outside of the region
861 // since the target block labels are considered values and will follow the
862 // same number matching that we defined for the other instructions in the
863 // region. So, at this point, in each location we target a specific block
864 // outside the region, we are targeting a corresponding block in each
865 // analagous location in the region we are comparing to.
866 if (!(isa<BranchInst>(Val: IA) && isa<BranchInst>(Val: IB)) &&
867 !(isa<PHINode>(Val: IA) && isa<PHINode>(Val: IB)))
868 continue;
869
870 SmallVector<int, 4> &RelBlockLocsA = ItA->RelativeBlockLocations;
871 SmallVector<int, 4> &RelBlockLocsB = ItB->RelativeBlockLocations;
872 ArrayRef<Value *> ABL = ItA->getBlockOperVals();
873 ArrayRef<Value *> BBL = ItB->getBlockOperVals();
874
875 // Check to make sure that the number of operands, and branching locations
876 // between BranchInsts is the same.
877 if (RelBlockLocsA.size() != RelBlockLocsB.size() &&
878 ABL.size() != BBL.size())
879 return false;
880
881 assert(RelBlockLocsA.size() == ABL.size() &&
882 "Block information vectors not the same size.");
883 assert(RelBlockLocsB.size() == BBL.size() &&
884 "Block information vectors not the same size.");
885
886 ZippedRelativeLocationsT ZippedRelativeLocations =
887 zip(t&: RelBlockLocsA, u&: RelBlockLocsB, args&: ABL, args&: BBL);
888 if (any_of(Range&: ZippedRelativeLocations,
889 P: [&A, &B](std::tuple<int, int, Value *, Value *> R) {
890 return !checkRelativeLocations(
891 A: {.IRSC: A, .RelativeLocation: std::get<0>(t&: R), .OperVal: std::get<2>(t&: R)},
892 B: {.IRSC: B, .RelativeLocation: std::get<1>(t&: R), .OperVal: std::get<3>(t&: R)});
893 }))
894 return false;
895 }
896 return true;
897}
898
899bool IRSimilarityCandidate::overlap(const IRSimilarityCandidate &A,
900 const IRSimilarityCandidate &B) {
901 auto DoesOverlap = [](const IRSimilarityCandidate &X,
902 const IRSimilarityCandidate &Y) {
903 // Check:
904 // XXXXXX X starts before Y ends
905 // YYYYYYY Y starts after X starts
906 return X.StartIdx <= Y.getEndIdx() && Y.StartIdx >= X.StartIdx;
907 };
908
909 return DoesOverlap(A, B) || DoesOverlap(B, A);
910}
911
912void IRSimilarityIdentifier::populateMapper(
913 Module &M, std::vector<IRInstructionData *> &InstrList,
914 std::vector<unsigned> &IntegerMapping) {
915
916 std::vector<IRInstructionData *> InstrListForModule;
917 std::vector<unsigned> IntegerMappingForModule;
918 // Iterate over the functions in the module to map each Instruction in each
919 // BasicBlock to an unsigned integer.
920 Mapper.initializeForBBs(M);
921
922 for (Function &F : M) {
923
924 if (F.empty())
925 continue;
926
927 for (BasicBlock &BB : F) {
928
929 // BB has potential to have similarity since it has a size greater than 2
930 // and can therefore match other regions greater than 2. Map it to a list
931 // of unsigned integers.
932 Mapper.convertToUnsignedVec(BB, InstrList&: InstrListForModule,
933 IntegerMapping&: IntegerMappingForModule);
934 }
935
936 BasicBlock::iterator It = F.begin()->end();
937 Mapper.mapToIllegalUnsigned(It, IntegerMappingForBB&: IntegerMappingForModule, InstrListForBB&: InstrListForModule,
938 End: true);
939 if (InstrListForModule.size() > 0)
940 Mapper.IDL->push_back(Node&: *InstrListForModule.back());
941 }
942
943 // Insert the InstrListForModule at the end of the overall InstrList so that
944 // we can have a long InstrList for the entire set of Modules being analyzed.
945 llvm::append_range(C&: InstrList, R&: InstrListForModule);
946 // Do the same as above, but for IntegerMapping.
947 llvm::append_range(C&: IntegerMapping, R&: IntegerMappingForModule);
948}
949
950void IRSimilarityIdentifier::populateMapper(
951 ArrayRef<std::unique_ptr<Module>> &Modules,
952 std::vector<IRInstructionData *> &InstrList,
953 std::vector<unsigned> &IntegerMapping) {
954
955 // Iterate over, and map the instructions in each module.
956 for (const std::unique_ptr<Module> &M : Modules)
957 populateMapper(M&: *M, InstrList, IntegerMapping);
958}
959
960/// From a repeated subsequence, find all the different instances of the
961/// subsequence from the \p InstrList, and create an IRSimilarityCandidate from
962/// the IRInstructionData in subsequence.
963///
964/// \param [in] Mapper - The instruction mapper for basic correctness checks.
965/// \param [in] InstrList - The vector that holds the instruction data.
966/// \param [in] IntegerMapping - The vector that holds the mapped integers.
967/// \param [out] CandsForRepSubstring - The vector to store the generated
968/// IRSimilarityCandidates.
969static void createCandidatesFromSuffixTree(
970 const IRInstructionMapper& Mapper, std::vector<IRInstructionData *> &InstrList,
971 std::vector<unsigned> &IntegerMapping, SuffixTree::RepeatedSubstring &RS,
972 std::vector<IRSimilarityCandidate> &CandsForRepSubstring) {
973
974 unsigned StringLen = RS.Length;
975 if (StringLen < 2)
976 return;
977
978 // Create an IRSimilarityCandidate for instance of this subsequence \p RS.
979 for (const unsigned &StartIdx : RS.StartIndices) {
980 unsigned EndIdx = StartIdx + StringLen - 1;
981
982 // Check that this subsequence does not contain an illegal instruction.
983 bool ContainsIllegal = false;
984 for (unsigned CurrIdx = StartIdx; CurrIdx <= EndIdx; CurrIdx++) {
985 unsigned Key = IntegerMapping[CurrIdx];
986 if (Key > Mapper.IllegalInstrNumber) {
987 ContainsIllegal = true;
988 break;
989 }
990 }
991
992 // If we have an illegal instruction, we should not create an
993 // IRSimilarityCandidate for this region.
994 if (ContainsIllegal)
995 continue;
996
997 // We are getting iterators to the instructions in this region of code
998 // by advancing the start and end indices from the start of the
999 // InstrList.
1000 std::vector<IRInstructionData *>::iterator StartIt = InstrList.begin();
1001 std::advance(i&: StartIt, n: StartIdx);
1002 std::vector<IRInstructionData *>::iterator EndIt = InstrList.begin();
1003 std::advance(i&: EndIt, n: EndIdx);
1004
1005 CandsForRepSubstring.emplace_back(args: StartIdx, args&: StringLen, args&: *StartIt, args&: *EndIt);
1006 }
1007}
1008
1009void IRSimilarityCandidate::createCanonicalRelationFrom(
1010 IRSimilarityCandidate &SourceCand,
1011 DenseMap<unsigned, DenseSet<unsigned>> &ToSourceMapping,
1012 DenseMap<unsigned, DenseSet<unsigned>> &FromSourceMapping) {
1013 assert(SourceCand.CanonNumToNumber.size() != 0 &&
1014 "Base canonical relationship is empty!");
1015 assert(SourceCand.NumberToCanonNum.size() != 0 &&
1016 "Base canonical relationship is empty!");
1017
1018 assert(CanonNumToNumber.size() == 0 && "Canonical Relationship is non-empty");
1019 assert(NumberToCanonNum.size() == 0 && "Canonical Relationship is non-empty");
1020
1021 DenseSet<unsigned> UsedGVNs;
1022 // Iterate over the mappings provided from this candidate to SourceCand. We
1023 // are then able to map the GVN in this candidate to the same canonical number
1024 // given to the corresponding GVN in SourceCand.
1025 for (std::pair<unsigned, DenseSet<unsigned>> &GVNMapping : ToSourceMapping) {
1026 unsigned SourceGVN = GVNMapping.first;
1027
1028 assert(GVNMapping.second.size() != 0 && "Possible GVNs is 0!");
1029
1030 unsigned ResultGVN;
1031 // We need special handling if we have more than one potential value. This
1032 // means that there are at least two GVNs that could correspond to this GVN.
1033 // This could lead to potential swapping later on, so we make a decision
1034 // here to ensure a one-to-one mapping.
1035 if (GVNMapping.second.size() > 1) {
1036 bool Found = false;
1037 for (unsigned Val : GVNMapping.second) {
1038 // We make sure the target value number hasn't already been reserved.
1039 if (UsedGVNs.contains(V: Val))
1040 continue;
1041
1042 // We make sure that the opposite mapping is still consistent.
1043 DenseMap<unsigned, DenseSet<unsigned>>::iterator It =
1044 FromSourceMapping.find(Val);
1045
1046 if (!It->second.contains(V: SourceGVN))
1047 continue;
1048
1049 // We pick the first item that satisfies these conditions.
1050 Found = true;
1051 ResultGVN = Val;
1052 break;
1053 }
1054
1055 assert(Found && "Could not find matching value for source GVN");
1056 (void)Found;
1057
1058 } else
1059 ResultGVN = *GVNMapping.second.begin();
1060
1061 // Whatever GVN is found, we mark it as used.
1062 UsedGVNs.insert(V: ResultGVN);
1063
1064 unsigned CanonNum = *SourceCand.getCanonicalNum(N: ResultGVN);
1065 CanonNumToNumber.insert(KV: std::make_pair(x&: CanonNum, y&: SourceGVN));
1066 NumberToCanonNum.insert(KV: std::make_pair(x&: SourceGVN, y&: CanonNum));
1067 }
1068
1069 DenseSet<BasicBlock *> BBSet;
1070 getBasicBlocks(BBSet);
1071 // Find canonical numbers for the BasicBlocks in the current candidate.
1072 // This is done by finding the corresponding value for the first instruction
1073 // in the block in the current candidate, finding the matching value in the
1074 // source candidate. Then by finding the parent of this value, use the
1075 // canonical number of the block in the source candidate for the canonical
1076 // number in the current candidate.
1077 for (BasicBlock *BB : BBSet) {
1078 unsigned BBGVNForCurrCand = ValueToNumber.find(Val: BB)->second;
1079
1080 // We can skip the BasicBlock if the canonical numbering has already been
1081 // found in a separate instruction.
1082 if (NumberToCanonNum.contains(Val: BBGVNForCurrCand))
1083 continue;
1084
1085 // If the basic block is the starting block, then the shared instruction may
1086 // not be the first instruction in the block, it will be the first
1087 // instruction in the similarity region.
1088 Value *FirstOutlineInst = BB == getStartBB()
1089 ? frontInstruction()
1090 : &*BB->instructionsWithoutDebug().begin();
1091
1092 unsigned FirstInstGVN = *getGVN(V: FirstOutlineInst);
1093 unsigned FirstInstCanonNum = *getCanonicalNum(N: FirstInstGVN);
1094 unsigned SourceGVN = *SourceCand.fromCanonicalNum(N: FirstInstCanonNum);
1095 Value *SourceV = *SourceCand.fromGVN(Num: SourceGVN);
1096 BasicBlock *SourceBB = cast<Instruction>(Val: SourceV)->getParent();
1097 unsigned SourceBBGVN = *SourceCand.getGVN(V: SourceBB);
1098 unsigned SourceCanonBBGVN = *SourceCand.getCanonicalNum(N: SourceBBGVN);
1099 CanonNumToNumber.insert(KV: std::make_pair(x&: SourceCanonBBGVN, y&: BBGVNForCurrCand));
1100 NumberToCanonNum.insert(KV: std::make_pair(x&: BBGVNForCurrCand, y&: SourceCanonBBGVN));
1101 }
1102}
1103
1104void IRSimilarityCandidate::createCanonicalRelationFrom(
1105 IRSimilarityCandidate &SourceCand, IRSimilarityCandidate &SourceCandLarge,
1106 IRSimilarityCandidate &TargetCandLarge) {
1107 assert(!SourceCand.CanonNumToNumber.empty() &&
1108 "Canonical Relationship is non-empty");
1109 assert(!SourceCand.NumberToCanonNum.empty() &&
1110 "Canonical Relationship is non-empty");
1111
1112 assert(!SourceCandLarge.CanonNumToNumber.empty() &&
1113 "Canonical Relationship is non-empty");
1114 assert(!SourceCandLarge.NumberToCanonNum.empty() &&
1115 "Canonical Relationship is non-empty");
1116
1117 assert(!TargetCandLarge.CanonNumToNumber.empty() &&
1118 "Canonical Relationship is non-empty");
1119 assert(!TargetCandLarge.NumberToCanonNum.empty() &&
1120 "Canonical Relationship is non-empty");
1121
1122 assert(CanonNumToNumber.empty() && "Canonical Relationship is non-empty");
1123 assert(NumberToCanonNum.empty() && "Canonical Relationship is non-empty");
1124
1125 // We're going to use the larger candidates as a "bridge" to create the
1126 // canonical number for the target candidate since we have idetified two
1127 // candidates as subsequences of larger sequences, and therefore must be
1128 // structurally similar.
1129 for (std::pair<Value *, unsigned> &ValueNumPair : ValueToNumber) {
1130 Value *CurrVal = ValueNumPair.first;
1131 unsigned TargetCandGVN = ValueNumPair.second;
1132
1133 // Find the numbering in the large candidate that surrounds the
1134 // current candidate.
1135 std::optional<unsigned> OLargeTargetGVN = TargetCandLarge.getGVN(V: CurrVal);
1136 assert(OLargeTargetGVN.has_value() && "GVN not found for Value");
1137
1138 // Get the canonical numbering in the large target candidate.
1139 std::optional<unsigned> OTargetCandCanon =
1140 TargetCandLarge.getCanonicalNum(N: OLargeTargetGVN.value());
1141 assert(OTargetCandCanon.has_value() &&
1142 "Canononical Number not found for GVN");
1143
1144 // Get the GVN in the large source candidate from the canonical numbering.
1145 std::optional<unsigned> OLargeSourceGVN =
1146 SourceCandLarge.fromCanonicalNum(N: OTargetCandCanon.value());
1147 assert(OLargeSourceGVN.has_value() &&
1148 "GVN Number not found for Canonical Number");
1149
1150 // Get the Value from the GVN in the large source candidate.
1151 std::optional<Value *> OLargeSourceV =
1152 SourceCandLarge.fromGVN(Num: OLargeSourceGVN.value());
1153 assert(OLargeSourceV.has_value() && "Value not found for GVN");
1154
1155 // Get the GVN number for the Value in the source candidate.
1156 std::optional<unsigned> OSourceGVN =
1157 SourceCand.getGVN(V: OLargeSourceV.value());
1158 assert(OSourceGVN.has_value() && "GVN Number not found for Value");
1159
1160 // Get the canonical numbering from the GVN/
1161 std::optional<unsigned> OSourceCanon =
1162 SourceCand.getCanonicalNum(N: OSourceGVN.value());
1163 assert(OSourceCanon.has_value() && "Canon Number not found for GVN");
1164
1165 // Insert the canonical numbering and GVN pair into their respective
1166 // mappings.
1167 CanonNumToNumber.insert(
1168 KV: std::make_pair(x&: OSourceCanon.value(), y&: TargetCandGVN));
1169 NumberToCanonNum.insert(
1170 KV: std::make_pair(x&: TargetCandGVN, y&: OSourceCanon.value()));
1171 }
1172}
1173
1174void IRSimilarityCandidate::createCanonicalMappingFor(
1175 IRSimilarityCandidate &CurrCand) {
1176 assert(CurrCand.CanonNumToNumber.size() == 0 &&
1177 "Canonical Relationship is non-empty");
1178 assert(CurrCand.NumberToCanonNum.size() == 0 &&
1179 "Canonical Relationship is non-empty");
1180
1181 unsigned CanonNum = 0;
1182 // Iterate over the value numbers found, the order does not matter in this
1183 // case.
1184 for (std::pair<unsigned, Value *> &NumToVal : CurrCand.NumberToValue) {
1185 CurrCand.NumberToCanonNum.insert(KV: std::make_pair(x&: NumToVal.first, y&: CanonNum));
1186 CurrCand.CanonNumToNumber.insert(KV: std::make_pair(x&: CanonNum, y&: NumToVal.first));
1187 CanonNum++;
1188 }
1189}
1190
1191/// Look for larger IRSimilarityCandidates From the previously matched
1192/// IRSimilarityCandidates that fully contain \p CandA or \p CandB. If there is
1193/// an overlap, return a pair of structurally similar, larger
1194/// IRSimilarityCandidates.
1195///
1196/// \param [in] CandA - The first candidate we are trying to determine the
1197/// structure of.
1198/// \param [in] CandB - The second candidate we are trying to determine the
1199/// structure of.
1200/// \param [in] IndexToIncludedCand - Mapping of index of the an instruction in
1201/// a circuit to the IRSimilarityCandidates that include this instruction.
1202/// \param [in] CandToOverallGroup - Mapping of IRSimilarityCandidate to a
1203/// number representing the structural group assigned to it.
1204static std::optional<
1205 std::pair<IRSimilarityCandidate *, IRSimilarityCandidate *>>
1206CheckLargerCands(
1207 IRSimilarityCandidate &CandA, IRSimilarityCandidate &CandB,
1208 DenseMap<unsigned, DenseSet<IRSimilarityCandidate *>> &IndexToIncludedCand,
1209 DenseMap<IRSimilarityCandidate *, unsigned> &CandToGroup) {
1210 DenseMap<unsigned, IRSimilarityCandidate *> IncludedGroupAndCandA;
1211 DenseMap<unsigned, IRSimilarityCandidate *> IncludedGroupAndCandB;
1212 DenseSet<unsigned> IncludedGroupsA;
1213 DenseSet<unsigned> IncludedGroupsB;
1214
1215 // Find the overall similarity group numbers that fully contain the candidate,
1216 // and record the larger candidate for each group.
1217 auto IdxToCandidateIt = IndexToIncludedCand.find(Val: CandA.getStartIdx());
1218 std::optional<std::pair<IRSimilarityCandidate *, IRSimilarityCandidate *>>
1219 Result;
1220
1221 unsigned CandAStart = CandA.getStartIdx();
1222 unsigned CandAEnd = CandA.getEndIdx();
1223 unsigned CandBStart = CandB.getStartIdx();
1224 unsigned CandBEnd = CandB.getEndIdx();
1225 if (IdxToCandidateIt == IndexToIncludedCand.end())
1226 return Result;
1227 for (IRSimilarityCandidate *MatchedCand : IdxToCandidateIt->second) {
1228 if (MatchedCand->getStartIdx() > CandAStart ||
1229 (MatchedCand->getEndIdx() < CandAEnd))
1230 continue;
1231 unsigned GroupNum = CandToGroup.find(Val: MatchedCand)->second;
1232 IncludedGroupAndCandA.insert(KV: std::make_pair(x&: GroupNum, y&: MatchedCand));
1233 IncludedGroupsA.insert(V: GroupNum);
1234 }
1235
1236 // Find the overall similarity group numbers that fully contain the next
1237 // candidate, and record the larger candidate for each group.
1238 IdxToCandidateIt = IndexToIncludedCand.find(Val: CandBStart);
1239 if (IdxToCandidateIt == IndexToIncludedCand.end())
1240 return Result;
1241 for (IRSimilarityCandidate *MatchedCand : IdxToCandidateIt->second) {
1242 if (MatchedCand->getStartIdx() > CandBStart ||
1243 MatchedCand->getEndIdx() < CandBEnd)
1244 continue;
1245 unsigned GroupNum = CandToGroup.find(Val: MatchedCand)->second;
1246 IncludedGroupAndCandB.insert(KV: std::make_pair(x&: GroupNum, y&: MatchedCand));
1247 IncludedGroupsB.insert(V: GroupNum);
1248 }
1249
1250 // Find the intersection between the two groups, these are the groups where
1251 // the larger candidates exist.
1252 set_intersect(S1&: IncludedGroupsA, S2: IncludedGroupsB);
1253
1254 // If there is no intersection between the sets, then we cannot determine
1255 // whether or not there is a match.
1256 if (IncludedGroupsA.empty())
1257 return Result;
1258
1259 // Create a pair that contains the larger candidates.
1260 auto ItA = IncludedGroupAndCandA.find(Val: *IncludedGroupsA.begin());
1261 auto ItB = IncludedGroupAndCandB.find(Val: *IncludedGroupsA.begin());
1262 Result = std::make_pair(x&: ItA->second, y&: ItB->second);
1263 return Result;
1264}
1265
1266/// From the list of IRSimilarityCandidates, perform a comparison between each
1267/// IRSimilarityCandidate to determine if there are overlapping
1268/// IRInstructionData, or if they do not have the same structure.
1269///
1270/// \param [in] CandsForRepSubstring - The vector containing the
1271/// IRSimilarityCandidates.
1272/// \param [out] StructuralGroups - the mapping of unsigned integers to vector
1273/// of IRSimilarityCandidates where each of the IRSimilarityCandidates in the
1274/// vector are structurally similar to one another.
1275/// \param [in] IndexToIncludedCand - Mapping of index of the an instruction in
1276/// a circuit to the IRSimilarityCandidates that include this instruction.
1277/// \param [in] CandToOverallGroup - Mapping of IRSimilarityCandidate to a
1278/// number representing the structural group assigned to it.
1279static void findCandidateStructures(
1280 std::vector<IRSimilarityCandidate> &CandsForRepSubstring,
1281 DenseMap<unsigned, SimilarityGroup> &StructuralGroups,
1282 DenseMap<unsigned, DenseSet<IRSimilarityCandidate *>> &IndexToIncludedCand,
1283 DenseMap<IRSimilarityCandidate *, unsigned> &CandToOverallGroup
1284 ) {
1285 std::vector<IRSimilarityCandidate>::iterator CandIt, CandEndIt, InnerCandIt,
1286 InnerCandEndIt;
1287
1288 // IRSimilarityCandidates each have a structure for operand use. It is
1289 // possible that two instances of the same subsequences have different
1290 // structure. Each type of structure found is assigned a number. This
1291 // DenseMap maps an IRSimilarityCandidate to which type of similarity
1292 // discovered it fits within.
1293 DenseMap<IRSimilarityCandidate *, unsigned> CandToGroup;
1294
1295 // Find the compatibility from each candidate to the others to determine
1296 // which candidates overlap and which have the same structure by mapping
1297 // each structure to a different group.
1298 bool SameStructure;
1299 bool Inserted;
1300 unsigned CurrentGroupNum = 0;
1301 unsigned OuterGroupNum;
1302 DenseMap<IRSimilarityCandidate *, unsigned>::iterator CandToGroupIt;
1303 DenseMap<IRSimilarityCandidate *, unsigned>::iterator CandToGroupItInner;
1304 DenseMap<unsigned, SimilarityGroup>::iterator CurrentGroupPair;
1305
1306 // Iterate over the candidates to determine its structural and overlapping
1307 // compatibility with other instructions
1308 DenseMap<unsigned, DenseSet<unsigned>> ValueNumberMappingA;
1309 DenseMap<unsigned, DenseSet<unsigned>> ValueNumberMappingB;
1310 for (CandIt = CandsForRepSubstring.begin(),
1311 CandEndIt = CandsForRepSubstring.end();
1312 CandIt != CandEndIt; CandIt++) {
1313
1314 // Determine if it has an assigned structural group already.
1315 CandToGroupIt = CandToGroup.find(Val: &*CandIt);
1316 if (CandToGroupIt == CandToGroup.end()) {
1317 // If not, we assign it one, and add it to our mapping.
1318 std::tie(args&: CandToGroupIt, args&: Inserted) =
1319 CandToGroup.insert(KV: std::make_pair(x: &*CandIt, y: CurrentGroupNum++));
1320 }
1321
1322 // Get the structural group number from the iterator.
1323 OuterGroupNum = CandToGroupIt->second;
1324
1325 // Check if we already have a list of IRSimilarityCandidates for the current
1326 // structural group. Create one if one does not exist.
1327 CurrentGroupPair = StructuralGroups.find(Val: OuterGroupNum);
1328 if (CurrentGroupPair == StructuralGroups.end()) {
1329 IRSimilarityCandidate::createCanonicalMappingFor(CurrCand&: *CandIt);
1330 std::tie(args&: CurrentGroupPair, args&: Inserted) = StructuralGroups.insert(
1331 KV: std::make_pair(x&: OuterGroupNum, y: SimilarityGroup({*CandIt})));
1332 }
1333
1334 // Iterate over the IRSimilarityCandidates following the current
1335 // IRSimilarityCandidate in the list to determine whether the two
1336 // IRSimilarityCandidates are compatible. This is so we do not repeat pairs
1337 // of IRSimilarityCandidates.
1338 for (InnerCandIt = std::next(x: CandIt),
1339 InnerCandEndIt = CandsForRepSubstring.end();
1340 InnerCandIt != InnerCandEndIt; InnerCandIt++) {
1341
1342 // We check if the inner item has a group already, if it does, we skip it.
1343 CandToGroupItInner = CandToGroup.find(Val: &*InnerCandIt);
1344 if (CandToGroupItInner != CandToGroup.end())
1345 continue;
1346
1347 // Check if we have found structural similarity between two candidates
1348 // that fully contains the first and second candidates.
1349 std::optional<std::pair<IRSimilarityCandidate *, IRSimilarityCandidate *>>
1350 LargerPair = CheckLargerCands(
1351 CandA&: *CandIt, CandB&: *InnerCandIt, IndexToIncludedCand, CandToGroup&: CandToOverallGroup);
1352
1353 // If a pair was found, it means that we can assume that these smaller
1354 // substrings are also structurally similar. Use the larger candidates to
1355 // determine the canonical mapping between the two sections.
1356 if (LargerPair.has_value()) {
1357 SameStructure = true;
1358 InnerCandIt->createCanonicalRelationFrom(
1359 SourceCand&: *CandIt, SourceCandLarge&: *LargerPair.value().first, TargetCandLarge&: *LargerPair.value().second);
1360 CandToGroup.insert(KV: std::make_pair(x: &*InnerCandIt, y&: OuterGroupNum));
1361 CurrentGroupPair->second.push_back(x: *InnerCandIt);
1362 continue;
1363 }
1364
1365 // Otherwise we determine if they have the same structure and add it to
1366 // vector if they match.
1367 ValueNumberMappingA.clear();
1368 ValueNumberMappingB.clear();
1369 SameStructure = IRSimilarityCandidate::compareStructure(
1370 A: *CandIt, B: *InnerCandIt, ValueNumberMappingA, ValueNumberMappingB);
1371 if (!SameStructure)
1372 continue;
1373
1374 InnerCandIt->createCanonicalRelationFrom(SourceCand&: *CandIt, ToSourceMapping&: ValueNumberMappingA,
1375 FromSourceMapping&: ValueNumberMappingB);
1376 CandToGroup.insert(KV: std::make_pair(x: &*InnerCandIt, y&: OuterGroupNum));
1377 CurrentGroupPair->second.push_back(x: *InnerCandIt);
1378 }
1379 }
1380}
1381
1382void IRSimilarityIdentifier::findCandidates(
1383 std::vector<IRInstructionData *> &InstrList,
1384 std::vector<unsigned> &IntegerMapping) {
1385 SuffixTree ST(IntegerMapping);
1386
1387 std::vector<IRSimilarityCandidate> CandsForRepSubstring;
1388 std::vector<SimilarityGroup> NewCandidateGroups;
1389
1390 DenseMap<unsigned, SimilarityGroup> StructuralGroups;
1391 DenseMap<unsigned, DenseSet<IRSimilarityCandidate *>> IndexToIncludedCand;
1392 DenseMap<IRSimilarityCandidate *, unsigned> CandToGroup;
1393
1394 // Iterate over the subsequences found by the Suffix Tree to create
1395 // IRSimilarityCandidates for each repeated subsequence and determine which
1396 // instances are structurally similar to one another.
1397
1398 // Sort the suffix tree from longest substring to shortest.
1399 std::vector<SuffixTree::RepeatedSubstring> RSes;
1400 for (SuffixTree::RepeatedSubstring &RS : ST)
1401 RSes.push_back(x: RS);
1402
1403 llvm::stable_sort(Range&: RSes, C: [](const SuffixTree::RepeatedSubstring &LHS,
1404 const SuffixTree::RepeatedSubstring &RHS) {
1405 return LHS.Length > RHS.Length;
1406 });
1407 for (SuffixTree::RepeatedSubstring &RS : RSes) {
1408 createCandidatesFromSuffixTree(Mapper, InstrList, IntegerMapping, RS,
1409 CandsForRepSubstring);
1410
1411 if (CandsForRepSubstring.size() < 2)
1412 continue;
1413
1414 findCandidateStructures(CandsForRepSubstring, StructuralGroups,
1415 IndexToIncludedCand, CandToOverallGroup&: CandToGroup);
1416 for (std::pair<unsigned, SimilarityGroup> &Group : StructuralGroups) {
1417 // We only add the group if it contains more than one
1418 // IRSimilarityCandidate. If there is only one, that means there is no
1419 // other repeated subsequence with the same structure.
1420 if (Group.second.size() > 1) {
1421 SimilarityCandidates->push_back(x: Group.second);
1422 // Iterate over each candidate in the group, and add an entry for each
1423 // instruction included with a mapping to a set of
1424 // IRSimilarityCandidates that include that instruction.
1425 for (IRSimilarityCandidate &IRCand : SimilarityCandidates->back()) {
1426 for (unsigned Idx = IRCand.getStartIdx(), Edx = IRCand.getEndIdx();
1427 Idx <= Edx; ++Idx) {
1428 DenseMap<unsigned, DenseSet<IRSimilarityCandidate *>>::iterator
1429 IdIt;
1430 IdIt = IndexToIncludedCand.find(Val: Idx);
1431 bool Inserted = false;
1432 if (IdIt == IndexToIncludedCand.end())
1433 std::tie(args&: IdIt, args&: Inserted) = IndexToIncludedCand.insert(
1434 KV: std::make_pair(x&: Idx, y: DenseSet<IRSimilarityCandidate *>()));
1435 IdIt->second.insert(V: &IRCand);
1436 }
1437 // Add mapping of candidate to the overall similarity group number.
1438 CandToGroup.insert(
1439 KV: std::make_pair(x: &IRCand, y: SimilarityCandidates->size() - 1));
1440 }
1441 }
1442 }
1443
1444 CandsForRepSubstring.clear();
1445 StructuralGroups.clear();
1446 NewCandidateGroups.clear();
1447 }
1448}
1449
1450SimilarityGroupList &IRSimilarityIdentifier::findSimilarity(
1451 ArrayRef<std::unique_ptr<Module>> Modules) {
1452 resetSimilarityCandidates();
1453
1454 std::vector<IRInstructionData *> InstrList;
1455 std::vector<unsigned> IntegerMapping;
1456 Mapper.InstClassifier.EnableBranches = this->EnableBranches;
1457 Mapper.InstClassifier.EnableIndirectCalls = EnableIndirectCalls;
1458 Mapper.EnableMatchCallsByName = EnableMatchingCallsByName;
1459 Mapper.InstClassifier.EnableIntrinsics = EnableIntrinsics;
1460 Mapper.InstClassifier.EnableMustTailCalls = EnableMustTailCalls;
1461
1462 populateMapper(Modules, InstrList, IntegerMapping);
1463 findCandidates(InstrList, IntegerMapping);
1464
1465 return *SimilarityCandidates;
1466}
1467
1468SimilarityGroupList &IRSimilarityIdentifier::findSimilarity(Module &M) {
1469 resetSimilarityCandidates();
1470 Mapper.InstClassifier.EnableBranches = this->EnableBranches;
1471 Mapper.InstClassifier.EnableIndirectCalls = EnableIndirectCalls;
1472 Mapper.EnableMatchCallsByName = EnableMatchingCallsByName;
1473 Mapper.InstClassifier.EnableIntrinsics = EnableIntrinsics;
1474 Mapper.InstClassifier.EnableMustTailCalls = EnableMustTailCalls;
1475
1476 std::vector<IRInstructionData *> InstrList;
1477 std::vector<unsigned> IntegerMapping;
1478
1479 populateMapper(M, InstrList, IntegerMapping);
1480 findCandidates(InstrList, IntegerMapping);
1481
1482 return *SimilarityCandidates;
1483}
1484
1485INITIALIZE_PASS(IRSimilarityIdentifierWrapperPass, "ir-similarity-identifier",
1486 "ir-similarity-identifier", false, true)
1487
1488IRSimilarityIdentifierWrapperPass::IRSimilarityIdentifierWrapperPass()
1489 : ModulePass(ID) {
1490 initializeIRSimilarityIdentifierWrapperPassPass(
1491 Registry&: *PassRegistry::getPassRegistry());
1492}
1493
1494bool IRSimilarityIdentifierWrapperPass::doInitialization(Module &M) {
1495 IRSI.reset(p: new IRSimilarityIdentifier(!DisableBranches, !DisableIndirectCalls,
1496 MatchCallsByName, !DisableIntrinsics,
1497 false));
1498 return false;
1499}
1500
1501bool IRSimilarityIdentifierWrapperPass::doFinalization(Module &M) {
1502 IRSI.reset();
1503 return false;
1504}
1505
1506bool IRSimilarityIdentifierWrapperPass::runOnModule(Module &M) {
1507 IRSI->findSimilarity(M);
1508 return false;
1509}
1510
1511AnalysisKey IRSimilarityAnalysis::Key;
1512IRSimilarityIdentifier IRSimilarityAnalysis::run(Module &M,
1513 ModuleAnalysisManager &) {
1514 auto IRSI = IRSimilarityIdentifier(!DisableBranches, !DisableIndirectCalls,
1515 MatchCallsByName, !DisableIntrinsics,
1516 false);
1517 IRSI.findSimilarity(M);
1518 return IRSI;
1519}
1520
1521PreservedAnalyses
1522IRSimilarityAnalysisPrinterPass::run(Module &M, ModuleAnalysisManager &AM) {
1523 IRSimilarityIdentifier &IRSI = AM.getResult<IRSimilarityAnalysis>(IR&: M);
1524 std::optional<SimilarityGroupList> &SimilarityCandidatesOpt =
1525 IRSI.getSimilarity();
1526
1527 for (std::vector<IRSimilarityCandidate> &CandVec : *SimilarityCandidatesOpt) {
1528 OS << CandVec.size() << " candidates of length "
1529 << CandVec.begin()->getLength() << ". Found in: \n";
1530 for (IRSimilarityCandidate &Cand : CandVec) {
1531 OS << " Function: " << Cand.front()->Inst->getFunction()->getName().str()
1532 << ", Basic Block: ";
1533 if (Cand.front()->Inst->getParent()->getName().str() == "")
1534 OS << "(unnamed)";
1535 else
1536 OS << Cand.front()->Inst->getParent()->getName().str();
1537 OS << "\n Start Instruction: ";
1538 Cand.frontInstruction()->print(O&: OS);
1539 OS << "\n End Instruction: ";
1540 Cand.backInstruction()->print(O&: OS);
1541 OS << "\n";
1542 }
1543 }
1544
1545 return PreservedAnalyses::all();
1546}
1547
1548char IRSimilarityIdentifierWrapperPass::ID = 0;
1549