1//===- IRSimilarityIdentifier.cpp - Find similarity in a module -----------===//
2//
3// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4// See https://llvm.org/LICENSE.txt for license information.
5// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6//
7//===----------------------------------------------------------------------===//
8//
9// \file
10// Implementation file for the IRSimilarityIdentifier for identifying
11// similarities in IR including the IRInstructionMapper.
12//
13//===----------------------------------------------------------------------===//
14
15#include "llvm/Analysis/IRSimilarityIdentifier.h"
16#include "llvm/ADT/DenseMap.h"
17#include "llvm/ADT/SetOperations.h"
18#include "llvm/IR/Intrinsics.h"
19#include "llvm/IR/Operator.h"
20#include "llvm/IR/User.h"
21#include "llvm/InitializePasses.h"
22#include "llvm/Support/SuffixTree.h"
23
24using namespace llvm;
25using namespace IRSimilarity;
26
27namespace llvm {
28cl::opt<bool>
29 DisableBranches("no-ir-sim-branch-matching", cl::init(Val: false),
30 cl::ReallyHidden,
31 cl::desc("disable similarity matching, and outlining, "
32 "across branches for debugging purposes."));
33
34cl::opt<bool>
35 DisableIndirectCalls("no-ir-sim-indirect-calls", cl::init(Val: false),
36 cl::ReallyHidden,
37 cl::desc("disable outlining indirect calls."));
38
39static cl::opt<bool>
40 MatchCallsByName("ir-sim-calls-by-name", cl::init(Val: false), cl::ReallyHidden,
41 cl::desc("only allow matching call instructions if the "
42 "name and type signature match."));
43
44cl::opt<bool>
45 DisableIntrinsics("no-ir-sim-intrinsics", cl::init(Val: false), cl::ReallyHidden,
46 cl::desc("Don't match or outline intrinsics"));
47} // namespace llvm
48
49IRInstructionData::IRInstructionData(Instruction &I, bool Legality,
50 IRInstructionDataList &IDList)
51 : Inst(&I), Legal(Legality), IDL(&IDList) {
52 initializeInstruction();
53}
54
55void IRInstructionData::initializeInstruction() {
56 // We check for whether we have a comparison instruction. If it is, we
57 // find the "less than" version of the predicate for consistency for
58 // comparison instructions throught the program.
59 if (CmpInst *C = dyn_cast<CmpInst>(Val: Inst)) {
60 CmpInst::Predicate Predicate = predicateForConsistency(CI: C);
61 if (Predicate != C->getPredicate())
62 RevisedPredicate = Predicate;
63 }
64
65 // Here we collect the operands and their types for determining whether
66 // the structure of the operand use matches between two different candidates.
67 for (Use &OI : Inst->operands()) {
68 if (isa<CmpInst>(Val: Inst) && RevisedPredicate) {
69 // If we have a CmpInst where the predicate is reversed, it means the
70 // operands must be reversed as well.
71 OperVals.insert(I: OperVals.begin(), Elt: OI.get());
72 continue;
73 }
74
75 OperVals.push_back(Elt: OI.get());
76 }
77
78 // We capture the incoming BasicBlocks as values as well as the incoming
79 // Values in order to check for structural similarity.
80 if (PHINode *PN = dyn_cast<PHINode>(Val: Inst))
81 llvm::append_range(C&: OperVals, R: PN->blocks());
82}
83
84IRInstructionData::IRInstructionData(IRInstructionDataList &IDList)
85 : IDL(&IDList) {}
86
87void IRInstructionData::setBranchSuccessors(
88 DenseMap<BasicBlock *, unsigned> &BasicBlockToInteger) {
89 assert(isa<BranchInst>(Inst) && "Instruction must be branch");
90
91 BranchInst *BI = cast<BranchInst>(Val: Inst);
92 DenseMap<BasicBlock *, unsigned>::iterator BBNumIt;
93
94 BBNumIt = BasicBlockToInteger.find(Val: BI->getParent());
95 assert(BBNumIt != BasicBlockToInteger.end() &&
96 "Could not find location for BasicBlock!");
97
98 int CurrentBlockNumber = static_cast<int>(BBNumIt->second);
99
100 for (Value *V : getBlockOperVals()) {
101 BasicBlock *Successor = cast<BasicBlock>(Val: V);
102 BBNumIt = BasicBlockToInteger.find(Val: Successor);
103 assert(BBNumIt != BasicBlockToInteger.end() &&
104 "Could not find number for BasicBlock!");
105 int OtherBlockNumber = static_cast<int>(BBNumIt->second);
106
107 int Relative = OtherBlockNumber - CurrentBlockNumber;
108 RelativeBlockLocations.push_back(Elt: Relative);
109 }
110}
111
112ArrayRef<Value *> IRInstructionData::getBlockOperVals() {
113 assert((isa<BranchInst>(Inst) ||
114 isa<PHINode>(Inst)) && "Instruction must be branch or PHINode");
115
116 if (BranchInst *BI = dyn_cast<BranchInst>(Val: Inst))
117 return ArrayRef<Value *>(
118 std::next(x: OperVals.begin(), n: BI->isConditional() ? 1 : 0),
119 OperVals.end()
120 );
121
122 if (PHINode *PN = dyn_cast<PHINode>(Val: Inst))
123 return ArrayRef<Value *>(
124 std::next(x: OperVals.begin(), n: PN->getNumIncomingValues()),
125 OperVals.end()
126 );
127
128 return ArrayRef<Value *>();
129}
130
131void IRInstructionData::setCalleeName(bool MatchByName) {
132 CallInst *CI = dyn_cast<CallInst>(Val: Inst);
133 assert(CI && "Instruction must be call");
134
135 CalleeName = "";
136 if (IntrinsicInst *II = dyn_cast<IntrinsicInst>(Val: Inst)) {
137 // To hash intrinsics, we use the opcode, and types like the other
138 // instructions, but also, the Intrinsic ID, and the Name of the
139 // intrinsic.
140 Intrinsic::ID IntrinsicID = II->getIntrinsicID();
141 FunctionType *FT = II->getFunctionType();
142 // If there is an overloaded name, we have to use the complex version
143 // of getName to get the entire string.
144 if (Intrinsic::isOverloaded(id: IntrinsicID))
145 CalleeName =
146 Intrinsic::getName(Id: IntrinsicID, Tys: FT->params(), M: II->getModule(), FT);
147 // If there is not an overloaded name, we only need to use this version.
148 else
149 CalleeName = Intrinsic::getName(id: IntrinsicID).str();
150
151 return;
152 }
153
154 if (!CI->isIndirectCall() && MatchByName)
155 CalleeName = CI->getCalledFunction()->getName().str();
156}
157
158void IRInstructionData::setPHIPredecessors(
159 DenseMap<BasicBlock *, unsigned> &BasicBlockToInteger) {
160 assert(isa<PHINode>(Inst) && "Instruction must be phi node");
161
162 PHINode *PN = cast<PHINode>(Val: Inst);
163 DenseMap<BasicBlock *, unsigned>::iterator BBNumIt;
164
165 BBNumIt = BasicBlockToInteger.find(Val: PN->getParent());
166 assert(BBNumIt != BasicBlockToInteger.end() &&
167 "Could not find location for BasicBlock!");
168
169 int CurrentBlockNumber = static_cast<int>(BBNumIt->second);
170
171 // Convert the incoming blocks of the PHINode to an integer value, based on
172 // the relative distances between the current block and the incoming block.
173 for (unsigned Idx = 0; Idx < PN->getNumIncomingValues(); Idx++) {
174 BasicBlock *Incoming = PN->getIncomingBlock(i: Idx);
175 BBNumIt = BasicBlockToInteger.find(Val: Incoming);
176 assert(BBNumIt != BasicBlockToInteger.end() &&
177 "Could not find number for BasicBlock!");
178 int OtherBlockNumber = static_cast<int>(BBNumIt->second);
179
180 int Relative = OtherBlockNumber - CurrentBlockNumber;
181 RelativeBlockLocations.push_back(Elt: Relative);
182 }
183}
184
185CmpInst::Predicate IRInstructionData::predicateForConsistency(CmpInst *CI) {
186 switch (CI->getPredicate()) {
187 case CmpInst::FCMP_OGT:
188 case CmpInst::FCMP_UGT:
189 case CmpInst::FCMP_OGE:
190 case CmpInst::FCMP_UGE:
191 case CmpInst::ICMP_SGT:
192 case CmpInst::ICMP_UGT:
193 case CmpInst::ICMP_SGE:
194 case CmpInst::ICMP_UGE:
195 return CI->getSwappedPredicate();
196 default:
197 return CI->getPredicate();
198 }
199}
200
201CmpInst::Predicate IRInstructionData::getPredicate() const {
202 assert(isa<CmpInst>(Inst) &&
203 "Can only get a predicate from a compare instruction");
204
205 if (RevisedPredicate)
206 return *RevisedPredicate;
207
208 return cast<CmpInst>(Val: Inst)->getPredicate();
209}
210
211StringRef IRInstructionData::getCalleeName() const {
212 assert(isa<CallInst>(Inst) &&
213 "Can only get a name from a call instruction");
214
215 assert(CalleeName && "CalleeName has not been set");
216
217 return *CalleeName;
218}
219
220bool IRSimilarity::isClose(const IRInstructionData &A,
221 const IRInstructionData &B) {
222
223 if (!A.Legal || !B.Legal)
224 return false;
225
226 // Check if we are performing the same sort of operation on the same types
227 // but not on the same values.
228 if (!A.Inst->isSameOperationAs(I: B.Inst)) {
229 // If there is a predicate, this means that either there is a swapped
230 // predicate, or that the types are different, we want to make sure that
231 // the predicates are equivalent via swapping.
232 if (isa<CmpInst>(Val: A.Inst) && isa<CmpInst>(Val: B.Inst)) {
233
234 if (A.getPredicate() != B.getPredicate())
235 return false;
236
237 // If the predicates are the same via swap, make sure that the types are
238 // still the same.
239 auto ZippedTypes = zip(t: A.OperVals, u: B.OperVals);
240
241 return all_of(
242 Range&: ZippedTypes, P: [](std::tuple<llvm::Value *, llvm::Value *> R) {
243 return std::get<0>(t&: R)->getType() == std::get<1>(t&: R)->getType();
244 });
245 }
246
247 return false;
248 }
249
250 // Since any GEP Instruction operands after the first operand cannot be
251 // defined by a register, we must make sure that the operands after the first
252 // are the same in the two instructions
253 if (auto *GEP = dyn_cast<GetElementPtrInst>(Val: A.Inst)) {
254 auto *OtherGEP = cast<GetElementPtrInst>(Val: B.Inst);
255
256 // If the instructions do not have the same inbounds restrictions, we do
257 // not consider them the same.
258 if (GEP->isInBounds() != OtherGEP->isInBounds())
259 return false;
260
261 auto ZippedOperands = zip(t: GEP->indices(), u: OtherGEP->indices());
262
263 // We increment here since we do not care about the first instruction,
264 // we only care about the following operands since they must be the
265 // exact same to be considered similar.
266 return all_of(Range: drop_begin(RangeOrContainer&: ZippedOperands),
267 P: [](std::tuple<llvm::Use &, llvm::Use &> R) {
268 return std::get<0>(t&: R) == std::get<1>(t&: R);
269 });
270 }
271
272 // If the instructions are functions calls, we make sure that the function
273 // name is the same. We already know that the types are since is
274 // isSameOperationAs is true.
275 if (isa<CallInst>(Val: A.Inst) && isa<CallInst>(Val: B.Inst)) {
276 if (A.getCalleeName() != B.getCalleeName())
277 return false;
278 }
279
280 if (isa<BranchInst>(Val: A.Inst) && isa<BranchInst>(Val: B.Inst) &&
281 A.RelativeBlockLocations.size() != B.RelativeBlockLocations.size())
282 return false;
283
284 return true;
285}
286
287// TODO: This is the same as the MachineOutliner, and should be consolidated
288// into the same interface.
289void IRInstructionMapper::convertToUnsignedVec(
290 BasicBlock &BB, std::vector<IRInstructionData *> &InstrList,
291 std::vector<unsigned> &IntegerMapping) {
292 BasicBlock::iterator It = BB.begin();
293
294 std::vector<unsigned> IntegerMappingForBB;
295 std::vector<IRInstructionData *> InstrListForBB;
296
297 for (BasicBlock::iterator Et = BB.end(); It != Et; ++It) {
298 switch (InstClassifier.visit(I&: *It)) {
299 case InstrType::Legal:
300 mapToLegalUnsigned(It, IntegerMappingForBB, InstrListForBB);
301 break;
302 case InstrType::Illegal:
303 mapToIllegalUnsigned(It, IntegerMappingForBB, InstrListForBB);
304 break;
305 case InstrType::Invisible:
306 AddedIllegalLastTime = false;
307 break;
308 }
309 }
310
311 if (AddedIllegalLastTime)
312 mapToIllegalUnsigned(It, IntegerMappingForBB, InstrListForBB, End: true);
313 for (IRInstructionData *ID : InstrListForBB)
314 this->IDL->push_back(Node&: *ID);
315 llvm::append_range(C&: InstrList, R&: InstrListForBB);
316 llvm::append_range(C&: IntegerMapping, R&: IntegerMappingForBB);
317}
318
319// TODO: This is the same as the MachineOutliner, and should be consolidated
320// into the same interface.
321unsigned IRInstructionMapper::mapToLegalUnsigned(
322 BasicBlock::iterator &It, std::vector<unsigned> &IntegerMappingForBB,
323 std::vector<IRInstructionData *> &InstrListForBB) {
324 // We added something legal, so we should unset the AddedLegalLastTime
325 // flag.
326 AddedIllegalLastTime = false;
327
328 // If we have at least two adjacent legal instructions (which may have
329 // invisible instructions in between), remember that.
330 if (CanCombineWithPrevInstr)
331 HaveLegalRange = true;
332 CanCombineWithPrevInstr = true;
333
334 // Get the integer for this instruction or give it the current
335 // LegalInstrNumber.
336 IRInstructionData *ID = allocateIRInstructionData(I&: *It, Legality: true, IDL&: *IDL);
337 InstrListForBB.push_back(x: ID);
338
339 if (isa<BranchInst>(Val: *It))
340 ID->setBranchSuccessors(BasicBlockToInteger);
341
342 if (isa<CallInst>(Val: *It))
343 ID->setCalleeName(EnableMatchCallsByName);
344
345 if (isa<PHINode>(Val: *It))
346 ID->setPHIPredecessors(BasicBlockToInteger);
347
348 // Add to the instruction list
349 bool WasInserted;
350 DenseMap<IRInstructionData *, unsigned, IRInstructionDataTraits>::iterator
351 ResultIt;
352 std::tie(args&: ResultIt, args&: WasInserted) =
353 InstructionIntegerMap.insert(KV: std::make_pair(x&: ID, y&: LegalInstrNumber));
354 unsigned INumber = ResultIt->second;
355
356 // There was an insertion.
357 if (WasInserted)
358 LegalInstrNumber++;
359
360 IntegerMappingForBB.push_back(x: INumber);
361
362 // Make sure we don't overflow or use any integers reserved by the DenseMap.
363 assert(LegalInstrNumber < IllegalInstrNumber &&
364 "Instruction mapping overflow!");
365
366 assert(LegalInstrNumber != DenseMapInfo<unsigned>::getEmptyKey() &&
367 "Tried to assign DenseMap tombstone or empty key to instruction.");
368 assert(LegalInstrNumber != DenseMapInfo<unsigned>::getTombstoneKey() &&
369 "Tried to assign DenseMap tombstone or empty key to instruction.");
370
371 return INumber;
372}
373
374IRInstructionData *
375IRInstructionMapper::allocateIRInstructionData(Instruction &I, bool Legality,
376 IRInstructionDataList &IDL) {
377 return new (InstDataAllocator->Allocate()) IRInstructionData(I, Legality, IDL);
378}
379
380IRInstructionData *
381IRInstructionMapper::allocateIRInstructionData(IRInstructionDataList &IDL) {
382 return new (InstDataAllocator->Allocate()) IRInstructionData(IDL);
383}
384
385IRInstructionDataList *
386IRInstructionMapper::allocateIRInstructionDataList() {
387 return new (IDLAllocator->Allocate()) IRInstructionDataList();
388}
389
390// TODO: This is the same as the MachineOutliner, and should be consolidated
391// into the same interface.
392unsigned IRInstructionMapper::mapToIllegalUnsigned(
393 BasicBlock::iterator &It, std::vector<unsigned> &IntegerMappingForBB,
394 std::vector<IRInstructionData *> &InstrListForBB, bool End) {
395 // Can't combine an illegal instruction. Set the flag.
396 CanCombineWithPrevInstr = false;
397
398 // Only add one illegal number per range of legal numbers.
399 if (AddedIllegalLastTime)
400 return IllegalInstrNumber;
401
402 IRInstructionData *ID = nullptr;
403 if (!End)
404 ID = allocateIRInstructionData(I&: *It, Legality: false, IDL&: *IDL);
405 else
406 ID = allocateIRInstructionData(IDL&: *IDL);
407 InstrListForBB.push_back(x: ID);
408
409 // Remember that we added an illegal number last time.
410 AddedIllegalLastTime = true;
411 unsigned INumber = IllegalInstrNumber;
412 IntegerMappingForBB.push_back(x: IllegalInstrNumber--);
413
414 assert(LegalInstrNumber < IllegalInstrNumber &&
415 "Instruction mapping overflow!");
416
417 assert(IllegalInstrNumber != DenseMapInfo<unsigned>::getEmptyKey() &&
418 "IllegalInstrNumber cannot be DenseMap tombstone or empty key!");
419
420 assert(IllegalInstrNumber != DenseMapInfo<unsigned>::getTombstoneKey() &&
421 "IllegalInstrNumber cannot be DenseMap tombstone or empty key!");
422
423 return INumber;
424}
425
426IRSimilarityCandidate::IRSimilarityCandidate(unsigned StartIdx, unsigned Len,
427 IRInstructionData *FirstInstIt,
428 IRInstructionData *LastInstIt)
429 : StartIdx(StartIdx), Len(Len) {
430
431 assert(FirstInstIt != nullptr && "Instruction is nullptr!");
432 assert(LastInstIt != nullptr && "Instruction is nullptr!");
433 assert(StartIdx + Len > StartIdx &&
434 "Overflow for IRSimilarityCandidate range?");
435 assert(Len - 1 == static_cast<unsigned>(std::distance(
436 iterator(FirstInstIt), iterator(LastInstIt))) &&
437 "Length of the first and last IRInstructionData do not match the "
438 "given length");
439
440 // We iterate over the given instructions, and map each unique value
441 // to a unique number in the IRSimilarityCandidate ValueToNumber and
442 // NumberToValue maps. A constant get its own value globally, the individual
443 // uses of the constants are not considered to be unique.
444 //
445 // IR: Mapping Added:
446 // %add1 = add i32 %a, c1 %add1 -> 3, %a -> 1, c1 -> 2
447 // %add2 = add i32 %a, %1 %add2 -> 4
448 // %add3 = add i32 c2, c1 %add3 -> 6, c2 -> 5
449 //
450 // when replace with global values, starting from 1, would be
451 //
452 // 3 = add i32 1, 2
453 // 4 = add i32 1, 3
454 // 6 = add i32 5, 2
455 unsigned LocalValNumber = 1;
456 IRInstructionDataList::iterator ID = iterator(*FirstInstIt);
457 for (unsigned Loc = StartIdx; Loc < StartIdx + Len; Loc++, ID++) {
458 // Map the operand values to an unsigned integer if it does not already
459 // have an unsigned integer assigned to it.
460 for (Value *Arg : ID->OperVals)
461 if (ValueToNumber.try_emplace(Key: Arg, Args&: LocalValNumber).second) {
462 NumberToValue.try_emplace(Key: LocalValNumber, Args&: Arg);
463 LocalValNumber++;
464 }
465
466 // Mapping the instructions to an unsigned integer if it is not already
467 // exist in the mapping.
468 if (ValueToNumber.try_emplace(Key: ID->Inst, Args&: LocalValNumber).second) {
469 NumberToValue.try_emplace(Key: LocalValNumber, Args&: ID->Inst);
470 LocalValNumber++;
471 }
472 }
473
474 // Setting the first and last instruction data pointers for the candidate. If
475 // we got through the entire for loop without hitting an assert, we know
476 // that both of these instructions are not nullptrs.
477 FirstInst = FirstInstIt;
478 LastInst = LastInstIt;
479
480 // Add the basic blocks contained in the set into the global value numbering.
481 DenseSet<BasicBlock *> BBSet;
482 getBasicBlocks(BBSet);
483 for (BasicBlock *BB : BBSet) {
484 if (ValueToNumber.try_emplace(Key: BB, Args&: LocalValNumber).second) {
485 NumberToValue.try_emplace(Key: LocalValNumber, Args&: BB);
486 LocalValNumber++;
487 }
488 }
489}
490
491bool IRSimilarityCandidate::isSimilar(const IRSimilarityCandidate &A,
492 const IRSimilarityCandidate &B) {
493 if (A.getLength() != B.getLength())
494 return false;
495
496 auto InstrDataForBoth =
497 zip(t: make_range(x: A.begin(), y: A.end()), u: make_range(x: B.begin(), y: B.end()));
498
499 return all_of(Range&: InstrDataForBoth,
500 P: [](std::tuple<IRInstructionData &, IRInstructionData &> R) {
501 IRInstructionData &A = std::get<0>(t&: R);
502 IRInstructionData &B = std::get<1>(t&: R);
503 if (!A.Legal || !B.Legal)
504 return false;
505 return isClose(A, B);
506 });
507}
508
509/// Determine if one or more of the assigned global value numbers for the
510/// operands in \p TargetValueNumbers is in the current mapping set for operand
511/// numbers in \p SourceOperands. The set of possible corresponding global
512/// value numbers are replaced with the most recent version of compatible
513/// values.
514///
515/// \param [in] SourceValueToNumberMapping - The mapping of a Value to global
516/// value number for the source IRInstructionCandidate.
517/// \param [in, out] CurrentSrcTgtNumberMapping - The current mapping of source
518/// IRSimilarityCandidate global value numbers to a set of possible numbers in
519/// the target.
520/// \param [in] SourceOperands - The operands in the original
521/// IRSimilarityCandidate in the current instruction.
522/// \param [in] TargetValueNumbers - The global value numbers of the operands in
523/// the corresponding Instruction in the other IRSimilarityCandidate.
524/// \returns true if there exists a possible mapping between the source
525/// Instruction operands and the target Instruction operands, and false if not.
526static bool checkNumberingAndReplaceCommutative(
527 const DenseMap<Value *, unsigned> &SourceValueToNumberMapping,
528 DenseMap<unsigned, DenseSet<unsigned>> &CurrentSrcTgtNumberMapping,
529 ArrayRef<Value *> &SourceOperands,
530 DenseSet<unsigned> &TargetValueNumbers){
531
532 DenseMap<unsigned, DenseSet<unsigned>>::iterator ValueMappingIt;
533
534 unsigned ArgVal;
535 bool WasInserted;
536
537 // Iterate over the operands in the source IRSimilarityCandidate to determine
538 // whether there exists an operand in the other IRSimilarityCandidate that
539 // creates a valid mapping of Value to Value between the
540 // IRSimilarityCaniddates.
541 for (Value *V : SourceOperands) {
542 ArgVal = SourceValueToNumberMapping.find(Val: V)->second;
543
544 // Instead of finding a current mapping, we attempt to insert a set.
545 std::tie(args&: ValueMappingIt, args&: WasInserted) = CurrentSrcTgtNumberMapping.insert(
546 KV: std::make_pair(x&: ArgVal, y&: TargetValueNumbers));
547
548 // We need to iterate over the items in other IRSimilarityCandidate's
549 // Instruction to determine whether there is a valid mapping of
550 // Value to Value.
551 DenseSet<unsigned> NewSet;
552 for (unsigned &Curr : ValueMappingIt->second)
553 // If we can find the value in the mapping, we add it to the new set.
554 if (TargetValueNumbers.contains(V: Curr))
555 NewSet.insert(V: Curr);
556
557 // If we could not find a Value, return 0.
558 if (NewSet.empty())
559 return false;
560
561 // Otherwise replace the old mapping with the newly constructed one.
562 if (NewSet.size() != ValueMappingIt->second.size())
563 ValueMappingIt->second.swap(RHS&: NewSet);
564
565 // We have reached no conclusions about the mapping, and cannot remove
566 // any items from the other operands, so we move to check the next operand.
567 if (ValueMappingIt->second.size() != 1)
568 continue;
569
570 unsigned ValToRemove = *ValueMappingIt->second.begin();
571 // When there is only one item left in the mapping for and operand, remove
572 // the value from the other operands. If it results in there being no
573 // mapping, return false, it means the mapping is wrong
574 for (Value *InnerV : SourceOperands) {
575 if (V == InnerV)
576 continue;
577
578 unsigned InnerVal = SourceValueToNumberMapping.find(Val: InnerV)->second;
579 ValueMappingIt = CurrentSrcTgtNumberMapping.find(Val: InnerVal);
580 if (ValueMappingIt == CurrentSrcTgtNumberMapping.end())
581 continue;
582
583 ValueMappingIt->second.erase(V: ValToRemove);
584 if (ValueMappingIt->second.empty())
585 return false;
586 }
587 }
588
589 return true;
590}
591
592/// Determine if operand number \p TargetArgVal is in the current mapping set
593/// for operand number \p SourceArgVal.
594///
595/// \param [in, out] CurrentSrcTgtNumberMapping current mapping of global
596/// value numbers from source IRSimilarityCandidate to target
597/// IRSimilarityCandidate.
598/// \param [in] SourceArgVal The global value number for an operand in the
599/// in the original candidate.
600/// \param [in] TargetArgVal The global value number for the corresponding
601/// operand in the other candidate.
602/// \returns True if there exists a mapping and false if not.
603bool checkNumberingAndReplace(
604 DenseMap<unsigned, DenseSet<unsigned>> &CurrentSrcTgtNumberMapping,
605 unsigned SourceArgVal, unsigned TargetArgVal) {
606 // We are given two unsigned integers representing the global values of
607 // the operands in different IRSimilarityCandidates and a current mapping
608 // between the two.
609 //
610 // Source Operand GVN: 1
611 // Target Operand GVN: 2
612 // CurrentMapping: {1: {1, 2}}
613 //
614 // Since we have mapping, and the target operand is contained in the set, we
615 // update it to:
616 // CurrentMapping: {1: {2}}
617 // and can return true. But, if the mapping was
618 // CurrentMapping: {1: {3}}
619 // we would return false.
620
621 bool WasInserted;
622 DenseMap<unsigned, DenseSet<unsigned>>::iterator Val;
623
624 std::tie(args&: Val, args&: WasInserted) = CurrentSrcTgtNumberMapping.insert(
625 KV: std::make_pair(x&: SourceArgVal, y: DenseSet<unsigned>({TargetArgVal})));
626
627 // If we created a new mapping, then we are done.
628 if (WasInserted)
629 return true;
630
631 // If there is more than one option in the mapping set, and the target value
632 // is included in the mapping set replace that set with one that only includes
633 // the target value, as it is the only valid mapping via the non commutative
634 // instruction.
635
636 DenseSet<unsigned> &TargetSet = Val->second;
637 if (TargetSet.size() > 1 && TargetSet.contains(V: TargetArgVal)) {
638 TargetSet.clear();
639 TargetSet.insert(V: TargetArgVal);
640 return true;
641 }
642
643 // Return true if we can find the value in the set.
644 return TargetSet.contains(V: TargetArgVal);
645}
646
647bool IRSimilarityCandidate::compareNonCommutativeOperandMapping(
648 OperandMapping A, OperandMapping B) {
649 // Iterators to keep track of where we are in the operands for each
650 // Instruction.
651 ArrayRef<Value *>::iterator VItA = A.OperVals.begin();
652 ArrayRef<Value *>::iterator VItB = B.OperVals.begin();
653 unsigned OperandLength = A.OperVals.size();
654
655 // For each operand, get the value numbering and ensure it is consistent.
656 for (unsigned Idx = 0; Idx < OperandLength; Idx++, VItA++, VItB++) {
657 unsigned OperValA = A.IRSC.ValueToNumber.find(Val: *VItA)->second;
658 unsigned OperValB = B.IRSC.ValueToNumber.find(Val: *VItB)->second;
659
660 // Attempt to add a set with only the target value. If there is no mapping
661 // we can create it here.
662 //
663 // For an instruction like a subtraction:
664 // IRSimilarityCandidateA: IRSimilarityCandidateB:
665 // %resultA = sub %a, %b %resultB = sub %d, %e
666 //
667 // We map %a -> %d and %b -> %e.
668 //
669 // And check to see whether their mapping is consistent in
670 // checkNumberingAndReplace.
671
672 if (!checkNumberingAndReplace(CurrentSrcTgtNumberMapping&: A.ValueNumberMapping, SourceArgVal: OperValA, TargetArgVal: OperValB))
673 return false;
674
675 if (!checkNumberingAndReplace(CurrentSrcTgtNumberMapping&: B.ValueNumberMapping, SourceArgVal: OperValB, TargetArgVal: OperValA))
676 return false;
677 }
678 return true;
679}
680
681bool IRSimilarityCandidate::compareCommutativeOperandMapping(
682 OperandMapping A, OperandMapping B) {
683 DenseSet<unsigned> ValueNumbersA;
684 DenseSet<unsigned> ValueNumbersB;
685
686 ArrayRef<Value *>::iterator VItA = A.OperVals.begin();
687 ArrayRef<Value *>::iterator VItB = B.OperVals.begin();
688 unsigned OperandLength = A.OperVals.size();
689
690 // Find the value number sets for the operands.
691 for (unsigned Idx = 0; Idx < OperandLength;
692 Idx++, VItA++, VItB++) {
693 ValueNumbersA.insert(V: A.IRSC.ValueToNumber.find(Val: *VItA)->second);
694 ValueNumbersB.insert(V: B.IRSC.ValueToNumber.find(Val: *VItB)->second);
695 }
696
697 // Iterate over the operands in the first IRSimilarityCandidate and make sure
698 // there exists a possible mapping with the operands in the second
699 // IRSimilarityCandidate.
700 if (!checkNumberingAndReplaceCommutative(SourceValueToNumberMapping: A.IRSC.ValueToNumber,
701 CurrentSrcTgtNumberMapping&: A.ValueNumberMapping, SourceOperands&: A.OperVals,
702 TargetValueNumbers&: ValueNumbersB))
703 return false;
704
705 // Iterate over the operands in the second IRSimilarityCandidate and make sure
706 // there exists a possible mapping with the operands in the first
707 // IRSimilarityCandidate.
708 if (!checkNumberingAndReplaceCommutative(SourceValueToNumberMapping: B.IRSC.ValueToNumber,
709 CurrentSrcTgtNumberMapping&: B.ValueNumberMapping, SourceOperands&: B.OperVals,
710 TargetValueNumbers&: ValueNumbersA))
711 return false;
712
713 return true;
714}
715
716bool IRSimilarityCandidate::compareAssignmentMapping(
717 const unsigned InstValA, const unsigned &InstValB,
718 DenseMap<unsigned, DenseSet<unsigned>> &ValueNumberMappingA,
719 DenseMap<unsigned, DenseSet<unsigned>> &ValueNumberMappingB) {
720 DenseMap<unsigned, DenseSet<unsigned>>::iterator ValueMappingIt;
721 bool WasInserted;
722 std::tie(args&: ValueMappingIt, args&: WasInserted) = ValueNumberMappingA.insert(
723 KV: std::make_pair(x: InstValA, y: DenseSet<unsigned>({InstValB})));
724 if (!WasInserted && !ValueMappingIt->second.contains(V: InstValB))
725 return false;
726 else if (ValueMappingIt->second.size() != 1) {
727 for (unsigned OtherVal : ValueMappingIt->second) {
728 if (OtherVal == InstValB)
729 continue;
730 auto OtherValIt = ValueNumberMappingA.find(Val: OtherVal);
731 if (OtherValIt == ValueNumberMappingA.end())
732 continue;
733 OtherValIt->second.erase(V: InstValA);
734 }
735 ValueNumberMappingA.erase(I: ValueMappingIt);
736 std::tie(args&: ValueMappingIt, args&: WasInserted) = ValueNumberMappingA.insert(
737 KV: std::make_pair(x: InstValA, y: DenseSet<unsigned>({InstValB})));
738 }
739
740 return true;
741}
742
743bool IRSimilarityCandidate::checkRelativeLocations(RelativeLocMapping A,
744 RelativeLocMapping B) {
745 // Get the basic blocks the label refers to.
746 BasicBlock *ABB = cast<BasicBlock>(Val: A.OperVal);
747 BasicBlock *BBB = cast<BasicBlock>(Val: B.OperVal);
748
749 // Get the basic blocks contained in each region.
750 DenseSet<BasicBlock *> BasicBlockA;
751 DenseSet<BasicBlock *> BasicBlockB;
752 A.IRSC.getBasicBlocks(BBSet&: BasicBlockA);
753 B.IRSC.getBasicBlocks(BBSet&: BasicBlockB);
754
755 // Determine if the block is contained in the region.
756 bool AContained = BasicBlockA.contains(V: ABB);
757 bool BContained = BasicBlockB.contains(V: BBB);
758
759 // Both blocks need to be contained in the region, or both need to be outside
760 // the region.
761 if (AContained != BContained)
762 return false;
763
764 // If both are contained, then we need to make sure that the relative
765 // distance to the target blocks are the same.
766 if (AContained)
767 return A.RelativeLocation == B.RelativeLocation;
768 return true;
769}
770
771bool IRSimilarityCandidate::compareStructure(const IRSimilarityCandidate &A,
772 const IRSimilarityCandidate &B) {
773 DenseMap<unsigned, DenseSet<unsigned>> MappingA;
774 DenseMap<unsigned, DenseSet<unsigned>> MappingB;
775 return IRSimilarityCandidate::compareStructure(A, B, ValueNumberMappingA&: MappingA, ValueNumberMappingB&: MappingB);
776}
777
778typedef detail::zippy<detail::zip_shortest, SmallVector<int, 4> &,
779 SmallVector<int, 4> &, ArrayRef<Value *> &,
780 ArrayRef<Value *> &>
781 ZippedRelativeLocationsT;
782
783bool IRSimilarityCandidate::compareStructure(
784 const IRSimilarityCandidate &A, const IRSimilarityCandidate &B,
785 DenseMap<unsigned, DenseSet<unsigned>> &ValueNumberMappingA,
786 DenseMap<unsigned, DenseSet<unsigned>> &ValueNumberMappingB) {
787 if (A.getLength() != B.getLength())
788 return false;
789
790 if (A.ValueToNumber.size() != B.ValueToNumber.size())
791 return false;
792
793 iterator ItA = A.begin();
794 iterator ItB = B.begin();
795
796 // These ValueNumber Mapping sets create a create a mapping between the values
797 // in one candidate to values in the other candidate. If we create a set with
798 // one element, and that same element maps to the original element in the
799 // candidate we have a good mapping.
800
801 // Iterate over the instructions contained in each candidate
802 unsigned SectionLength = A.getStartIdx() + A.getLength();
803 for (unsigned Loc = A.getStartIdx(); Loc < SectionLength;
804 ItA++, ItB++, Loc++) {
805 // Make sure the instructions are similar to one another.
806 if (!isClose(A: *ItA, B: *ItB))
807 return false;
808
809 Instruction *IA = ItA->Inst;
810 Instruction *IB = ItB->Inst;
811
812 if (!ItA->Legal || !ItB->Legal)
813 return false;
814
815 // Get the operand sets for the instructions.
816 ArrayRef<Value *> OperValsA = ItA->OperVals;
817 ArrayRef<Value *> OperValsB = ItB->OperVals;
818
819 unsigned InstValA = A.ValueToNumber.find(Val: IA)->second;
820 unsigned InstValB = B.ValueToNumber.find(Val: IB)->second;
821
822 // Ensure that the mappings for the instructions exists.
823 if (!compareAssignmentMapping(InstValA, InstValB, ValueNumberMappingA,
824 ValueNumberMappingB))
825 return false;
826
827 if (!compareAssignmentMapping(InstValA: InstValB, InstValB: InstValA, ValueNumberMappingA&: ValueNumberMappingB,
828 ValueNumberMappingB&: ValueNumberMappingA))
829 return false;
830
831 // We have different paths for commutative instructions and non-commutative
832 // instructions since commutative instructions could allow multiple mappings
833 // to certain values.
834 if (IA->isCommutative() && !isa<FPMathOperator>(Val: IA) &&
835 !isa<IntrinsicInst>(Val: IA)) {
836 if (!compareCommutativeOperandMapping(
837 A: {.IRSC: A, .OperVals: OperValsA, .ValueNumberMapping: ValueNumberMappingA},
838 B: {.IRSC: B, .OperVals: OperValsB, .ValueNumberMapping: ValueNumberMappingB}))
839 return false;
840 continue;
841 }
842
843 // Handle the non-commutative cases.
844 if (!compareNonCommutativeOperandMapping(
845 A: {.IRSC: A, .OperVals: OperValsA, .ValueNumberMapping: ValueNumberMappingA},
846 B: {.IRSC: B, .OperVals: OperValsB, .ValueNumberMapping: ValueNumberMappingB}))
847 return false;
848
849 // Here we check that between two corresponding instructions,
850 // when referring to a basic block in the same region, the
851 // relative locations are the same. And, that the instructions refer to
852 // basic blocks outside the region in the same corresponding locations.
853
854 // We are able to make the assumption about blocks outside of the region
855 // since the target block labels are considered values and will follow the
856 // same number matching that we defined for the other instructions in the
857 // region. So, at this point, in each location we target a specific block
858 // outside the region, we are targeting a corresponding block in each
859 // analagous location in the region we are comparing to.
860 if (!(isa<BranchInst>(Val: IA) && isa<BranchInst>(Val: IB)) &&
861 !(isa<PHINode>(Val: IA) && isa<PHINode>(Val: IB)))
862 continue;
863
864 SmallVector<int, 4> &RelBlockLocsA = ItA->RelativeBlockLocations;
865 SmallVector<int, 4> &RelBlockLocsB = ItB->RelativeBlockLocations;
866 ArrayRef<Value *> ABL = ItA->getBlockOperVals();
867 ArrayRef<Value *> BBL = ItB->getBlockOperVals();
868
869 // Check to make sure that the number of operands, and branching locations
870 // between BranchInsts is the same.
871 if (RelBlockLocsA.size() != RelBlockLocsB.size() &&
872 ABL.size() != BBL.size())
873 return false;
874
875 assert(RelBlockLocsA.size() == ABL.size() &&
876 "Block information vectors not the same size.");
877 assert(RelBlockLocsB.size() == BBL.size() &&
878 "Block information vectors not the same size.");
879
880 ZippedRelativeLocationsT ZippedRelativeLocations =
881 zip(t&: RelBlockLocsA, u&: RelBlockLocsB, args&: ABL, args&: BBL);
882 if (any_of(Range&: ZippedRelativeLocations,
883 P: [&A, &B](std::tuple<int, int, Value *, Value *> R) {
884 return !checkRelativeLocations(
885 A: {.IRSC: A, .RelativeLocation: std::get<0>(t&: R), .OperVal: std::get<2>(t&: R)},
886 B: {.IRSC: B, .RelativeLocation: std::get<1>(t&: R), .OperVal: std::get<3>(t&: R)});
887 }))
888 return false;
889 }
890 return true;
891}
892
893bool IRSimilarityCandidate::overlap(const IRSimilarityCandidate &A,
894 const IRSimilarityCandidate &B) {
895 auto DoesOverlap = [](const IRSimilarityCandidate &X,
896 const IRSimilarityCandidate &Y) {
897 // Check:
898 // XXXXXX X starts before Y ends
899 // YYYYYYY Y starts after X starts
900 return X.StartIdx <= Y.getEndIdx() && Y.StartIdx >= X.StartIdx;
901 };
902
903 return DoesOverlap(A, B) || DoesOverlap(B, A);
904}
905
906void IRSimilarityIdentifier::populateMapper(
907 Module &M, std::vector<IRInstructionData *> &InstrList,
908 std::vector<unsigned> &IntegerMapping) {
909
910 std::vector<IRInstructionData *> InstrListForModule;
911 std::vector<unsigned> IntegerMappingForModule;
912 // Iterate over the functions in the module to map each Instruction in each
913 // BasicBlock to an unsigned integer.
914 Mapper.initializeForBBs(M);
915
916 for (Function &F : M) {
917
918 if (F.empty())
919 continue;
920
921 for (BasicBlock &BB : F) {
922
923 // BB has potential to have similarity since it has a size greater than 2
924 // and can therefore match other regions greater than 2. Map it to a list
925 // of unsigned integers.
926 Mapper.convertToUnsignedVec(BB, InstrList&: InstrListForModule,
927 IntegerMapping&: IntegerMappingForModule);
928 }
929
930 BasicBlock::iterator It = F.begin()->end();
931 Mapper.mapToIllegalUnsigned(It, IntegerMappingForBB&: IntegerMappingForModule, InstrListForBB&: InstrListForModule,
932 End: true);
933 if (InstrListForModule.size() > 0)
934 Mapper.IDL->push_back(Node&: *InstrListForModule.back());
935 }
936
937 // Insert the InstrListForModule at the end of the overall InstrList so that
938 // we can have a long InstrList for the entire set of Modules being analyzed.
939 llvm::append_range(C&: InstrList, R&: InstrListForModule);
940 // Do the same as above, but for IntegerMapping.
941 llvm::append_range(C&: IntegerMapping, R&: IntegerMappingForModule);
942}
943
944void IRSimilarityIdentifier::populateMapper(
945 ArrayRef<std::unique_ptr<Module>> &Modules,
946 std::vector<IRInstructionData *> &InstrList,
947 std::vector<unsigned> &IntegerMapping) {
948
949 // Iterate over, and map the instructions in each module.
950 for (const std::unique_ptr<Module> &M : Modules)
951 populateMapper(M&: *M, InstrList, IntegerMapping);
952}
953
954/// From a repeated subsequence, find all the different instances of the
955/// subsequence from the \p InstrList, and create an IRSimilarityCandidate from
956/// the IRInstructionData in subsequence.
957///
958/// \param [in] Mapper - The instruction mapper for basic correctness checks.
959/// \param [in] InstrList - The vector that holds the instruction data.
960/// \param [in] IntegerMapping - The vector that holds the mapped integers.
961/// \param [out] CandsForRepSubstring - The vector to store the generated
962/// IRSimilarityCandidates.
963static void createCandidatesFromSuffixTree(
964 const IRInstructionMapper& Mapper, std::vector<IRInstructionData *> &InstrList,
965 std::vector<unsigned> &IntegerMapping, SuffixTree::RepeatedSubstring &RS,
966 std::vector<IRSimilarityCandidate> &CandsForRepSubstring) {
967
968 unsigned StringLen = RS.Length;
969 if (StringLen < 2)
970 return;
971
972 // Create an IRSimilarityCandidate for instance of this subsequence \p RS.
973 for (const unsigned &StartIdx : RS.StartIndices) {
974 unsigned EndIdx = StartIdx + StringLen - 1;
975
976 // Check that this subsequence does not contain an illegal instruction.
977 bool ContainsIllegal = false;
978 for (unsigned CurrIdx = StartIdx; CurrIdx <= EndIdx; CurrIdx++) {
979 unsigned Key = IntegerMapping[CurrIdx];
980 if (Key > Mapper.IllegalInstrNumber) {
981 ContainsIllegal = true;
982 break;
983 }
984 }
985
986 // If we have an illegal instruction, we should not create an
987 // IRSimilarityCandidate for this region.
988 if (ContainsIllegal)
989 continue;
990
991 // We are getting iterators to the instructions in this region of code
992 // by advancing the start and end indices from the start of the
993 // InstrList.
994 std::vector<IRInstructionData *>::iterator StartIt = InstrList.begin();
995 std::advance(i&: StartIt, n: StartIdx);
996 std::vector<IRInstructionData *>::iterator EndIt = InstrList.begin();
997 std::advance(i&: EndIt, n: EndIdx);
998
999 CandsForRepSubstring.emplace_back(args: StartIdx, args&: StringLen, args&: *StartIt, args&: *EndIt);
1000 }
1001}
1002
1003void IRSimilarityCandidate::createCanonicalRelationFrom(
1004 IRSimilarityCandidate &SourceCand,
1005 DenseMap<unsigned, DenseSet<unsigned>> &ToSourceMapping,
1006 DenseMap<unsigned, DenseSet<unsigned>> &FromSourceMapping) {
1007 assert(SourceCand.CanonNumToNumber.size() != 0 &&
1008 "Base canonical relationship is empty!");
1009 assert(SourceCand.NumberToCanonNum.size() != 0 &&
1010 "Base canonical relationship is empty!");
1011
1012 assert(CanonNumToNumber.size() == 0 && "Canonical Relationship is non-empty");
1013 assert(NumberToCanonNum.size() == 0 && "Canonical Relationship is non-empty");
1014
1015 DenseSet<unsigned> UsedGVNs;
1016 // Iterate over the mappings provided from this candidate to SourceCand. We
1017 // are then able to map the GVN in this candidate to the same canonical number
1018 // given to the corresponding GVN in SourceCand.
1019 for (std::pair<unsigned, DenseSet<unsigned>> &GVNMapping : ToSourceMapping) {
1020 unsigned SourceGVN = GVNMapping.first;
1021
1022 assert(GVNMapping.second.size() != 0 && "Possible GVNs is 0!");
1023
1024 unsigned ResultGVN;
1025 // We need special handling if we have more than one potential value. This
1026 // means that there are at least two GVNs that could correspond to this GVN.
1027 // This could lead to potential swapping later on, so we make a decision
1028 // here to ensure a one-to-one mapping.
1029 if (GVNMapping.second.size() > 1) {
1030 bool Found = false;
1031 for (unsigned Val : GVNMapping.second) {
1032 // We make sure the target value number hasn't already been reserved.
1033 if (UsedGVNs.contains(V: Val))
1034 continue;
1035
1036 // We make sure that the opposite mapping is still consistent.
1037 DenseMap<unsigned, DenseSet<unsigned>>::iterator It =
1038 FromSourceMapping.find(Val);
1039
1040 if (!It->second.contains(V: SourceGVN))
1041 continue;
1042
1043 // We pick the first item that satisfies these conditions.
1044 Found = true;
1045 ResultGVN = Val;
1046 break;
1047 }
1048
1049 assert(Found && "Could not find matching value for source GVN");
1050 (void)Found;
1051
1052 } else
1053 ResultGVN = *GVNMapping.second.begin();
1054
1055 // Whatever GVN is found, we mark it as used.
1056 UsedGVNs.insert(V: ResultGVN);
1057
1058 unsigned CanonNum = *SourceCand.getCanonicalNum(N: ResultGVN);
1059 CanonNumToNumber.insert(KV: std::make_pair(x&: CanonNum, y&: SourceGVN));
1060 NumberToCanonNum.insert(KV: std::make_pair(x&: SourceGVN, y&: CanonNum));
1061 }
1062
1063 DenseSet<BasicBlock *> BBSet;
1064 getBasicBlocks(BBSet);
1065 // Find canonical numbers for the BasicBlocks in the current candidate.
1066 // This is done by finding the corresponding value for the first instruction
1067 // in the block in the current candidate, finding the matching value in the
1068 // source candidate. Then by finding the parent of this value, use the
1069 // canonical number of the block in the source candidate for the canonical
1070 // number in the current candidate.
1071 for (BasicBlock *BB : BBSet) {
1072 unsigned BBGVNForCurrCand = ValueToNumber.find(Val: BB)->second;
1073
1074 // We can skip the BasicBlock if the canonical numbering has already been
1075 // found in a separate instruction.
1076 if (NumberToCanonNum.contains(Val: BBGVNForCurrCand))
1077 continue;
1078
1079 // If the basic block is the starting block, then the shared instruction may
1080 // not be the first instruction in the block, it will be the first
1081 // instruction in the similarity region.
1082 Value *FirstOutlineInst = BB == getStartBB()
1083 ? frontInstruction()
1084 : &*BB->instructionsWithoutDebug().begin();
1085
1086 unsigned FirstInstGVN = *getGVN(V: FirstOutlineInst);
1087 unsigned FirstInstCanonNum = *getCanonicalNum(N: FirstInstGVN);
1088 unsigned SourceGVN = *SourceCand.fromCanonicalNum(N: FirstInstCanonNum);
1089 Value *SourceV = *SourceCand.fromGVN(Num: SourceGVN);
1090 BasicBlock *SourceBB = cast<Instruction>(Val: SourceV)->getParent();
1091 unsigned SourceBBGVN = *SourceCand.getGVN(V: SourceBB);
1092 unsigned SourceCanonBBGVN = *SourceCand.getCanonicalNum(N: SourceBBGVN);
1093 CanonNumToNumber.insert(KV: std::make_pair(x&: SourceCanonBBGVN, y&: BBGVNForCurrCand));
1094 NumberToCanonNum.insert(KV: std::make_pair(x&: BBGVNForCurrCand, y&: SourceCanonBBGVN));
1095 }
1096}
1097
1098void IRSimilarityCandidate::createCanonicalRelationFrom(
1099 IRSimilarityCandidate &SourceCand, IRSimilarityCandidate &SourceCandLarge,
1100 IRSimilarityCandidate &TargetCandLarge) {
1101 assert(!SourceCand.CanonNumToNumber.empty() &&
1102 "Canonical Relationship is non-empty");
1103 assert(!SourceCand.NumberToCanonNum.empty() &&
1104 "Canonical Relationship is non-empty");
1105
1106 assert(!SourceCandLarge.CanonNumToNumber.empty() &&
1107 "Canonical Relationship is non-empty");
1108 assert(!SourceCandLarge.NumberToCanonNum.empty() &&
1109 "Canonical Relationship is non-empty");
1110
1111 assert(!TargetCandLarge.CanonNumToNumber.empty() &&
1112 "Canonical Relationship is non-empty");
1113 assert(!TargetCandLarge.NumberToCanonNum.empty() &&
1114 "Canonical Relationship is non-empty");
1115
1116 assert(CanonNumToNumber.empty() && "Canonical Relationship is non-empty");
1117 assert(NumberToCanonNum.empty() && "Canonical Relationship is non-empty");
1118
1119 // We're going to use the larger candidates as a "bridge" to create the
1120 // canonical number for the target candidate since we have idetified two
1121 // candidates as subsequences of larger sequences, and therefore must be
1122 // structurally similar.
1123 for (std::pair<Value *, unsigned> &ValueNumPair : ValueToNumber) {
1124 Value *CurrVal = ValueNumPair.first;
1125 unsigned TargetCandGVN = ValueNumPair.second;
1126
1127 // Find the numbering in the large candidate that surrounds the
1128 // current candidate.
1129 std::optional<unsigned> OLargeTargetGVN = TargetCandLarge.getGVN(V: CurrVal);
1130 assert(OLargeTargetGVN.has_value() && "GVN not found for Value");
1131
1132 // Get the canonical numbering in the large target candidate.
1133 std::optional<unsigned> OTargetCandCanon =
1134 TargetCandLarge.getCanonicalNum(N: OLargeTargetGVN.value());
1135 assert(OTargetCandCanon.has_value() &&
1136 "Canononical Number not found for GVN");
1137
1138 // Get the GVN in the large source candidate from the canonical numbering.
1139 std::optional<unsigned> OLargeSourceGVN =
1140 SourceCandLarge.fromCanonicalNum(N: OTargetCandCanon.value());
1141 assert(OLargeSourceGVN.has_value() &&
1142 "GVN Number not found for Canonical Number");
1143
1144 // Get the Value from the GVN in the large source candidate.
1145 std::optional<Value *> OLargeSourceV =
1146 SourceCandLarge.fromGVN(Num: OLargeSourceGVN.value());
1147 assert(OLargeSourceV.has_value() && "Value not found for GVN");
1148
1149 // Get the GVN number for the Value in the source candidate.
1150 std::optional<unsigned> OSourceGVN =
1151 SourceCand.getGVN(V: OLargeSourceV.value());
1152 assert(OSourceGVN.has_value() && "GVN Number not found for Value");
1153
1154 // Get the canonical numbering from the GVN/
1155 std::optional<unsigned> OSourceCanon =
1156 SourceCand.getCanonicalNum(N: OSourceGVN.value());
1157 assert(OSourceCanon.has_value() && "Canon Number not found for GVN");
1158
1159 // Insert the canonical numbering and GVN pair into their respective
1160 // mappings.
1161 CanonNumToNumber.insert(
1162 KV: std::make_pair(x&: OSourceCanon.value(), y&: TargetCandGVN));
1163 NumberToCanonNum.insert(
1164 KV: std::make_pair(x&: TargetCandGVN, y&: OSourceCanon.value()));
1165 }
1166}
1167
1168void IRSimilarityCandidate::createCanonicalMappingFor(
1169 IRSimilarityCandidate &CurrCand) {
1170 assert(CurrCand.CanonNumToNumber.size() == 0 &&
1171 "Canonical Relationship is non-empty");
1172 assert(CurrCand.NumberToCanonNum.size() == 0 &&
1173 "Canonical Relationship is non-empty");
1174
1175 unsigned CanonNum = 0;
1176 // Iterate over the value numbers found, the order does not matter in this
1177 // case.
1178 for (std::pair<unsigned, Value *> &NumToVal : CurrCand.NumberToValue) {
1179 CurrCand.NumberToCanonNum.insert(KV: std::make_pair(x&: NumToVal.first, y&: CanonNum));
1180 CurrCand.CanonNumToNumber.insert(KV: std::make_pair(x&: CanonNum, y&: NumToVal.first));
1181 CanonNum++;
1182 }
1183}
1184
1185/// Look for larger IRSimilarityCandidates From the previously matched
1186/// IRSimilarityCandidates that fully contain \p CandA or \p CandB. If there is
1187/// an overlap, return a pair of structurally similar, larger
1188/// IRSimilarityCandidates.
1189///
1190/// \param [in] CandA - The first candidate we are trying to determine the
1191/// structure of.
1192/// \param [in] CandB - The second candidate we are trying to determine the
1193/// structure of.
1194/// \param [in] IndexToIncludedCand - Mapping of index of the an instruction in
1195/// a circuit to the IRSimilarityCandidates that include this instruction.
1196/// \param [in] CandToOverallGroup - Mapping of IRSimilarityCandidate to a
1197/// number representing the structural group assigned to it.
1198static std::optional<
1199 std::pair<IRSimilarityCandidate *, IRSimilarityCandidate *>>
1200CheckLargerCands(
1201 IRSimilarityCandidate &CandA, IRSimilarityCandidate &CandB,
1202 DenseMap<unsigned, DenseSet<IRSimilarityCandidate *>> &IndexToIncludedCand,
1203 DenseMap<IRSimilarityCandidate *, unsigned> &CandToGroup) {
1204 DenseMap<unsigned, IRSimilarityCandidate *> IncludedGroupAndCandA;
1205 DenseMap<unsigned, IRSimilarityCandidate *> IncludedGroupAndCandB;
1206 DenseSet<unsigned> IncludedGroupsA;
1207 DenseSet<unsigned> IncludedGroupsB;
1208
1209 // Find the overall similarity group numbers that fully contain the candidate,
1210 // and record the larger candidate for each group.
1211 auto IdxToCandidateIt = IndexToIncludedCand.find(Val: CandA.getStartIdx());
1212 std::optional<std::pair<IRSimilarityCandidate *, IRSimilarityCandidate *>>
1213 Result;
1214
1215 unsigned CandAStart = CandA.getStartIdx();
1216 unsigned CandAEnd = CandA.getEndIdx();
1217 unsigned CandBStart = CandB.getStartIdx();
1218 unsigned CandBEnd = CandB.getEndIdx();
1219 if (IdxToCandidateIt == IndexToIncludedCand.end())
1220 return Result;
1221 for (IRSimilarityCandidate *MatchedCand : IdxToCandidateIt->second) {
1222 if (MatchedCand->getStartIdx() > CandAStart ||
1223 (MatchedCand->getEndIdx() < CandAEnd))
1224 continue;
1225 unsigned GroupNum = CandToGroup.find(Val: MatchedCand)->second;
1226 IncludedGroupAndCandA.insert(KV: std::make_pair(x&: GroupNum, y&: MatchedCand));
1227 IncludedGroupsA.insert(V: GroupNum);
1228 }
1229
1230 // Find the overall similarity group numbers that fully contain the next
1231 // candidate, and record the larger candidate for each group.
1232 IdxToCandidateIt = IndexToIncludedCand.find(Val: CandBStart);
1233 if (IdxToCandidateIt == IndexToIncludedCand.end())
1234 return Result;
1235 for (IRSimilarityCandidate *MatchedCand : IdxToCandidateIt->second) {
1236 if (MatchedCand->getStartIdx() > CandBStart ||
1237 MatchedCand->getEndIdx() < CandBEnd)
1238 continue;
1239 unsigned GroupNum = CandToGroup.find(Val: MatchedCand)->second;
1240 IncludedGroupAndCandB.insert(KV: std::make_pair(x&: GroupNum, y&: MatchedCand));
1241 IncludedGroupsB.insert(V: GroupNum);
1242 }
1243
1244 // Find the intersection between the two groups, these are the groups where
1245 // the larger candidates exist.
1246 set_intersect(S1&: IncludedGroupsA, S2: IncludedGroupsB);
1247
1248 // If there is no intersection between the sets, then we cannot determine
1249 // whether or not there is a match.
1250 if (IncludedGroupsA.empty())
1251 return Result;
1252
1253 // Create a pair that contains the larger candidates.
1254 auto ItA = IncludedGroupAndCandA.find(Val: *IncludedGroupsA.begin());
1255 auto ItB = IncludedGroupAndCandB.find(Val: *IncludedGroupsA.begin());
1256 Result = std::make_pair(x&: ItA->second, y&: ItB->second);
1257 return Result;
1258}
1259
1260/// From the list of IRSimilarityCandidates, perform a comparison between each
1261/// IRSimilarityCandidate to determine if there are overlapping
1262/// IRInstructionData, or if they do not have the same structure.
1263///
1264/// \param [in] CandsForRepSubstring - The vector containing the
1265/// IRSimilarityCandidates.
1266/// \param [out] StructuralGroups - the mapping of unsigned integers to vector
1267/// of IRSimilarityCandidates where each of the IRSimilarityCandidates in the
1268/// vector are structurally similar to one another.
1269/// \param [in] IndexToIncludedCand - Mapping of index of the an instruction in
1270/// a circuit to the IRSimilarityCandidates that include this instruction.
1271/// \param [in] CandToOverallGroup - Mapping of IRSimilarityCandidate to a
1272/// number representing the structural group assigned to it.
1273static void findCandidateStructures(
1274 std::vector<IRSimilarityCandidate> &CandsForRepSubstring,
1275 DenseMap<unsigned, SimilarityGroup> &StructuralGroups,
1276 DenseMap<unsigned, DenseSet<IRSimilarityCandidate *>> &IndexToIncludedCand,
1277 DenseMap<IRSimilarityCandidate *, unsigned> &CandToOverallGroup
1278 ) {
1279 std::vector<IRSimilarityCandidate>::iterator CandIt, CandEndIt, InnerCandIt,
1280 InnerCandEndIt;
1281
1282 // IRSimilarityCandidates each have a structure for operand use. It is
1283 // possible that two instances of the same subsequences have different
1284 // structure. Each type of structure found is assigned a number. This
1285 // DenseMap maps an IRSimilarityCandidate to which type of similarity
1286 // discovered it fits within.
1287 DenseMap<IRSimilarityCandidate *, unsigned> CandToGroup;
1288
1289 // Find the compatibility from each candidate to the others to determine
1290 // which candidates overlap and which have the same structure by mapping
1291 // each structure to a different group.
1292 bool SameStructure;
1293 bool Inserted;
1294 unsigned CurrentGroupNum = 0;
1295 unsigned OuterGroupNum;
1296 DenseMap<IRSimilarityCandidate *, unsigned>::iterator CandToGroupIt;
1297 DenseMap<IRSimilarityCandidate *, unsigned>::iterator CandToGroupItInner;
1298 DenseMap<unsigned, SimilarityGroup>::iterator CurrentGroupPair;
1299
1300 // Iterate over the candidates to determine its structural and overlapping
1301 // compatibility with other instructions
1302 DenseMap<unsigned, DenseSet<unsigned>> ValueNumberMappingA;
1303 DenseMap<unsigned, DenseSet<unsigned>> ValueNumberMappingB;
1304 for (CandIt = CandsForRepSubstring.begin(),
1305 CandEndIt = CandsForRepSubstring.end();
1306 CandIt != CandEndIt; CandIt++) {
1307
1308 // Determine if it has an assigned structural group already.
1309 // If not, we assign it one, and add it to our mapping.
1310 std::tie(args&: CandToGroupIt, args&: Inserted) =
1311 CandToGroup.try_emplace(Key: &*CandIt, Args&: CurrentGroupNum);
1312 if (Inserted)
1313 ++CurrentGroupNum;
1314
1315 // Get the structural group number from the iterator.
1316 OuterGroupNum = CandToGroupIt->second;
1317
1318 // Check if we already have a list of IRSimilarityCandidates for the current
1319 // structural group. Create one if one does not exist.
1320 CurrentGroupPair = StructuralGroups.find(Val: OuterGroupNum);
1321 if (CurrentGroupPair == StructuralGroups.end()) {
1322 IRSimilarityCandidate::createCanonicalMappingFor(CurrCand&: *CandIt);
1323 std::tie(args&: CurrentGroupPair, args&: Inserted) = StructuralGroups.insert(
1324 KV: std::make_pair(x&: OuterGroupNum, y: SimilarityGroup({*CandIt})));
1325 }
1326
1327 // Iterate over the IRSimilarityCandidates following the current
1328 // IRSimilarityCandidate in the list to determine whether the two
1329 // IRSimilarityCandidates are compatible. This is so we do not repeat pairs
1330 // of IRSimilarityCandidates.
1331 for (InnerCandIt = std::next(x: CandIt),
1332 InnerCandEndIt = CandsForRepSubstring.end();
1333 InnerCandIt != InnerCandEndIt; InnerCandIt++) {
1334
1335 // We check if the inner item has a group already, if it does, we skip it.
1336 CandToGroupItInner = CandToGroup.find(Val: &*InnerCandIt);
1337 if (CandToGroupItInner != CandToGroup.end())
1338 continue;
1339
1340 // Check if we have found structural similarity between two candidates
1341 // that fully contains the first and second candidates.
1342 std::optional<std::pair<IRSimilarityCandidate *, IRSimilarityCandidate *>>
1343 LargerPair = CheckLargerCands(
1344 CandA&: *CandIt, CandB&: *InnerCandIt, IndexToIncludedCand, CandToGroup&: CandToOverallGroup);
1345
1346 // If a pair was found, it means that we can assume that these smaller
1347 // substrings are also structurally similar. Use the larger candidates to
1348 // determine the canonical mapping between the two sections.
1349 if (LargerPair.has_value()) {
1350 SameStructure = true;
1351 InnerCandIt->createCanonicalRelationFrom(
1352 SourceCand&: *CandIt, SourceCandLarge&: *LargerPair.value().first, TargetCandLarge&: *LargerPair.value().second);
1353 CandToGroup.insert(KV: std::make_pair(x: &*InnerCandIt, y&: OuterGroupNum));
1354 CurrentGroupPair->second.push_back(x: *InnerCandIt);
1355 continue;
1356 }
1357
1358 // Otherwise we determine if they have the same structure and add it to
1359 // vector if they match.
1360 ValueNumberMappingA.clear();
1361 ValueNumberMappingB.clear();
1362 SameStructure = IRSimilarityCandidate::compareStructure(
1363 A: *CandIt, B: *InnerCandIt, ValueNumberMappingA, ValueNumberMappingB);
1364 if (!SameStructure)
1365 continue;
1366
1367 InnerCandIt->createCanonicalRelationFrom(SourceCand&: *CandIt, ToSourceMapping&: ValueNumberMappingA,
1368 FromSourceMapping&: ValueNumberMappingB);
1369 CandToGroup.insert(KV: std::make_pair(x: &*InnerCandIt, y&: OuterGroupNum));
1370 CurrentGroupPair->second.push_back(x: *InnerCandIt);
1371 }
1372 }
1373}
1374
1375void IRSimilarityIdentifier::findCandidates(
1376 std::vector<IRInstructionData *> &InstrList,
1377 std::vector<unsigned> &IntegerMapping) {
1378 SuffixTree ST(IntegerMapping);
1379
1380 std::vector<IRSimilarityCandidate> CandsForRepSubstring;
1381 std::vector<SimilarityGroup> NewCandidateGroups;
1382
1383 DenseMap<unsigned, SimilarityGroup> StructuralGroups;
1384 DenseMap<unsigned, DenseSet<IRSimilarityCandidate *>> IndexToIncludedCand;
1385 DenseMap<IRSimilarityCandidate *, unsigned> CandToGroup;
1386
1387 // Iterate over the subsequences found by the Suffix Tree to create
1388 // IRSimilarityCandidates for each repeated subsequence and determine which
1389 // instances are structurally similar to one another.
1390
1391 // Sort the suffix tree from longest substring to shortest.
1392 std::vector<SuffixTree::RepeatedSubstring> RSes;
1393 for (SuffixTree::RepeatedSubstring &RS : ST)
1394 RSes.push_back(x: RS);
1395
1396 llvm::stable_sort(Range&: RSes, C: [](const SuffixTree::RepeatedSubstring &LHS,
1397 const SuffixTree::RepeatedSubstring &RHS) {
1398 return LHS.Length > RHS.Length;
1399 });
1400 for (SuffixTree::RepeatedSubstring &RS : RSes) {
1401 createCandidatesFromSuffixTree(Mapper, InstrList, IntegerMapping, RS,
1402 CandsForRepSubstring);
1403
1404 if (CandsForRepSubstring.size() < 2)
1405 continue;
1406
1407 findCandidateStructures(CandsForRepSubstring, StructuralGroups,
1408 IndexToIncludedCand, CandToOverallGroup&: CandToGroup);
1409 for (std::pair<unsigned, SimilarityGroup> &Group : StructuralGroups) {
1410 // We only add the group if it contains more than one
1411 // IRSimilarityCandidate. If there is only one, that means there is no
1412 // other repeated subsequence with the same structure.
1413 if (Group.second.size() > 1) {
1414 SimilarityCandidates->push_back(x: Group.second);
1415 // Iterate over each candidate in the group, and add an entry for each
1416 // instruction included with a mapping to a set of
1417 // IRSimilarityCandidates that include that instruction.
1418 for (IRSimilarityCandidate &IRCand : SimilarityCandidates->back()) {
1419 for (unsigned Idx = IRCand.getStartIdx(), Edx = IRCand.getEndIdx();
1420 Idx <= Edx; ++Idx)
1421 IndexToIncludedCand[Idx].insert(V: &IRCand);
1422 // Add mapping of candidate to the overall similarity group number.
1423 CandToGroup.insert(
1424 KV: std::make_pair(x: &IRCand, y: SimilarityCandidates->size() - 1));
1425 }
1426 }
1427 }
1428
1429 CandsForRepSubstring.clear();
1430 StructuralGroups.clear();
1431 NewCandidateGroups.clear();
1432 }
1433}
1434
1435SimilarityGroupList &IRSimilarityIdentifier::findSimilarity(
1436 ArrayRef<std::unique_ptr<Module>> Modules) {
1437 resetSimilarityCandidates();
1438
1439 std::vector<IRInstructionData *> InstrList;
1440 std::vector<unsigned> IntegerMapping;
1441 Mapper.InstClassifier.EnableBranches = this->EnableBranches;
1442 Mapper.InstClassifier.EnableIndirectCalls = EnableIndirectCalls;
1443 Mapper.EnableMatchCallsByName = EnableMatchingCallsByName;
1444 Mapper.InstClassifier.EnableIntrinsics = EnableIntrinsics;
1445 Mapper.InstClassifier.EnableMustTailCalls = EnableMustTailCalls;
1446
1447 populateMapper(Modules, InstrList, IntegerMapping);
1448 findCandidates(InstrList, IntegerMapping);
1449
1450 return *SimilarityCandidates;
1451}
1452
1453SimilarityGroupList &IRSimilarityIdentifier::findSimilarity(Module &M) {
1454 resetSimilarityCandidates();
1455 Mapper.InstClassifier.EnableBranches = this->EnableBranches;
1456 Mapper.InstClassifier.EnableIndirectCalls = EnableIndirectCalls;
1457 Mapper.EnableMatchCallsByName = EnableMatchingCallsByName;
1458 Mapper.InstClassifier.EnableIntrinsics = EnableIntrinsics;
1459 Mapper.InstClassifier.EnableMustTailCalls = EnableMustTailCalls;
1460
1461 std::vector<IRInstructionData *> InstrList;
1462 std::vector<unsigned> IntegerMapping;
1463
1464 populateMapper(M, InstrList, IntegerMapping);
1465 findCandidates(InstrList, IntegerMapping);
1466
1467 return *SimilarityCandidates;
1468}
1469
1470INITIALIZE_PASS(IRSimilarityIdentifierWrapperPass, "ir-similarity-identifier",
1471 "ir-similarity-identifier", false, true)
1472
1473IRSimilarityIdentifierWrapperPass::IRSimilarityIdentifierWrapperPass()
1474 : ModulePass(ID) {}
1475
1476bool IRSimilarityIdentifierWrapperPass::doInitialization(Module &M) {
1477 IRSI.reset(p: new IRSimilarityIdentifier(!DisableBranches, !DisableIndirectCalls,
1478 MatchCallsByName, !DisableIntrinsics,
1479 false));
1480 return false;
1481}
1482
1483bool IRSimilarityIdentifierWrapperPass::doFinalization(Module &M) {
1484 IRSI.reset();
1485 return false;
1486}
1487
1488bool IRSimilarityIdentifierWrapperPass::runOnModule(Module &M) {
1489 IRSI->findSimilarity(M);
1490 return false;
1491}
1492
1493AnalysisKey IRSimilarityAnalysis::Key;
1494IRSimilarityIdentifier IRSimilarityAnalysis::run(Module &M,
1495 ModuleAnalysisManager &) {
1496 auto IRSI = IRSimilarityIdentifier(!DisableBranches, !DisableIndirectCalls,
1497 MatchCallsByName, !DisableIntrinsics,
1498 false);
1499 IRSI.findSimilarity(M);
1500 return IRSI;
1501}
1502
1503PreservedAnalyses
1504IRSimilarityAnalysisPrinterPass::run(Module &M, ModuleAnalysisManager &AM) {
1505 IRSimilarityIdentifier &IRSI = AM.getResult<IRSimilarityAnalysis>(IR&: M);
1506 std::optional<SimilarityGroupList> &SimilarityCandidatesOpt =
1507 IRSI.getSimilarity();
1508
1509 for (std::vector<IRSimilarityCandidate> &CandVec : *SimilarityCandidatesOpt) {
1510 OS << CandVec.size() << " candidates of length "
1511 << CandVec.begin()->getLength() << ". Found in: \n";
1512 for (IRSimilarityCandidate &Cand : CandVec) {
1513 OS << " Function: " << Cand.front()->Inst->getFunction()->getName().str()
1514 << ", Basic Block: ";
1515 if (Cand.front()->Inst->getParent()->getName().str() == "")
1516 OS << "(unnamed)";
1517 else
1518 OS << Cand.front()->Inst->getParent()->getName().str();
1519 OS << "\n Start Instruction: ";
1520 Cand.frontInstruction()->print(O&: OS);
1521 OS << "\n End Instruction: ";
1522 Cand.backInstruction()->print(O&: OS);
1523 OS << "\n";
1524 }
1525 }
1526
1527 return PreservedAnalyses::all();
1528}
1529
1530char IRSimilarityIdentifierWrapperPass::ID = 0;
1531