IR2Vec.cpp source code [llvm_projects/llvm/lib/Analysis/IR2Vec.cpp]

1	//===- IR2Vec.cpp - Implementation of IR2Vec -----------------------------===//
2	//
3	// Part of the LLVM Project, under the Apache License v2.0 with LLVM
4	// Exceptions. See the LICENSE file for license information.
5	// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6	//
7	//===----------------------------------------------------------------------===//
8	///
9	/// \file
10	/// This file implements the IR2Vec algorithm.
11	///
12	//===----------------------------------------------------------------------===//
13
14	#include "llvm/Analysis/IR2Vec.h"
15
16	#include "llvm/ADT/DepthFirstIterator.h"
17	#include "llvm/ADT/Sequence.h"
18	#include "llvm/ADT/SmallVector.h"
19	#include "llvm/ADT/Statistic.h"
20	#include "llvm/IR/CFG.h"
21	#include "llvm/IR/Module.h"
22	#include "llvm/IR/PassManager.h"
23	#include "llvm/Support/Debug.h"
24	#include "llvm/Support/Errc.h"
25	#include "llvm/Support/Error.h"
26	#include "llvm/Support/ErrorHandling.h"
27	#include "llvm/Support/Format.h"
28	#include "llvm/Support/MemoryBuffer.h"
29
30	using namespace llvm;
31	using namespace ir2vec;
32
33	#define DEBUG_TYPE "ir2vec"
34
35	STATISTIC(VocabMissCounter,
36	"Number of lookups to entities not present in the vocabulary");
37
38	namespace llvm {
39	namespace ir2vec {
40	cl::OptionCategory IR2VecCategory("IR2Vec Options");
41
42	// FIXME: Use a default vocab when not specified
43	cl::opt<std::string>
44	VocabFile("ir2vec-vocab-path", cl::Optional,
45	cl::desc ("Path to the vocabulary file for IR2Vec"), cl::init(Val: ""),
46	cl::cat (IR2VecCategory));
47	cl::opt<float> OpcWeight("ir2vec-opc-weight", cl::Optional, cl::init(Val: `1.0`),
48	cl::desc ("Weight for opcode embeddings"),
49	cl::cat (IR2VecCategory));
50	cl::opt<float> TypeWeight("ir2vec-type-weight", cl::Optional, cl::init(Val: `0.5`),
51	cl::desc ("Weight for type embeddings"),
52	cl::cat (IR2VecCategory));
53	cl::opt<float> ArgWeight("ir2vec-arg-weight", cl::Optional, cl::init(Val: `0.2`),
54	cl::desc ("Weight for argument embeddings"),
55	cl::cat (IR2VecCategory));
56	cl::opt<IR2VecKind> IR2VecEmbeddingKind(
57	"ir2vec-kind", cl::Optional,
58	cl::values(clEnumValN(IR2VecKind::Symbolic, "symbolic",
59	"Generate symbolic embeddings"),
60	clEnumValN(IR2VecKind::FlowAware, "flow-aware",
61	"Generate flow-aware embeddings")),
62	cl::init(Val: IR2VecKind::Symbolic), cl::desc ("IR2Vec embedding kind"),
63	cl::cat (IR2VecCategory));
64
65	} // namespace ir2vec
66	} // namespace llvm
67
68	AnalysisKey IR2VecVocabAnalysis::Key;
69
70	// ==----------------------------------------------------------------------===//
71	// Local helper functions
72	//===----------------------------------------------------------------------===//
73	namespace llvm::json {
74	inline bool fromJSON(const llvm::json::Value &E, Embedding &Out,
75	llvm::json::Path P) {
76	std::vector<double> TempOut;
77	if (!llvm::json::fromJSON(E, Out&: TempOut, P))
78	return false;
79	Out = Embedding (std::move(TempOut));
80	return true;
81	}
82	} // namespace llvm::json
83
84	// ==----------------------------------------------------------------------===//
85	// Embedding
86	//===----------------------------------------------------------------------===//
87	Embedding &Embedding::operator+=(const Embedding &RHS) {
88	assert(this->size() == RHS.size() && "Vectors must have the same dimension");
89	std::transform(first1: this->begin(), last1: this->end(), first2: RHS.begin(), result: this->begin(),
90	binary_op: std::plus<double>());
91	return *this;
92	}
93
94	Embedding Embedding::operator+(const Embedding &RHS) const {
95	Embedding Result(*this);
96	Result += RHS;
97	return Result;
98	}
99
100	Embedding &Embedding::operator-=(const Embedding &RHS) {
101	assert(this->size() == RHS.size() && "Vectors must have the same dimension");
102	std::transform(first1: this->begin(), last1: this->end(), first2: RHS.begin(), result: this->begin(),
103	binary_op: std::minus<double>());
104	return *this;
105	}
106
107	Embedding Embedding::operator-(const Embedding &RHS) const {
108	Embedding Result(*this);
109	Result -= RHS;
110	return Result;
111	}
112
113	Embedding &Embedding::operator=(double* Factor) {
114	std::transform(first: this->begin(), last: this->end(), result: this->begin(),
115	unary_op: [Factor](double Elem) { return Elem * Factor; });
116	return *this;
117	}
118
119	Embedding Embedding::operator(double* Factor) const {
120	Embedding Result(*this);
121	Result *= Factor;
122	return Result;
123	}
124
125	Embedding &Embedding::scaleAndAdd(const Embedding &Src, float Factor) {
126	assert(this->size() == Src.size() && "Vectors must have the same dimension");
127	for (size_t Itr = `0`; Itr < this->size(); ++Itr)
128	(*this)[Itr] += Src [Itr] * Factor;
129	return *this;
130	}
131
132	bool Embedding::approximatelyEquals(const Embedding &RHS,
133	double Tolerance) const {
134	assert(this->size() == RHS.size() && "Vectors must have the same dimension");
135	for (size_t Itr = `0`; Itr < this->size(); ++Itr)
136	if (std::abs(x: (*this)[Itr] - RHS [Itr]) > Tolerance) {
137	LLVM_DEBUG(errs() << "Embedding mismatch at index " << Itr << ": "
138	<< (*this)[Itr] << " vs " << RHS[Itr]
139	<< "; Tolerance: " << Tolerance << "\n");
140	return false;
141	}
142	return true;
143	}
144
145	void Embedding::print(raw_ostream &OS) const {
146	OS << " [";
147	for (const auto &Elem : Data)
148	OS << " " << format(Fmt: "%.2f", Vals: Elem) << " ";
149	OS << "]\n";
150	}
151
152	// ==----------------------------------------------------------------------===//
153	// Embedder and its subclasses
154	//===----------------------------------------------------------------------===//
155
156	std::unique_ptr<Embedder> Embedder::create(IR2VecKind Mode, const Function &F,
157	const Vocabulary &Vocab) {
158	switch (Mode) {
159	case IR2VecKind::Symbolic:
160	return std::make_unique<SymbolicEmbedder>(args: F, args: Vocab);
161	case IR2VecKind::FlowAware:
162	return std::make_unique<FlowAwareEmbedder>(args: F, args: Vocab);
163	}
164	return nullptr;
165	}
166
167	Embedding Embedder::computeEmbeddings() const {
168	Embedding FuncVector(Dimension, `0.0`);
169
170	if (F.isDeclaration())
171	return FuncVector;
172
173	// Consider only the basic blocks that are reachable from entry
174	for (const BasicBlock *BB : depth_first(G: &F))
175	FuncVector += computeEmbeddings(BB: *BB);
176	return FuncVector;
177	}
178
179	Embedding Embedder::computeEmbeddings(const BasicBlock &BB) const {
180	Embedding BBVector(Dimension, `0`);
181
182	// We consider only the non-debug and non-pseudo instructions
183	for (const auto &I : BB.instructionsWithoutDebug())
184	BBVector += computeEmbeddings(I);
185	return BBVector;
186	}
187
188	Embedding SymbolicEmbedder::computeEmbeddings(const Instruction &I) const {
189	// Currently, we always (re)compute the embeddings for symbolic embedder.
190	// This is cheaper than caching the vectors.
191	Embedding ArgEmb(Dimension, `0`);
192	for (const auto &Op : I.operands())
193	ArgEmb += Vocab [*Op];
194	auto InstVector =
195	Vocab [I.getOpcode()] + Vocab [I.getType()->getTypeID()] + ArgEmb;
196	if (const auto *IC = dyn_cast<CmpInst>(Val: &I))
197	InstVector += Vocab [IC->getPredicate()];
198	return InstVector;
199	}
200
201	Embedding FlowAwareEmbedder::computeEmbeddings(const Instruction &I) const {
202	// If we have already computed the embedding for this instruction, return it
203	auto It = InstVecMap.find(Val: &I);
204	if (It != InstVecMap.end())
205	return It ->second;
206
207	// TODO: Handle call instructions differently.
208	// For now, we treat them like other instructions
209	Embedding ArgEmb(Dimension, `0`);
210	for (const auto &Op : I.operands()) {
211	// If the operand is defined elsewhere, we use its embedding
212	if (const auto *DefInst = dyn_cast<Instruction>(Val: Op)) {
213	auto DefIt = InstVecMap.find(Val: DefInst);
214	// Fixme (#159171): Ideally we should never miss an instruction
215	// embedding here.
216	// But when we have cyclic dependencies (e.g., phi
217	// nodes), we might miss the embedding. In such cases, we fall back to
218	// using the vocabulary embedding. This can be fixed by iterating to a
219	// fixed-point, or by using a simple solver for the set of simultaneous
220	// equations.
221	// Another case when we might miss an instruction embedding is when
222	// the operand instruction is in a different basic block that has not
223	// been processed yet. This can be fixed by processing the basic blocks
224	// in a topological order.
225	if (DefIt != InstVecMap.end())
226	ArgEmb += DefIt ->second;
227	else
228	ArgEmb += Vocab [*Op];
229	}
230	// If the operand is not defined by an instruction, we use the
231	// vocabulary
232	else {
233	LLVM_DEBUG(errs() << "Using embedding from vocabulary for operand: "
234	<< Op << "=" << Vocab[Op][`0`] << "\n");
235	ArgEmb += Vocab [*Op];
236	}
237	}
238	// Create the instruction vector by combining opcode, type, and arguments
239	// embeddings
240	auto InstVector =
241	Vocab [I.getOpcode()] + Vocab [I.getType()->getTypeID()] + ArgEmb;
242	if (const auto *IC = dyn_cast<CmpInst>(Val: &I))
243	InstVector += Vocab [IC->getPredicate()];
244	InstVecMap [&I] = InstVector;
245	return InstVector;
246	}
247
248	// ==----------------------------------------------------------------------===//
249	// VocabStorage
250	//===----------------------------------------------------------------------===//
251
252	VocabStorage::VocabStorage(std::vector<std::vector<Embedding>> &&SectionData)
253	: Sections (std::move(SectionData)), TotalSize([&] {
254	assert(!Sections.empty() && "Vocabulary has no sections");
255	// Compute total size across all sections
256	size_t Size = `0`;
257	for (const auto &Section : Sections) {
258	assert(!Section.empty() && "Vocabulary section is empty");
259	Size += Section.size();
260	}
261	return Size;
262	}()),
263	Dimension([&] {
264	// Get dimension from the first embedding in the first section - all
265	// embeddings must have the same dimension
266	assert(!Sections.empty() && "Vocabulary has no sections");
267	assert(!Sections[`0`].empty() && "First section of vocabulary is empty");
268	unsigned ExpectedDim = static_cast<unsigned>(Sections [`0`][`0`].size());
269
270	// Verify that all embeddings across all sections have the same
271	// dimension
272	[[maybe_unused]] auto allSameDim =
273	[ExpectedDim](const std::vector<Embedding> &Section) {
274	return std::all_of(first: Section.begin(), last: Section.end(),
275	pred: [ExpectedDim](const Embedding &Emb) {
276	return Emb.size() == ExpectedDim;
277	});
278	};
279	assert(std::all_of(Sections.begin(), Sections.end(), allSameDim) &&
280	"All embeddings must have the same dimension");
281
282	return ExpectedDim;
283	}()) {}
284
285	const Embedding &VocabStorage::const_iterator::operator() const* {
286	assert(SectionId < Storage->Sections.size() && "Invalid section ID");
287	assert(LocalIndex < Storage->Sections[SectionId].size() &&
288	"Local index out of range");
289	return Storage->Sections [SectionId][LocalIndex];
290	}
291
292	VocabStorage::const_iterator &VocabStorage::const_iterator::operator++() {
293	++LocalIndex;
294	// Check if we need to move to the next section
295	if (SectionId < Storage->getNumSections() &&
296	LocalIndex >= Storage->Sections [SectionId].size()) {
297	assert(LocalIndex == Storage->Sections[SectionId].size() &&
298	"Local index should be at the end of the current section");
299	LocalIndex = `0`;
300	++SectionId;
301	}
302	return *this;
303	}
304
305	bool VocabStorage::const_iterator::operator==(
306	const const_iterator &Other) const {
307	return Storage == Other.Storage && SectionId == Other.SectionId &&
308	LocalIndex == Other.LocalIndex;
309	}
310
311	bool VocabStorage::const_iterator::operator!=(
312	const const_iterator &Other) const {
313	return !(*this == Other);
314	}
315
316	Error VocabStorage::parseVocabSection(StringRef Key,
317	const json::Value &ParsedVocabValue,
318	VocabMap &TargetVocab, unsigned &Dim) {
319	json::Path::Root Path("");
320	const json::Object *RootObj = ParsedVocabValue.getAsObject();
321	if (!RootObj)
322	return createStringError(EC: errc::invalid_argument,
323	S: "JSON root is not an object");
324
325	const json::Value *SectionValue = RootObj->get(K: Key);
326	if (!SectionValue)
327	return createStringError(EC: errc::invalid_argument,
328	S: "Missing '" + std::string (Key) +
329	"' section in vocabulary file");
330	if (!json::fromJSON(E: *SectionValue, Out&: TargetVocab, P: Path))
331	return createStringError(EC: errc::illegal_byte_sequence,
332	S: "Unable to parse '" + std::string (Key) +
333	"' section from vocabulary");
334
335	Dim = TargetVocab.begin()->second.size();
336	if (Dim == `0`)
337	return createStringError(EC: errc::illegal_byte_sequence,
338	S: "Dimension of '" + std::string (Key) +
339	"' section of the vocabulary is zero");
340
341	if (!std::all_of(first: TargetVocab.begin(), last: TargetVocab.end(),
342	pred: [Dim](const std::pair<StringRef, Embedding> &Entry) {
343	return Entry.second.size() == Dim;
344	}))
345	return createStringError(
346	EC: errc::illegal_byte_sequence,
347	S: "All vectors in the '" + std::string (Key) +
348	"' section of the vocabulary are not of the same dimension");
349
350	return Error::success();
351	}
352
353	// ==----------------------------------------------------------------------===//
354	// Vocabulary
355	//===----------------------------------------------------------------------===//
356
357	StringRef Vocabulary::getVocabKeyForOpcode(unsigned Opcode) {
358	assert(Opcode >= `1` && Opcode <= MaxOpcodes && "Invalid opcode");
359	#define HANDLE_INST(NUM, OPCODE, CLASS) \
360	if (Opcode == NUM) { \
361	return #OPCODE; \
362	}
363	#include "llvm/IR/Instruction.def"
364	#undef HANDLE_INST
365	return "UnknownOpcode";
366	}
367
368	// Helper function to classify an operand into OperandKind
369	Vocabulary::OperandKind Vocabulary::getOperandKind(const Value *Op) {
370	if (isa<Function>(Val: Op))
371	return OperandKind::FunctionID;
372	if (isa<PointerType>(Val: Op->getType()))
373	return OperandKind::PointerID;
374	if (isa<Constant>(Val: Op))
375	return OperandKind::ConstantID;
376	return OperandKind::VariableID;
377	}
378
379	unsigned Vocabulary::getPredicateLocalIndex(CmpInst::Predicate P) {
380	if (P >= CmpInst::FIRST_FCMP_PREDICATE && P <= CmpInst::LAST_FCMP_PREDICATE)
381	return P - CmpInst::FIRST_FCMP_PREDICATE;
382	else
383	return P - CmpInst::FIRST_ICMP_PREDICATE +
384	(CmpInst::LAST_FCMP_PREDICATE - CmpInst::FIRST_FCMP_PREDICATE + `1`);
385	}
386
387	CmpInst::Predicate Vocabulary::getPredicateFromLocalIndex(unsigned LocalIndex) {
388	unsigned fcmpRange =
389	CmpInst::LAST_FCMP_PREDICATE - CmpInst::FIRST_FCMP_PREDICATE + `1`;
390	if (LocalIndex < fcmpRange)
391	return static_cast<CmpInst::Predicate>(CmpInst::FIRST_FCMP_PREDICATE +
392	LocalIndex);
393	else
394	return static_cast<CmpInst::Predicate>(CmpInst::FIRST_ICMP_PREDICATE +
395	LocalIndex - fcmpRange);
396	}
397
398	StringRef Vocabulary::getVocabKeyForPredicate(CmpInst::Predicate Pred) {
399	static SmallString<`16`> PredNameBuffer;
400	if (Pred < CmpInst::FIRST_ICMP_PREDICATE)
401	PredNameBuffer = "FCMP_";
402	else
403	PredNameBuffer = "ICMP_";
404	PredNameBuffer += CmpInst::getPredicateName(P: Pred);
405	return PredNameBuffer;
406	}
407
408	StringRef Vocabulary::getStringKey(unsigned Pos) {
409	assert(Pos < NumCanonicalEntries && "Position out of bounds in vocabulary");
410	// Opcode
411	if (Pos < MaxOpcodes)
412	return getVocabKeyForOpcode(Opcode: Pos + `1`);
413	// Type
414	if (Pos < OperandBaseOffset)
415	return getVocabKeyForCanonicalTypeID(
416	CType: static_cast<CanonicalTypeID>(Pos - MaxOpcodes));
417	// Operand
418	if (Pos < PredicateBaseOffset)
419	return getVocabKeyForOperandKind(
420	Kind: static_cast<OperandKind>(Pos - OperandBaseOffset));
421	// Predicates
422	return getVocabKeyForPredicate(Pred: getPredicate(Index: Pos - PredicateBaseOffset));
423	}
424
425	// For now, assume vocabulary is stable unless explicitly invalidated.
426	bool Vocabulary::invalidate(Module &M, const PreservedAnalyses &PA,
427	ModuleAnalysisManager::Invalidator &Inv) const {
428	auto PAC = PA.getChecker<IR2VecVocabAnalysis>();
429	return !(PAC.preservedWhenStateless());
430	}
431
432	VocabStorage Vocabulary::createDummyVocabForTest(unsigned Dim) {
433	float DummyVal = `0.1f`;
434
435	// Create sections for opcodes, types, operands, and predicates
436	// Order must match Vocabulary::Section enum
437	std::vector<std::vector<Embedding>> Sections;
438	Sections.reserve(n: `4`);
439
440	// Opcodes section
441	std::vector<Embedding> OpcodeSec;
442	OpcodeSec.reserve(n: MaxOpcodes);
443	for (unsigned I = `0`; I < MaxOpcodes; ++I) {
444	OpcodeSec.emplace_back(args&: Dim, args&: DummyVal);
445	DummyVal += `0.1f`;
446	}
447	Sections.push_back(x: std::move(OpcodeSec));
448
449	// Types section
450	std::vector<Embedding> TypeSec;
451	TypeSec.reserve(n: MaxCanonicalTypeIDs);
452	for (unsigned I = `0`; I < MaxCanonicalTypeIDs; ++I) {
453	TypeSec.emplace_back(args&: Dim, args&: DummyVal);
454	DummyVal += `0.1f`;
455	}
456	Sections.push_back(x: std::move(TypeSec));
457
458	// Operands section
459	std::vector<Embedding> OperandSec;
460	OperandSec.reserve(n: MaxOperandKinds);
461	for (unsigned I = `0`; I < MaxOperandKinds; ++I) {
462	OperandSec.emplace_back(args&: Dim, args&: DummyVal);
463	DummyVal += `0.1f`;
464	}
465	Sections.push_back(x: std::move(OperandSec));
466
467	// Predicates section
468	std::vector<Embedding> PredicateSec;
469	PredicateSec.reserve(n: MaxPredicateKinds);
470	for (unsigned I = `0`; I < MaxPredicateKinds; ++I) {
471	PredicateSec.emplace_back(args&: Dim, args&: DummyVal);
472	DummyVal += `0.1f`;
473	}
474	Sections.push_back(x: std::move(PredicateSec));
475
476	return VocabStorage (std::move(Sections));
477	}
478
479	namespace {
480	using VocabMap = std::map<std::string, Embedding>;
481
482	/// Read vocabulary JSON file and populate the section maps.
483	Error readVocabularyFromFile(StringRef VocabFilePath, VocabMap &OpcVocab,
484	VocabMap &TypeVocab, VocabMap &ArgVocab) {
485	auto BufOrError =
486	MemoryBuffer::getFileOrSTDIN(Filename: VocabFilePath, /IsText=/true);
487	if (!BufOrError)
488	return createFileError(F: VocabFilePath, EC: BufOrError.getError());
489
490	auto Content = BufOrError.get()->getBuffer();
491
492	Expected<json::Value> ParsedVocabValue = json::parse(JSON: Content);
493	if (!ParsedVocabValue)
494	return ParsedVocabValue.takeError();
495
496	unsigned OpcodeDim = `0`, TypeDim = `0`, ArgDim = `0`;
497	if (auto Err = VocabStorage::parseVocabSection(Key: "Opcodes", ParsedVocabValue: *ParsedVocabValue,
498	TargetVocab&: OpcVocab, Dim&: OpcodeDim))
499	return Err;
500
501	if (auto Err = VocabStorage::parseVocabSection(Key: "Types", ParsedVocabValue: *ParsedVocabValue,
502	TargetVocab&: TypeVocab, Dim&: TypeDim))
503	return Err;
504
505	if (auto Err = VocabStorage::parseVocabSection(Key: "Arguments", ParsedVocabValue: *ParsedVocabValue,
506	TargetVocab&: ArgVocab, Dim&: ArgDim))
507	return Err;
508
509	if (!(OpcodeDim == TypeDim && TypeDim == ArgDim))
510	return createStringError(EC: errc::illegal_byte_sequence,
511	S: "Vocabulary sections have different dimensions");
512
513	return Error::success();
514	}
515	} // anonymous namespace
516
517	/// Generate VocabStorage from vocabulary maps.
518	VocabStorage Vocabulary::buildVocabStorage(const VocabMap &OpcVocab,
519	const VocabMap &TypeVocab,
520	const VocabMap &ArgVocab) {
521
522	// Helper for handling missing entities in the vocabulary.
523	// Currently, we use a zero vector. In the future, we will throw an error to
524	// ensure that all* known entities are present in the vocabulary.*
525	auto handleMissingEntity = [](const std::string &Val) {
526	LLVM_DEBUG(errs() << Val
527	<< " is not in vocabulary, using zero vector; This "
528	"would result in an error in future.\n");
529	++VocabMissCounter;
530	};
531
532	unsigned Dim = OpcVocab.begin()->second.size();
533	assert(Dim > `0` && "Vocabulary dimension must be greater than zero");
534
535	// Handle Opcodes
536	std::vector<Embedding> NumericOpcodeEmbeddings(Vocabulary::MaxOpcodes,
537	Embedding (Dim));
538	for (unsigned Opcode : seq(Begin: `0u`, End: Vocabulary::MaxOpcodes)) {
539	StringRef VocabKey = Vocabulary::getVocabKeyForOpcode(Opcode: Opcode + `1`);
540	auto It = OpcVocab.find(x: VocabKey.str());
541	if (It != OpcVocab.end())
542	NumericOpcodeEmbeddings [Opcode] = It ->second;
543	else
544	handleMissingEntity (VocabKey.str());
545	}
546
547	// Handle Types - only canonical types are present in vocabulary
548	std::vector<Embedding> NumericTypeEmbeddings(Vocabulary::MaxCanonicalTypeIDs,
549	Embedding (Dim));
550	for (unsigned CTypeID : seq(Begin: `0u`, End: Vocabulary::MaxCanonicalTypeIDs)) {
551	StringRef VocabKey = Vocabulary::getVocabKeyForCanonicalTypeID(
552	CType: static_cast<Vocabulary::CanonicalTypeID>(CTypeID));
553	if (auto It = TypeVocab.find(x: VocabKey.str()); It != TypeVocab.end()) {
554	NumericTypeEmbeddings [CTypeID] = It ->second;
555	continue;
556	}
557	handleMissingEntity (VocabKey.str());
558	}
559
560	// Handle Arguments/Operands
561	std::vector<Embedding> NumericArgEmbeddings(Vocabulary::MaxOperandKinds,
562	Embedding (Dim));
563	for (unsigned OpKind : seq(Begin: `0u`, End: Vocabulary::MaxOperandKinds)) {
564	Vocabulary::OperandKind Kind = static_cast<Vocabulary::OperandKind>(OpKind);
565	StringRef VocabKey = Vocabulary::getVocabKeyForOperandKind(Kind);
566	auto It = ArgVocab.find(x: VocabKey.str());
567	if (It != ArgVocab.end()) {
568	NumericArgEmbeddings [OpKind] = It ->second;
569	continue;
570	}
571	handleMissingEntity (VocabKey.str());
572	}
573
574	// Handle Predicates: part of Operands section. We look up predicate keys
575	// in ArgVocab.
576	std::vector<Embedding> NumericPredEmbeddings(Vocabulary::MaxPredicateKinds,
577	Embedding (Dim, `0`));
578	for (unsigned PK : seq(Begin: `0u`, End: Vocabulary::MaxPredicateKinds)) {
579	StringRef VocabKey =
580	Vocabulary::getVocabKeyForPredicate(Pred: Vocabulary::getPredicate(Index: PK));
581	auto It = ArgVocab.find(x: VocabKey.str());
582	if (It != ArgVocab.end()) {
583	NumericPredEmbeddings [PK] = It ->second;
584	continue;
585	}
586	handleMissingEntity (VocabKey.str());
587	}
588
589	// Create section-based storage instead of flat vocabulary
590	// Order must match Vocabulary::Section enum
591	std::vector<std::vector<Embedding>> Sections(`4`);
592	Sections [static_cast<unsigned>(Section::Opcodes)] =
593	std::move(NumericOpcodeEmbeddings); // Section::Opcodes
594	Sections [static_cast<unsigned>(Section::CanonicalTypes)] =
595	std::move(NumericTypeEmbeddings); // Section::CanonicalTypes
596	Sections [static_cast<unsigned>(Section::Operands)] =
597	std::move(NumericArgEmbeddings); // Section::Operands
598	Sections [static_cast<unsigned>(Section::Predicates)] =
599	std::move(NumericPredEmbeddings); // Section::Predicates
600
601	// Create VocabStorage from organized sections
602	return VocabStorage (std::move(Sections));
603	}
604
605	// ==----------------------------------------------------------------------===//
606	// Vocabulary
607	//===----------------------------------------------------------------------===//
608
609	Expected<Vocabulary> Vocabulary::fromFile(StringRef VocabFilePath,
610	float OpcWeight, float TypeWeight,
611	float ArgWeight) {
612	VocabMap OpcVocab, TypeVocab, ArgVocab;
613	if (auto Err =
614	readVocabularyFromFile(VocabFilePath, OpcVocab, TypeVocab, ArgVocab))
615	return std::move(Err);
616
617	// Scale the vocabulary sections based on the provided weights
618	auto scaleVocabSection = [](VocabMap &Vocab, float Weight) {
619	for (auto &Entry : Vocab)
620	Entry.second *= Weight;
621	};
622	scaleVocabSection (OpcVocab, OpcWeight);
623	scaleVocabSection (TypeVocab, TypeWeight);
624	scaleVocabSection (ArgVocab, ArgWeight);
625
626	// Generate the numeric lookup vocabulary
627	return Vocabulary (buildVocabStorage(OpcVocab, TypeVocab, ArgVocab));
628	}
629
630	// ==----------------------------------------------------------------------===//
631	// IR2VecVocabAnalysis
632	//===----------------------------------------------------------------------===//
633
634	void IR2VecVocabAnalysis::emitError(Error Err, LLVMContext &Ctx) {
635	handleAllErrors(E: std::move(Err), Handlers: [&](const ErrorInfoBase &EI) {
636	Ctx.emitError(ErrorStr: "Error reading vocabulary: " + EI.message());
637	});
638	}
639
640	IR2VecVocabAnalysis::Result
641	IR2VecVocabAnalysis::run(Module &M, ModuleAnalysisManager &AM) {
642	auto Ctx = &M.getContext();
643	// If vocabulary is already populated by the constructor, use it.
644	if (Vocab.has_value())
645	return Vocabulary (std::move(Vocab.value()));
646
647	// Otherwise, try to read from the vocabulary file specified via CLI.
648	if (VocabFile.empty()) {
649	// FIXME: Use default vocabulary
650	Ctx->emitError(ErrorStr: "IR2Vec vocabulary file path not specified; You may need to "
651	"set it using --ir2vec-vocab-path");
652	return Vocabulary (); // Return invalid result
653	}
654
655	// Use the static factory method to load the vocabulary.
656	auto VocabOrErr =
657	Vocabulary::fromFile(VocabFilePath: VocabFile, OpcWeight, TypeWeight, ArgWeight);
658	if (!VocabOrErr) {
659	emitError(Err: VocabOrErr.takeError(), Ctx&: *Ctx);
660	return Vocabulary ();
661	}
662
663	return std::move(*VocabOrErr);
664	}
665
666	// ==----------------------------------------------------------------------===//
667	// Printer Passes
668	//===----------------------------------------------------------------------===//
669
670	PreservedAnalyses IR2VecPrinterPass::run(Module &M,
671	ModuleAnalysisManager &MAM) {
672	auto &Vocabulary = MAM.getResult<IR2VecVocabAnalysis>(IR&: M);
673	assert(Vocabulary.isValid() && "IR2Vec Vocabulary is invalid");
674
675	for (Function &F : M) {
676	auto Emb = Embedder::create(Mode: IR2VecEmbeddingKind, F, Vocab: Vocabulary);
677	if (!Emb) {
678	OS << "Error creating IR2Vec embeddings \n";
679	continue;
680	}
681
682	OS << "IR2Vec embeddings for function " << F.getName() << ":\n";
683	OS << "Function vector: ";
684	Emb ->getFunctionVector().print(OS);
685
686	OS << "Basic block vectors:\n";
687	for (const BasicBlock &BB : F) {
688	OS << "Basic block: " << BB.getName() << ":\n";
689	Emb ->getBBVector(BB).print(OS);
690	}
691
692	OS << "Instruction vectors:\n";
693	for (const BasicBlock &BB : F) {
694	for (const Instruction &I : BB) {
695	OS << "Instruction: ";
696	I.print(O&: OS);
697	Emb ->getInstVector(I).print(OS);
698	}
699	}
700	}
701	return PreservedAnalyses::all();
702	}
703
704	PreservedAnalyses IR2VecVocabPrinterPass::run(Module &M,
705	ModuleAnalysisManager &MAM) {
706	auto &IR2VecVocabulary = MAM.getResult<IR2VecVocabAnalysis>(IR&: M);
707	assert(IR2VecVocabulary.isValid() && "IR2Vec Vocabulary is invalid");
708
709	// Print each entry
710	unsigned Pos = `0`;
711	for (const auto &Entry : IR2VecVocabulary) {
712	OS << "Key: " << IR2VecVocabulary.getStringKey(Pos: Pos++) << ": ";
713	Entry.print(OS);
714	}
715	return PreservedAnalyses::all();
716	}
717

Browse the source code of llvm_projects/llvm/lib/Analysis/IR2Vec.cpp